# Music genres

## Introduction

"genre" has been converted to a binary feature where 1 indicates a rock song, and 0 represents other genres.

![music](https://thumbs.dreamstime.com/b/color-doodle-music-vector-hand-drawn-cartoon-icons-genres-theme-line-colored-icons-illustration-textil-paper-polygraphy-game-73528359.jpg)

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score, KFold, train_test_split

from sklearn.linear_model import Ridge

# Import modules
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.linear_model import Lasso

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV


In [12]:
music_df = pd.read_csv('music_clean.csv')
print(music_df.head(5))

   Unnamed: 0  popularity  acousticness  danceability  duration_ms  energy  \
0       36506        60.0      0.896000         0.726     214547.0   0.177   
1       37591        63.0      0.003840         0.635     190448.0   0.908   
2       37658        59.0      0.000075         0.352     456320.0   0.956   
3       36060        54.0      0.945000         0.488     352280.0   0.326   
4       35710        55.0      0.245000         0.667     273693.0   0.647   

   instrumentalness  liveness  loudness  speechiness    tempo  valence  genre  
0          0.000002    0.1160   -14.824       0.0353   92.934    0.618      1  
1          0.083400    0.2390    -4.795       0.0563  110.012    0.637      1  
2          0.020300    0.1250    -3.634       0.1490  122.897    0.228      1  
3          0.015700    0.1190   -12.020       0.0328  106.063    0.323      1  
4          0.000297    0.0633    -7.787       0.0487  143.995    0.300      1  


In [13]:
X = music_df.drop('genre', axis=1)
y = music_df[['genre']]

In [14]:

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


## Pipeline

In [15]:
# Instantiate an imputer
imputer = SimpleImputer()

# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=3)

# Build steps for the pipeline
steps = [("imputer", imputer),
         ("knn", knn)]

In [16]:
steps = [("imputer", imputer),
        ("knn", knn)]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))

[[89 11]
 [ 3 97]]


  return self._fit(X, y)


In [17]:
# Create X and y
X = music_df.drop('popularity', axis=1).values
y = music_df['popularity'].values

# Instantiate a ridge model
ridge = Ridge(alpha=0.2)

# Instantiate Kfold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(ridge, X, y, cv=kf, scoring="neg_mean_squared_error")

# Calculate RMSE
rmse = np.sqrt(-scores)
print("Average RMSE: {}".format(np.mean(rmse)))
print("Standard Deviation of the target array: {}".format(np.std(y)))

Average RMSE: 10.033098690539362
Standard Deviation of the target array: 14.02156909907019


An average RMSE of approximately 8.24 is lower than the standard deviation of the target variable (song popularity), suggesting the model is reasonably accurate.

In [18]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

X = music_df.drop('loudness', axis=1).values
y = music_df['loudness'].values

# Create pipeline steps
steps = [("scaler", StandardScaler()),
         ("lasso", Lasso(alpha=0.5))]

# Instantiate the pipeline
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

# Calculate and print R-squared
print(pipeline.score(X_test, y_test))

0.0


## Centering and scaling

In [23]:
# Build the steps
steps = [("scaler", StandardScaler()),
         ("logreg", LogisticRegression())]
pipeline = Pipeline(steps)

# Create the parameter space
parameters = {"logreg__C": np.linspace(0.001, 1.0, 20)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=21)

# Instantiate the grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

print(X_train)

# Fit to the training data
cv.fit(X_train, y_train)
print(cv.best_score_, "\n", cv.best_params_)

[[3.95290e+04 7.10000e+01 3.11000e-02 ... 1.03987e+02 7.28000e-01
  1.00000e+00]
 [3.89730e+04 6.40000e+01 6.64000e-01 ... 1.47694e+02 1.16000e-01
  1.00000e+00]
 [4.27270e+04 0.00000e+00 9.56000e-01 ... 1.36270e+02 5.50000e-02
  0.00000e+00]
 ...
 [4.99400e+04 4.80000e+01 1.98000e-01 ... 1.29538e+02 2.00000e-01
  0.00000e+00]
 [3.84300e+04 6.50000e+01 2.87000e-05 ... 1.11132e+02 4.38000e-01
  1.00000e+00]
 [1.57000e+03 5.90000e+01 4.08000e-01 ... 1.15034e+02 4.32000e-01
  0.00000e+00]]


ValueError: ignored

## Visualizing regression model performance

In [24]:
models = {"Linear Regression": LinearRegression(), "Ridge": Ridge(alpha=0.1), "Lasso": Lasso(alpha=0.1)}
results = []

# Loop through the models' values
for model in models.values():
  kf = KFold(n_splits=6, random_state=42, shuffle=True)

  # Perform cross-validation
  cv_scores = cross_val_score(model, X_train, y_train, cv=kf)

  # Append the results
  results.append(cv_scores)

# Create a box plot of the results
plt.boxplot(results, labels=models.keys())
plt.show()

NameError: ignored

## Predict

In [25]:
# Import mean_squared_error
from sklearn.metrics import mean_squared_error

for name, model in models.items():

  # Fit the model to the training data
  model.fit(X_train_scaled, y_train)

  # Make predictions on the test set
  y_pred = model.predict(X_test_scaled)

  # Calculate the test_rmse
  test_rmse = mean_squared_error(y_test, y_pred, squared=False)
  print("{} Test Set RMSE: {}".format(name, test_rmse))

NameError: ignored

## Visualizing classification model performance

In [26]:
# Create models dictionary
models = {"Logistic Regression": LogisticRegression(), "KNN": KNeighborsClassifier(), "Decision Tree Classifier": DecisionTreeClassifier()}
results = []

# Loop through the models' values
for model in models.values():

  # Instantiate a KFold object
  kf = KFold(n_splits=6, random_state=12, shuffle=True)

  # Perform cross-validation
  cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf)
  results.append(cv_results)
plt.boxplot(results, labels=models.keys())
plt.show()

NameError: ignored

In [27]:
# Create steps
steps = [("imp_mean", SimpleImputer()),
         ("scaler", StandardScaler()),
         ("logreg", LogisticRegression())]

# Set up pipeline
pipeline = Pipeline(steps)
params = {"logreg__solver": ["newton-cg", "saga", "lbfgs"],
         "logreg__C": np.linspace(0.001, 1.0, 10)}

# Create the GridSearchCV object
tuning = GridSearchCV(pipeline, param_grid=params)
tuning.fit(X_train, y_train)
y_pred = tuning.predict(X_test)

# Compute and print performance
print("Tuned Logistic Regression Parameters: {}, Accuracy: {}".format(tuning.best_params_, tuning.score(X_test, y_test) ))

ValueError: ignored