In [86]:
###################################################
##          Linear Regression                    ##
#        with dummy variables                     #
###################################################
import warnings
warnings.filterwarnings('ignore')
#We will be working with a music dataset for both classification and regression problems. 
#First, we will build a regression model using all features in the dataset to predict song popularity. 
#There is one categorical feature, genre, with ten possible values.

import pandas as pd
import numpy as np

# Read the music data and discard the "Unnamed: 0" column in one chained step.
music_df = pd.read_csv("music_clean.csv").drop(columns="Unnamed: 0")

# Keeping categorical features in the model-building process
# can enhance performance, since they may carry information that improves prediction accuracy.

# Preview the first five rows, then report the frame's dimensions.
display(music_df.head())
print(music_df.shape)

# Genre labels that are cycled over the rows: row i receives genre1[i % len(genre1)].
# NOTE(review): " Anime" carries a leading space. It is kept as-is here because changing it
# would change which dummy column drop_first=True removes later — confirm whether it is intended.
genre1 = ["Alternative", " Anime", "Blues", "Classical", "Country", "Electronic", "Hip-Hop", "Jazz", "Rap", "Rock"]

# Replace the whole "genre" column with a single assignment. The previous element-by-element
# chained indexing (music_df["genre"][i] = ...) raises SettingWithCopyWarning and does not
# modify music_df at all when pandas Copy-on-Write is active; it also needed a manual counter.
music_df["genre"] = [genre1[row % len(genre1)] for row in range(len(music_df))]
display(music_df.head(5))
    

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,1
1,63.0,0.00384,0.635,190448.0,0.908,0.0834,0.239,-4.795,0.0563,110.012,0.637,1
2,59.0,7.5e-05,0.352,456320.0,0.956,0.0203,0.125,-3.634,0.149,122.897,0.228,1
3,54.0,0.945,0.488,352280.0,0.326,0.0157,0.119,-12.02,0.0328,106.063,0.323,1
4,55.0,0.245,0.667,273693.0,0.647,0.000297,0.0633,-7.787,0.0487,143.995,0.3,1


(1000, 12)


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,Alternative
1,63.0,0.00384,0.635,190448.0,0.908,0.0834,0.239,-4.795,0.0563,110.012,0.637,Anime
2,59.0,7.5e-05,0.352,456320.0,0.956,0.0203,0.125,-3.634,0.149,122.897,0.228,Blues
3,54.0,0.945,0.488,352280.0,0.326,0.0157,0.119,-12.02,0.0328,106.063,0.323,Classical
4,55.0,0.245,0.667,273693.0,0.647,0.000297,0.0633,-7.787,0.0487,143.995,0.3,Country


In [89]:
############################################
##    Creating dummy variables            
############################################
#Create a new DataFrame containing the original columns of music_df 
#plus dummy variables from the "genre" column.

# Encode the categorical column(s) of music_df as dummy variables,
# dropping the first level of each so the dummies are not redundant.
music_dummies = pd.get_dummies(data=music_df, drop_first=True)

# Report the dimensions of the encoded frame, then render it.
print(f"Shape of music_dummies: {music_dummies.shape}")
display(music_dummies)

#As there were ten values in the "genre" column, 
#pd.get_dummies() with drop_first=True added nine new dummy columns. 
#Because the original "genre" column is removed in the process, the DataFrame has a net gain of eight columns (12 -> 20).


Shape of music_dummies: (1000, 20)


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre_Alternative,genre_Blues,genre_Classical,genre_Country,genre_Electronic,genre_Hip-Hop,genre_Jazz,genre_Rap,genre_Rock
0,60.0,0.896000,0.726,214547.0,0.1770,0.000002,0.1160,-14.824,0.0353,92.934,0.6180,1,0,0,0,0,0,0,0,0
1,63.0,0.003840,0.635,190448.0,0.9080,0.083400,0.2390,-4.795,0.0563,110.012,0.6370,0,0,0,0,0,0,0,0,0
2,59.0,0.000075,0.352,456320.0,0.9560,0.020300,0.1250,-3.634,0.1490,122.897,0.2280,0,1,0,0,0,0,0,0,0
3,54.0,0.945000,0.488,352280.0,0.3260,0.015700,0.1190,-12.020,0.0328,106.063,0.3230,0,0,1,0,0,0,0,0,0
4,55.0,0.245000,0.667,273693.0,0.6470,0.000297,0.0633,-7.787,0.0487,143.995,0.3000,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,57.0,0.972000,0.193,208040.0,0.0329,0.929000,0.0978,-28.228,0.0460,82.165,0.0366,0,0,0,0,1,0,0,0,0
996,56.0,0.005790,0.939,144453.0,0.3730,0.000000,0.2740,-7.779,0.2270,119.953,0.0602,0,0,0,0,0,1,0,0,0
997,54.0,0.016100,0.739,238339.0,0.5390,0.000000,0.2350,-9.735,0.3370,85.082,0.8350,0,0,0,0,0,0,1,0,0
998,62.0,0.326000,0.515,286707.0,0.5050,0.000000,0.1020,-5.606,0.0294,150.063,0.5380,0,0,0,0,0,0,0,1,0


In [90]:
############################################
# Regression with categorical features
############################################
#music_dummies contains binary features for each song's genre. 
#Now we'll build a ridge regression model to predict song popularity.

from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, KFold

# Separate the target vector from the feature matrix (both as NumPy arrays).
y = music_dummies["popularity"].values
X = music_dummies.drop(columns="popularity").values

# Ridge regressor, scored with shuffled 5-fold cross-validation under a fixed seed.
ridge = Ridge(alpha=0.2)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The "neg_mean_squared_error" scorer returns the MSE of each fold with its sign flipped.
scores = cross_val_score(ridge, X, y, cv=kf, scoring="neg_mean_squared_error")

# Flip the sign back and take the square root to obtain the per-fold RMSE.
rmse = np.sqrt(-scores)
print(f"Average RMSE: {np.mean(rmse)}")
print(f"Standard Deviation of the target array: {np.std(y)}")

#The model is evaluated by calculating the average RMSE: 
#the negated MSE score of each fold is converted to a positive value and its square root is taken. 
#This metric shows the typical size of the error in our model's predictions, 
#so it can be compared against the standard deviation of the target value—"popularity".

#An average RMSE of approximately 12.57 is lower than the standard deviation of the target variable 
#(song popularity, about 14.02), suggesting the model predicts somewhat better than always guessing the mean popularity.



Average RMSE: 12.566091796793692
Standard Deviation of the target array: 14.02156909907019
