In [20]:
#######################################################
##  Centering and scaling for regression              #
##You will use a pipeline to preprocess the features  #
#and build a lasso regression model                   #
#to predict a song's loudness.                        #
#######################################################
#import warnings
#warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Read the cleaned music data and discard the "Unnamed: 0" column
# (presumably an index that was saved into the CSV -- confirm against the file).
music_df = pd.read_csv("music_clean.csv").drop(columns=["Unnamed: 0"])

# Preview the first five rows, then report the frame's (rows, columns).
display(music_df.head())
print(music_df.shape)


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,1
1,63.0,0.00384,0.635,190448.0,0.908,0.0834,0.239,-4.795,0.0563,110.012,0.637,1
2,59.0,7.5e-05,0.352,456320.0,0.956,0.0203,0.125,-3.634,0.149,122.897,0.228,1
3,54.0,0.945,0.488,352280.0,0.326,0.0157,0.119,-12.02,0.0328,106.063,0.323,1
4,55.0,0.245,0.667,273693.0,0.647,0.000297,0.0633,-7.787,0.0487,143.995,0.3,1


(1000, 12)


In [21]:

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Target is "loudness"; every other column of music_df is used as a feature.
y = music_df["loudness"].values
X = music_df.drop(columns="loudness").values

# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Pipeline steps: standardize the features, then fit a lasso regression.
steps = [
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=0.5)),
]

# Build the pipeline and fit it on the training data (fit returns the pipeline itself).
pipeline = Pipeline(steps).fit(X_train, y_train)

# R-squared of the scaled pipeline on the held-out data.
scaled_r2 = pipeline.score(X_test, y_test)
print(scaled_r2)

# ---------------------------------------------------------------
# Baseline without scaling
# ---------------------------------------------------------------
# Same arguments and random_state as above, so the rows land in the
# same train/test partitions as for the scaled pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Same lasso settings, but fitted directly on the raw (unscaled) features.
lasso_unscaled = Lasso(alpha=0.5)
lasso_unscaled.fit(X_train, y_train)

# R-squared of the unscaled model on the held-out data.
unscaled_r2 = lasso_unscaled.score(X_test, y_test)
print(unscaled_r2)


#With scaling, the model reaches an R-squared of only about 0.738,
#but without scaling the exact same model reaches a score of only about 0.50,
#which shows just how powerful scaling can be!


0.7382220445226466
0.5009047694916304


In [26]:
#######################################################
##  Centering and scaling for classification          
##you will bring together scaling and model building 
#into a pipeline for cross-validation.
#######################################################
# Build a pipeline to scale features in the dataset 
# and perform grid search cross-validation using a logistic regression model 
# with different values for the hyperparameter C. 
# The target variable is "genre", which contains binary values for rock as 1 and any other genre as 0.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Target is "genre"; every other column of music_df is used as a feature.
y = music_df["genre"].values
X = music_df.drop(columns="genre").values

# Pipeline steps: standardize the features, then fit a logistic regression.
steps = [
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression()),
]
pipeline = Pipeline(steps)

# Search space: 20 evenly spaced values of C between 0.001 and 1.0.
# The "logreg__" prefix routes the parameter to the "logreg" pipeline step.
parameters = {"logreg__C": np.linspace(0.001, 1.0, 20)}

# Hold out 20% of the rows; only the training portion is used by the search below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Grid search over the whole pipeline, using GridSearchCV's default
# cross-validation and scoring settings.
cv = GridSearchCV(pipeline, param_grid=parameters)

# Run the search on the training data.
cv.fit(X_train, y_train)

# Report the best cross-validation score and the parameter value that achieved it.
print(cv.best_score_, "\n", cv.best_params_)

# Using a pipeline shows that a logistic regression model with "C" set to approximately 0.16
# produces the best mean cross-validated accuracy, 0.8625


0.8625 
 {'logreg__C': 0.15873684210526315}
