<a href="https://colab.research.google.com/github/rghosh1353/spotify_genre_proj/blob/cchen03/Check_in_3_LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
%pip install scikit-lego



In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error, recall_score, roc_auc_score, roc_curve, r2_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklego.linear_model import LADRegression

In [7]:
df = pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")

In [8]:
# Data Cleaning Function
def clean_data(df):
  clean_df = df.copy()
  clean_df = clean_df.drop('Unnamed: 0', axis=1, errors='ignore')
  clean_df = clean_df.dropna()
  clean_df['explicit'] = clean_df['explicit'].astype(int)
  clean_df = clean_df[['track_id', 'artists', 'album_name', 'track_name', 'energy','explicit']]
  return clean_df

# Apply the function to the DataFrame
spotify_clean = clean_data(df)


In [9]:
# Split data into train and test
train_spotify, val_spotify = train_test_split(spotify_clean, test_size=0.2, random_state=42)
train_spotify

Unnamed: 0,track_id,artists,album_name,track_name,energy,explicit
96253,2dBjh7rBHfTtKVIDY89g5G,Seu Jorge,"Musicas para Churrasco, Vol.1 (Ao Vivo) (Delux...",Carolina (Ao Vivo),0.880,0
70417,3d09lKFNMjL28k0B0TcQhW,Chyi Chin,"""1"" (壹)",大約在冬季,0.361,0
66688,1IGSLXykccmp1TSqhYnden,Babyboomboom,English and French,"Heads, Shoulders, Knees and Toes (Tête, Epaule...",0.225,0
51391,67KTXqxMEyBMq3LApqCgNV,Sidhu Moose Wala;DIVINE,Moosetape,Moosedrilla (feat. DIVINE),0.829,0
95123,2Qx9yW1HKfX04jXSAbteiK,Rumbavana,Pa Que Lo Goces Con Ganas,El Capitolio,0.702,0
...,...,...,...,...,...,...
76821,0xmnMoo0YxOxd14xtquPsb,Phil Coulter,Country Serenity,You Raise Me Up,0.190,0
110269,13GV28kgnu0P8IrYfrgwIc,Sajanka,Time of India,Lord Shiva,0.924,0
103695,3cHu1UqgGTMPLiGrWDQHVC,Frankie Valli & The Four Seasons,Christmas Music - Holiday Hits,I Saw Mommy Kissing Santa Claus,0.560,0
860,2VVWWwQ3FiWnmbukTb6Kd3,The Mayries,I Will Wait,I Will Wait,0.107,0


# Stage 5: Logistic Regression

We're using


*   Predictor X: Energy
*   Response y: Explicit



In [10]:
# Compute a Logistic Regression fit for the training dataset.
lr_train = LogisticRegression(solver='liblinear', class_weight="balanced")
lr_train.fit(X=train_spotify[['energy']], y=train_spotify['explicit'].values)

# View the model intercept and coefficients
lr_train.intercept_, lr_train.coef_


(array([-1.190757]), array([[1.75317216]]))

In [11]:
val_spotify.sample(10)

Unnamed: 0,track_id,artists,album_name,track_name,energy,explicit
15442,4DTfDKKSDkUeTWrTdmbSWe,Sebastian Kamae;Mindr,Eye Of The Storm,Eye Of The Storm,0.191,0
68222,37o1JVuF7NnYWKYzLCe32r,Jhayco;Bryant Myers,Halloween 2022 Perreo Vol. 5,Deseos,0.726,0
30953,6GSkLS2y4ZnnZsuRAj975K,Marshmello;Roddy Ricch,Project Dreams,Project Dreams,0.828,1
19373,6O62mcdQBTU4AwVwLtGAHF,Justin Moore,Sad Country Songs,On The Rocks,0.586,0
59738,7vE0JifxEUBQ91qaUDC6es,Sabke Morde,PeeranSal,Peere Borna,0.905,0
32385,0BOivei4EFBvk7CKwEwaZf,Jan Blomqvist,Disconnected,The Space In Between,0.569,0
29211,0h0nV0TANrNwoJxLnCILoj,NURKO;Dia Frampton,Faith,Faith,0.608,0
26443,2G6jCIHp9Y3s4Q3b0XVk1Q,Randy Newman,La Princesa y el sapo (Banda Sonora Original e...,Cuento de hadas/Camino a casa,0.0927,0
82111,5EugXICXL06waHgP4iTI58,The Lemonheads,It's A Shame About Ray (Expanded Edition),Mrs. Robinson - Remastered,0.936,0
7020,5llrpUzc5oZ4U2RLruiDeI,Yonder Mountain String Band,"Mountain Tracks, Vol. 3",Holding,0.702,0


In [12]:
sample_index = [18726, 28097, 29054, 32086, 32670, 32366,
                33555, 42819, 47008, 51460, 59311, 60383,
                60877, 61000, 61024, 68154, 70999, 71790,
                72713, 89458, 90417, 93829, 104531, 113186,]  # Sample indices with both 0 and 1 for explicit
spotify_val_sample = val_spotify.loc[sample_index,:]  # Subset the validation data using the sample indices

# Proceed with the prediction
pred_spotify_sample = pd.DataFrame(dict(
    explicit = spotify_val_sample['explicit'],
    lr_predict = lr_train.predict_proba(spotify_val_sample[['energy']])[:, 1],  # Probabilities for 'explicit' class (1)
    lr_predict_binary = lr_train.predict(spotify_val_sample[['energy']])  # Binary predictions (0 or 1)
))

# Display the observed vs predicted table
pred_spotify_sample


Unnamed: 0,explicit,lr_predict,lr_predict_binary
18726,1,0.584133,1
28097,0,0.595162,1
29054,1,0.630909,1
32086,0,0.48589,0
32670,0,0.489394,0
32366,0,0.628047,1
33555,1,0.471891,0
42819,0,0.635794,1
47008,1,0.617338,1
51460,0,0.440154,0


In [14]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(spotify_val_sample['explicit'], pred_spotify_sample['lr_predict_binary'])
percentage = accuracy * 100

print(f"Percentage of correct predictions: {percentage:.2f}%")

Percentage of correct predictions: 62.50%


In [15]:
# Create Confusion Matrix
conf_lr = confusion_matrix(y_true=pred_spotify_sample['explicit'], y_pred=pred_spotify_sample['lr_predict_binary'])
conf_lr

array([[7, 7],
       [2, 8]])

In [16]:
# Prediction Accuracy Assessment
# Both of the lines below should print the same thing
(conf_lr[0,0] + conf_lr[1,1]) / conf_lr.sum()
accuracy_score(y_true=pred_spotify_sample['explicit'], y_pred=pred_spotify_sample['lr_predict_binary'])

0.625

In [17]:
# Sensitivity (TPR)
# Both of the lines below should print the same thing
conf_lr[1,1] / conf_lr[1,:].sum()
recall_score(y_true=pred_spotify_sample['explicit'], y_pred=pred_spotify_sample['lr_predict_binary'])

0.8

In [18]:
# Specificity (TNR)
# Both of the lines below should print the same thing
conf_lr[0,0] / conf_lr[0,:].sum()
recall_score(y_true=pred_spotify_sample['explicit'], y_pred=pred_spotify_sample['lr_predict_binary'], pos_label=0)

0.5

In [19]:
# Predicted Probability Densities
# Using a histogram because there are so few samples
px.histogram(pred_spotify_sample, x='lr_predict', color='explicit', nbins=24, opacity=0.5, barmode='overlay',)

In [20]:
# ROC Curve
lr_fpr_sample, lr_tpr_sample, lr_thresholds_sample = roc_curve(y_true=pred_spotify_sample['explicit'], y_score=pred_spotify_sample['lr_predict'])
lr_thresholds_sample
roc_lr_sample = pd.DataFrame({
    'False Positive Rate': lr_fpr_sample,
    'True Positive Rate': lr_tpr_sample,
    'Model': 'Logistic Regression'
}, index=lr_thresholds_sample)
roc_sample_df = pd.concat([roc_lr_sample])
px.line(roc_sample_df, x='False Positive Rate', y='True Positive Rate', color='Model', width=700, height=500)

In [21]:
# Area Under the Curve (under the ROC curve) Score
lr_auc_sample = roc_auc_score(y_true=pred_spotify_sample['explicit'], y_score=pred_spotify_sample['lr_predict'])
print("Logistic Regression AUC: ", lr_auc_sample.round(3))

Logistic Regression AUC:  0.586


In [22]:
# Cross Validation for Model Performance
X = val_spotify[['energy']]
y = val_spotify['explicit']
cross_val_score(lr_train, X, y, cv=5, scoring='roc_auc')

skfolds = StratifiedKFold(n_splits=5)
i = 1
for train_index, test_index in skfolds.split(X, y):
    clone_lr = clone(lr_train)
    X_train_folds = X.iloc[train_index]
    y_train_folds = y.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    print(test_index)
    clone_lr.fit(X_train_folds, y_train_folds)
    y_pred = clone_lr.predict(X_test_fold)

    auc_sample = roc_auc_score(y_true=y.iloc[test_index], y_score=y_pred)
    print("Fold: ", i, "\nAUC: ", auc_sample, '\nAccuracy: ', accuracy_score(y_true=y.iloc[test_index], y_pred=y_pred))
    i += 1

[   0    1    2 ... 4619 4632 4641]
Fold:  1 
AUC:  0.5508551374827322 
Accuracy:  0.5063596491228071
[4550 4551 4552 ... 9273 9287 9298]
Fold:  2 
AUC:  0.5732787916690595 
Accuracy:  0.512280701754386
[ 9109  9110  9111 ... 13677 13678 13679]
Fold:  3 
AUC:  0.5305878648162721 
Accuracy:  0.5021929824561403
[13680 13681 13682 ... 18237 18248 18272]
Fold:  4 
AUC:  0.5462842649780796 
Accuracy:  0.5133771929824561
[18238 18239 18240 ... 22797 22798 22799]
Fold:  5 
AUC:  0.5354066985645932 
Accuracy:  0.512280701754386
