In [1]:
pip install scikit-lego

Collecting scikit-lego
  Downloading scikit_lego-0.9.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting narwhals>=1.2.0 (from scikit-lego)
  Downloading narwhals-1.12.1-py3-none-any.whl.metadata (7.2 kB)
Downloading scikit_lego-0.9.2-py2.py3-none-any.whl (217 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m217.7/217.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading narwhals-1.12.1-py3-none-any.whl (195 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.1/195.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: narwhals, scikit-lego
Successfully installed narwhals-1.12.1 scikit-lego-0.9.2


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.base import clone
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error, recall_score, roc_auc_score, roc_curve, r2_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## Applying Cleaning Function

1. We're choosing 'explicit' as our binary categorical response variable.

In [5]:
# Data Cleaning Function
def clean_data(df):
  clean_df = df.copy()
  clean_df = clean_df.drop('Unnamed: 0', axis=1, errors='ignore')
  clean_df = clean_df.dropna()

  #Encode explicit column (binary encoding)
  clean_df['explicit'] = clean_df['explicit'].astype(int)

  #Remove track_id, artists,album_name, track_name
  clean_df = clean_df.drop(['track_id', 'artists', 'album_name', 'track_name','track_genre'], axis=1)

  return clean_df

# Apply the function to the DataFrame
spotify_clean = clean_data(df)

In [6]:
spotify_clean.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,73,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4
1,55,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4
2,57,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4
3,71,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3
4,82,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4


In [7]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(spotify_clean.drop('explicit', axis=1), spotify_clean['explicit'], test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((91199, 14), (22800, 14), (91199,), (22800,))

In [8]:
y_train.head()

Unnamed: 0,explicit
96253,0
70417,0
66688,0
51391,0
95123,0


In [9]:
#Class imbalance in y_train
y_train.value_counts()

Unnamed: 0_level_0,count
explicit,Unnamed: 1_level_1
0,83348
1,7851


In [10]:
7851/(7851+83348)*100 #Really small percentage of explicit (True)

8.60864702463843

## Stage 5: KNN or Random Forest Algorithm
1. For the data set you have chosen and the binary categorical response variable that you had chosen for Check-In 3 for a logistic regression or another binary variable in your data set, apply the KNN algorithm or Random Forest Algorithm for classification.(The variable chosen does not have to be directly related to the final goal of your project.)

In [12]:
# Train and Fit Model for KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

2. Calculate the confusion matrix, prediction accuracy, prediction error, true positive rate, and true negative rate, and F1 score on your training data set

In [13]:
# Confusion Matrix
y_pred_train = knn.predict(X_train)
cm_knn = confusion_matrix(y_train, y_pred_train)
cm_knn

array([[82699,   649],
       [ 5633,  2218]])

In [14]:
# Prediction Accuracy
accuracy = accuracy_score(y_train, y_pred_train)
print("Prediction Accuracy:", accuracy)

Prediction Accuracy: 0.931117665763879


In [15]:
# Prediction Error
error = (cm_knn[0,1] + cm_knn[1,0])/cm_knn.sum()
print("Prediction Error:", error)

Prediction Error: 0.06888233423612102


In [16]:
# True Positive Rate
tpr = recall_score(y_train, y_pred_train, pos_label=1)
print("True Positive Rate:", tpr)

True Positive Rate: 0.28251178193860654


In [None]:
# True Negative Rate


In [None]:
# F1 Score


3. Calculate and plot the ROC curve and AUC on your validation data set. Use 5-fold cross-validation on the validation set to calculate the AUC and accuracy of each fold.

In [None]:
# Calculating the ROC curve


In [None]:
# Plotting the ROC curve and AUC


In [None]:
# Using 5-fold cross-validation on the validation set (calculate the AUC and accuracy of each fold)