In [1]:
import pandas as pd

In [2]:
from spotify.data_transformations import DataTransformations
from spotify.train_test_split import DatasetSplit

In [3]:
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score

In [4]:
df = pd.read_csv(
    "/home/sabyasachi/git/spotify_tracks/dataset/tracks.csv"
)  # Read dataframe

In [5]:
DF = (df
.pipe(DataTransformations.preprocess)
.pipe(DataTransformations.process)
)

In [6]:
DF.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
279704,0,147840,0,0.733,0.859,9,-5.03,0,0.033,0.479,0.0645,0.105,0.52,124.948,4
93413,0,245227,0,0.497,0.73,1,-4.561,0,0.0573,0.591,1e-06,0.19,0.496,153.03,4
218976,0,257827,0,0.699,0.71,0,-5.525,0,0.0295,0.0588,0.0,0.134,0.557,102.06,4
470829,0,218205,0,0.786,0.628,4,-6.226,0,0.0694,0.0113,0.00551,0.0676,0.31,121.218,4
380728,0,228160,0,0.518,0.645,4,-5.579,0,0.0329,0.529,0.000814,0.177,0.664,164.039,4


In [7]:
train_set = DatasetSplit.return_train(DF)
test_set = DatasetSplit.return_test(DF)
print("The lengths of the train and test sets are {} and {} respectively.".format(len(train_set), len(test_set)))

The lengths of the train and test sets are 84941 and 21236 respectively.


In [8]:
# X_train, X_test, y_train, t_test split
X_train = train_set.copy().drop(columns=['popularity'])
y_train = train_set['popularity']
X_test = test_set.copy().drop(columns=['popularity'])
y_test = test_set['popularity']


In [17]:
y_train_array = y_train.to_numpy

In [21]:
# Dummy Classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_train_pred = dummy_clf.predict(X_train)
print(dummy_clf.score(X_train, y_train))

# precision, recall
print(f"The precision score is {precision_score(dummy_train_pred, y_train)}.")

print(f"The recall score is {recall_score(dummy_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, dummy_train_pred)}.")

0.8919720747342272
The precision score is 0.0.
The recall score is 0.0.
The AUC score is 0.5.


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Decision Tree
decision_clf = tree.DecisionTreeClassifier()
decision_clf.fit(X_train, y_train)
decision_train_pred = decision_clf.predict(X_train)
print(decision_clf.score(X_train, y_train))

# precision, recall, roc score
print(f"The precision score is {precision_score(decision_train_pred, y_train)}.")
print(f"The recall score is {recall_score(decision_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, decision_train_pred)}.")



0.9936426460719794
The precision score is 0.9439843068875327.
The recall score is 0.9970073664825047.
The AUC score is 0.9718205702589184.


In [23]:
# Logistic Regression
logistic_clf = LogisticRegression(random_state=0)
logistic_clf.fit(X_train, y_train)
logistic_train_pred = logistic_clf.predict(X_train)
print(logistic_clf.score(X_train, y_train))

# precision, recall
print(f"The precision score is {precision_score(logistic_train_pred, y_train)}.")
print(f"The recall score is {recall_score(logistic_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, logistic_train_pred)}")


0.8919720747342272
The precision score is 0.0.
The recall score is 0.0.
The AUC score is 0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Random Forests
random_clf = RandomForestClassifier(max_depth=20, random_state=0, class_weight= "balanced")
random_clf.fit(X_train, y_train)
random_train_pred = random_clf.predict(X_train)
print(random_clf.score(X_train, y_train))

print(f"The precision score is {precision_score(random_train_pred, y_train)}.")
print(f"The recall score is {recall_score(random_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, random_train_pred)}")


0.96882541999741
The precision score is 0.9845248474280732.
The recall score is 0.782842287694974.
The AUC score is 0.9757244444360059


Now we evaluate the models on the test set.

In [25]:
# Decision Tree
decision_test_pred = decision_clf.predict(X_test)
print(decision_clf.score(X_test, y_test))

# precision, recall, roc score
print(f"The precision score is {precision_score(decision_test_pred, y_test)}.")
print(f"The recall score is {recall_score(decision_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, decision_test_pred)}.")

0.8145601808250141
The precision score is 0.22421908429610612.
The recall score is 0.1978104945262363.
The AUC score is 0.5558896363329305.


In [26]:
# Logistic Regression
logistic_test_pred = logistic_clf.predict(X_test)
print(logistic_clf.score(X_test, y_test))

# precision, recall, roc score
print(f"The precision score is {precision_score(logistic_test_pred, y_test)}.")
print(f"The recall score is {recall_score(logistic_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, logistic_test_pred)}.") # Nothing suprising about the precision, recall and AUC scores.

0.889951026558674
The precision score is 0.0.
The recall score is 0.0.
The AUC score is 0.5.


  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Random Forests
random_test_pred = random_clf.predict(X_test)
print(random_clf.score(X_test, y_test))

# precision, recall, roc score
print(f"The precision score is {precision_score(random_test_pred, y_test)}.")
print(f"The recall score is {recall_score(random_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, random_test_pred)}.")

0.8593426257298926
The precision score is 0.2173727000427899.
The recall score is 0.304921968787515.
The AUC score is 0.5780498084054364.


Based on the precision and recall scores, and the AUC score, we should go with random forests. 