In [23]:
import pandas as pd
import numpy as np

In [6]:
from spotify.data_transformations import DataTransformations
from spotify.train_test_split import DatasetSplit

In [33]:
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score

In [8]:
df = pd.read_csv(
    "D:/Github/spotify-tracks-project/dataset/tracks.csv"
)  # Read dataframe

In [9]:
DF = (df
.pipe(DataTransformations.preprocess)
.pipe(DataTransformations.process)
)

In [10]:
DF.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
414876,0,279000,0,0.654,0.528,9,-5.674,1,0.0256,0.772,0.0,0.134,0.473,95.045,4
481510,0,160000,1,0.584,0.547,7,-8.502,0,0.382,0.655,2e-06,0.134,0.56,83.917,4
360011,0,223368,0,0.731,0.659,6,-4.018,1,0.145,0.176,0.00166,0.1,0.806,176.0,4
77287,0,214825,0,0.496,0.928,5,-9.056,0,0.0612,0.00237,0.864,0.145,0.0384,143.005,4
529772,0,330730,0,0.885,0.976,7,-7.868,1,0.0953,0.324,7e-06,0.0536,0.63,115.006,3


In [11]:
train_set = DatasetSplit.return_train(DF)
test_set = DatasetSplit.return_test(DF)
print("The lengths of the train and test sets are {} and {} respectively.".format(len(train_set), len(test_set)))

The lengths of the train and test sets are 84941 and 21236 respectively.


In [12]:
# X_train, X_test, y_train, t_test split
X_train = train_set.copy().drop(columns=['popularity'])
y_train = train_set['popularity']
X_test = test_set.copy().drop(columns=['popularity'])
y_test = test_set['popularity']


In [13]:
y_train_array = y_train.to_numpy

In [14]:
# Dummy Classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_train_pred = dummy_clf.predict(X_train)
print(dummy_clf.score(X_train, y_train))

# precision, recall
print(f"The precision score is {precision_score(dummy_train_pred, y_train)}.")

print(f"The recall score is {recall_score(dummy_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, dummy_train_pred)}.")

0.8908183327250679
The precision score is 0.0.
The recall score is 0.0.
The AUC score is 0.5.


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Decision Tree
decision_clf = tree.DecisionTreeClassifier()
decision_clf.fit(X_train, y_train)
decision_train_pred = decision_clf.predict(X_train)
print(decision_clf.score(X_train, y_train))

# precision, recall, roc score
print(f"The precision score is {precision_score(decision_train_pred, y_train)}.")
print(f"The recall score is {recall_score(decision_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, decision_train_pred)}.")



0.9935955545614015
The precision score is 0.9441449212853138.
The recall score is 0.9970393987702117.
The AUC score is 0.9719006552321081.


In [16]:
# Logistic Regression
logistic_clf = LogisticRegression(random_state=0)
logistic_clf.fit(X_train, y_train)
logistic_train_pred = logistic_clf.predict(X_train)
print(logistic_clf.score(X_train, y_train))

# precision, recall
print(f"The precision score is {precision_score(logistic_train_pred, y_train)}.")
print(f"The recall score is {recall_score(logistic_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, logistic_train_pred)}")


0.8908183327250679
The precision score is 0.0.
The recall score is 0.0.
The AUC score is 0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Random Forests
random_clf = RandomForestClassifier(max_depth=20, random_state=0, class_weight= "balanced")
random_clf.fit(X_train, y_train)
random_train_pred = random_clf.predict(X_train)
print(random_clf.score(X_train, y_train))

print(f"The precision score is {precision_score(random_train_pred, y_train)}.")
print(f"The recall score is {recall_score(random_train_pred, y_train)}.")
print(f"The AUC score is {roc_auc_score(y_train, random_train_pred)}")


0.96745976619065
The precision score is 0.986197972827259.
The recall score is 0.776268884739433.
The AUC score is 0.9756805609441381


Now we evaluate the models on the test set.

In [18]:
# Decision Tree
decision_test_pred = decision_clf.predict(X_test)
print(decision_clf.score(X_test, y_test))

# precision, recall, roc score
print(f"The precision score is {precision_score(decision_test_pred, y_test)}.")
print(f"The recall score is {recall_score(decision_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, decision_test_pred)}.")

0.8260030137502354
The precision score is 0.2393925859758821.
The recall score is 0.2120253164556962.
The AUC score is 0.5672669620409494.


In [19]:
# Logistic Regression
logistic_test_pred = logistic_clf.predict(X_test)
print(logistic_clf.score(X_test, y_test))

# precision, recall, roc score
print(f"The precision score is {precision_score(logistic_test_pred, y_test)}.")
print(f"The recall score is {recall_score(logistic_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, logistic_test_pred)}.") # Nothing suprising about the precision, recall and AUC scores.

0.8945658316067056
The precision score is 0.0.
The recall score is 0.0.
The AUC score is 0.5.


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Random Forests
random_test_pred = random_clf.predict(X_test)
print(random_clf.score(X_test, y_test))

# precision, recall, roc score
print(f"The precision score is {precision_score(random_test_pred, y_test)}.")
print(f"The recall score is {recall_score(random_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, random_test_pred)}.")

0.8628272744396308
The precision score is 0.21125502456453774.
The recall score is 0.2919753086419753.
The AUC score is 0.5754385350753415.


Let us now try out some baseline models that we can build from our exploratory data analysis.

In [21]:
# Baseline model 1 (time signature)

def predict_by_time_signature(dataframe : pd.DataFrame):
    """This model predicts using time_signature. See EDA for these numbers."""
    reindexed_dataset = dataframe.copy().reset_index().drop(columns = ['index'])
    predictions = np.zeros(dataframe.shape[0], dtype = int)
    for i in reindexed_dataset.index:
        if reindexed_dataset.loc[i, 'time_signature']== 0:
            predictions[i] = np.random.choice(2, 1, p = [1-0.154,0.154])[0] # predict 0 with prob 0.846 and 1 with 0.154
        elif reindexed_dataset.loc[i, 'time_signature']== 1:
            predictions[i] = np.random.choice(2, 1, p = [1-0.102,0.102])[0]
        elif reindexed_dataset.loc[i, 'time_signature']== 3:
            predictions[i] = np.random.choice(2, 1, p = [1-0.102,0.102])[0]
        elif reindexed_dataset.loc[i, 'time_signature']== 4:
            predictions[i] = np.random.choice(2, 1, p = [1-0.109,0.109])[0]
        else:
            predictions[i] = np.random.choice(2, 1, p = [1-0.114,0.114])[0]
    return predictions

In [32]:
time_signature_test_pred = predict_by_time_signature(X_test)

# precision, recall, roc score
print(f"The precision score is {precision_score(time_signature_test_pred, y_test)}.")
print(f"The recall score is {recall_score(time_signature_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, time_signature_test_pred)}.")


The precision score is 0.10897722197409558.
The recall score is 0.10664335664335664.
The AUC score is 0.5006906428868214.


In [26]:
# Baseline model (instrumentalness)

def predict_by_instrumentalness(dataframe : pd.DataFrame):
    """This model predicts using instrumentalness. See EDA for these numbers."""
    reindexed_dataset = dataframe.copy().reset_index().drop(columns = ['index'])
    predictions = np.zeros(dataframe.shape[0], dtype = int)
    for i in reindexed_dataset.index:
        if reindexed_dataset.loc[i, 'instrumentalness'] <= 2.21e-05:
            predictions[i] = np.random.choice(2, 1, p = [1-0.128,0.128])[0] 
        elif reindexed_dataset.loc[i, 'instrumentalness']>= 2.21e-05 and reindexed_dataset.loc[i, 'instrumentalness'] <=  0.00756 :
            predictions[i] = np.random.choice(2, 1, p = [1-0.105722,0.105722])[0]
        else:
            predictions[i] = np.random.choice(2, 1, p = [1-0.053214,0.053214])[0]
    return predictions


In [30]:
instrumentalness_test_pred = predict_by_instrumentalness(X_test)

# precision, recall, roc score
print(f"The precision score is {precision_score(instrumentalness_test_pred, y_test)}.")
print(f"The recall score is {recall_score(instrumentalness_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, instrumentalness_test_pred)}.")

The precision score is 0.1174631531933899.
The recall score is 0.11504811898512686.
The AUC score is 0.5054863273468133.


In [28]:
# Baseline model (danceability)

def predict_by_danceability(dataframe : pd.DataFrame):
    """This model predicts using danceability. See EDA for these numbers."""
    reindexed_dataset = dataframe.copy().reset_index().drop(columns = ['index'])
    predictions = np.zeros(dataframe.shape[0], dtype = int)
    for i in reindexed_dataset.index:
        if reindexed_dataset.loc[i, 'danceability'] <= 0.491:
            predictions[i] = np.random.choice(2, 1, p = [1-0.073480,0.073480])[0] # predict 0 with prob 0.846 and 1 with 0.154
        elif reindexed_dataset.loc[i, 'danceability']>= 0.491 and reindexed_dataset.loc[i, 'danceability'] <=  0.592 :
            predictions[i] = np.random.choice(2, 1, p = [1-0.095256,0.095256])[0]
        elif reindexed_dataset.loc[i, 'danceability']>= 0.592 and reindexed_dataset.loc[i, 'danceability'] <=  0.677 :
            predictions[i] = np.random.choice(2, 1, p = [1-0.108629,0.108629])[0]
        elif reindexed_dataset.loc[i, 'danceability']>= 0.677 and reindexed_dataset.loc[i, 'danceability'] <= 0.761 :
            predictions[i] = np.random.choice(2, 1, p = [1-0.125706,0.125706])[0]
        else:
            predictions[i] = np.random.choice(2, 1, p = [1-0.139471,0.139471])[0]
    return predictions


In [31]:
danceability_test_pred = predict_by_danceability(X_test)

# precision, recall, roc score
print(f"The precision score is {precision_score(danceability_test_pred, y_test)}.")
print(f"The recall score is {recall_score(danceability_test_pred, y_test)}.")
print(f"The AUC score is {roc_auc_score(y_test, danceability_test_pred)}.")

The precision score is 0.11701652523447968.
The recall score is 0.11506368028107158.
The AUC score is 0.5054735729293944.


Based on the precision and recall scores, and the AUC score, we should go with random forests. 

In [37]:
import pickle

In [44]:
filename = 'popular_or_not_model.pkl'
pickle.dump(random_clf, open(filename, 'wb'))

In [45]:
loaded_classifier = pickle.load(open(filename, 'rb')) # load saved model

In [47]:
loaded_classifier.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)