This notebook compares several machine-learning (ML) classifiers to identify which model should be trained on the data.


In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import *
from scipy.stats import randint
from imblearn.ensemble import BalancedRandomForestClassifier, RUSBoostClassifier
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Candidate estimators to compare. Every model is constructed with its
# library defaults, except k-nearest-neighbours, which uses 3 neighbours.
classifiers = [
    # Linear / probabilistic baselines
    LogisticRegression(),
    GaussianNB(),
    # Instance-based
    KNeighborsClassifier(n_neighbors=3),
    # Tree ensembles from scikit-learn
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    # Discriminant analysis
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    # Ensembles imported from imblearn
    BalancedRandomForestClassifier(),
    RUSBoostClassifier(),
]

Choose the year whose data you wish to test the models on.

In [None]:
# Year whose CSV files are loaded; edit this value to evaluate another year.
year = 2021

# The file names are keyed by the year as a string.
year = str(year)
df = pd.read_csv('../csv_files/years/' + year + '.csv')
df2 = pd.read_csv('../csv_files/random_sample/' + year + '.csv')
artist_id = pd.read_csv('../csv_files/artist_id.csv')

# Build the working frame from scratch on every run: rows of years/<year>.csv
# followed by rows of random_sample/<year>.csv. The previous version appended
# to any `df3` already living in the kernel, so re-running this cell silently
# duplicated rows (and duplicated rows leak across cross-validation folds).
# pd.concat also replaces DataFrame.append, which was removed in pandas 2.0.
df3 = pd.concat([df, df2])


In [None]:
# Collapse the artist table into a plain {artist name: numeric id} lookup.
# NOTE: this rebinds `artist_id` from the DataFrame loaded earlier to a dict,
# so re-running this cell on its own (without reloading the CSV first) will
# not work as intended.
keys = artist_id['artist'].tolist()
values = artist_id['artist_id'].tolist()
artist_id = {name: ident for name, ident in zip(keys, values)}

In [None]:
# Each 'spotify_artist' value appears to be a stringified list such as
# "['A', 'B']": trim the bracket/quote characters from both ends, then split
# on the "', '" separator to recover the individual names.
arr = df3['spotify_artist'].to_numpy().astype(str)
arr = np.char.split(
    np.char.lstrip(np.char.rstrip(arr, "']"), "['"),
    "', '",
)

# Encode every row by its first listed artist.
artist = []
for names in arr:
    lead = names[0]
    # Known artists keep their id; an unseen artist is registered with
    # (current lookup size + 1) so later rows with the same name reuse it.
    artist.append(artist_id.setdefault(lead, len(artist_id) + 1))
df3['spotify_id'] = artist

In [None]:
# Columns from position 6 onward hold the modelling data; the first six
# columns are skipped.
Xy = df3.iloc[:, 6:].to_numpy()
X = Xy[:, :-2]   # features: everything except the last two columns
y = Xy[:, -1]    # raw target: the last column

# Binarise the target: values <= 0 become 0, values > 0 become 1.
# Work on a copy so the slice of Xy is not modified in place.
label = y.copy()
label[y <= 0] = 0
label[y > 0] = 1
y = label

# Fixed random_state so the 70/30 hold-out split is identical on every run
# (previously the split changed each time the cell was executed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

The following code prints the 10-fold cross-validated prediction scores (accuracy, F1, recall, precision and AUC) of each of the models compared in this notebook.

In [None]:
# Compare every candidate with 10-fold cross-validation on the full X, y and
# remember the one with the highest mean F1.
max_score = 0   # best mean F1 seen so far
max_test = ""   # class name of the model that achieved it
for clf in classifiers:
    print("="*30)
    name = clf.__class__.__name__
    print(name)
    score = cross_validate(clf, X, y, scoring=['f1', 'precision', 'recall', 'accuracy', 'roc_auc'], n_jobs = -1, cv = 10)

    mean_f1 = np.mean(score['test_f1'])
    print("Accuracy: {:.4%}".format(np.mean(score['test_accuracy'])))
    print("F1: {:.4%}".format(mean_f1))
    print("Recall: {:.4%}".format(np.mean(score['test_recall'])))
    print("Precision: {:.4%}".format(np.mean(score['test_precision'])))
    print("AUC: {:.4%}".format(np.mean(score['test_roc_auc'])))

    # Compare and store the SAME metric. The earlier version tested F1 but
    # stored ROC-AUC, so each model's F1 was measured against the previous
    # winner's AUC and the selection was inconsistent.
    if mean_f1 > max_score:
        max_score = mean_f1
        max_test = name

print("="*30)
# Report the winner so the comparison has an explicit outcome.
if max_test:
    print("Best model by mean F1: {} ({:.4%})".format(max_test, max_score))

Creates:
1. A CSV file that records the prediction scores of the model for each year.
2. A CSV file of the model's feature importances for each year.
3. A `.pkl` file containing the trained ML model for each year.

In [None]:
# Train one random forest per year (1980-2021) and record, for each year:
#   * mean 10-fold cross-validated accuracy / recall / precision,
#   * the fitted model's feature importances,
#   * the fitted model itself, pickled to ../ml_models/<year>.pkl.
SCORE_COLUMNS = ['accuracy', 'recall', 'precision']
RANDOM_STATE = 42  # fixed seed so splits, forests and saved models are reproducible

# The artist table does not change between years, so read it once. Each year
# works on its own copy of the lookup (see below), which assigns ids to unseen
# artists exactly as if the file had been re-read for that year.
artist_table = pd.read_csv('../csv_files/artist_id.csv')
base_artist_id = dict(zip(artist_table['artist'], artist_table['artist_id']))

score_rows = []     # one [accuracy, recall, precision] row per year
score_years = []    # year for each row, used as the index of ml_df
f_importances = []  # one feature-importance vector per year
for yr in range(1980, 2022):
    year = str(yr)
    df = pd.read_csv('../csv_files/years/' + year + '.csv')
    df2 = pd.read_csv('../csv_files/random_sample/' + year + '.csv')
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df3 = pd.concat([df, df2])

    # Fresh {artist name: id} lookup for this year.
    artist_id = dict(base_artist_id)

    # Strip the list punctuation from the stringified artist lists and split
    # them into individual names; each row is encoded by its first artist.
    arr = df3['spotify_artist'].to_numpy().astype(str)
    arr = np.char.rstrip(arr, "']")
    arr = np.char.lstrip(arr, "['")
    arr = np.char.split(arr, "', '")
    artist = []
    for a_list in arr:
        # Unseen artists are registered with (current lookup size + 1).
        artist.append(artist_id.setdefault(a_list[0], len(artist_id) + 1))
    df3['spotify_id'] = artist

    # Columns from position 6 onward: all but the last two are features,
    # the last one is the raw target, binarised to 0 (<= 0) / 1 (> 0).
    Xy = df3.iloc[:, 6:].to_numpy()
    X = Xy[:, :-2]
    y = Xy[:, -1]
    label = y.copy()
    label[y <= 0] = 0
    label[y > 0] = 1
    y = label

    # The saved model is fitted on the 70% training split ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=RANDOM_STATE
    )
    clf = RandomForestClassifier(random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)

    # ... while the reported scores come from 10-fold cross-validation over
    # the full X, y. Only the three recorded metrics are computed.
    score = cross_validate(clf, X, y, scoring=['precision', 'recall', 'accuracy'], n_jobs = -1, cv = 10)
    score_rows.append([
        np.mean(score['test_accuracy']),
        np.mean(score['test_recall']),
        np.mean(score['test_precision']),
    ])
    score_years.append(yr)
    f_importances.append(clf.feature_importances_)

    # Path fixed from '..ml_models/' (missing slash) to '../ml_models/', in
    # line with the other relative paths; `with` closes the file handle.
    with open('../ml_models/' + year + '.pkl', 'wb') as model_file:
        pickle.dump(clf, model_file)

# Build the score table once instead of appending row by row, and index it by
# year (previously every row carried index 0).
ml_df = pd.DataFrame(score_rows, columns=SCORE_COLUMNS, index=score_years)

df_importance = pd.DataFrame(f_importances, columns=['artist_id', 'danceability', 'energy', 'key' , 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'])
df_importance.to_csv('../csv_files/f_importances.csv')
ml_df.to_csv('../csv_files/ml_scores.csv')
