In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score
from sklearn.metrics import *
from scipy.stats import randint
from imblearn.ensemble import BalancedRandomForestClassifier, RUSBoostClassifier

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

In [3]:
# Candidate models scored by the comparison loop further down. Every estimator
# keeps its library defaults except k-NN, which is set to 3 neighbours.
classifiers = [
    LogisticRegression(),
    GaussianNB(),
    KNeighborsClassifier(n_neighbors=3),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    # Resampling-aware ensembles from imbalanced-learn.
    BalancedRandomForestClassifier(),
    RUSBoostClassifier(),
]



In [None]:
# Build the combined frame `df3` from the per-year files under years/ and the
# matching per-year files under random_sample/, for `start` through 2021.
start = 2020

# Collect every frame and concatenate once. DataFrame.append was removed in
# pandas 2.0, and the earlier `'df3' not in globals()` check made this cell
# stack new rows onto a stale `df3` whenever it was re-run (or run after any
# other cell that had already defined `df3`).
frames = []
for year in range(start, 2022):
    year = str(year)
    df = pd.read_csv('years/'+year+'.csv')
    df2 = pd.read_csv('random_sample/'+year+'.csv')
    frames.extend([df, df2])

# Same row order as the old append chain (year file, then its random sample,
# year by year); the original row labels are kept, as append did.
df3 = pd.concat(frames)

# The artist table does not depend on the year, so read it once rather than
# on every loop iteration.
artist_id = pd.read_csv('artist_weeks_data.csv')


In [None]:
# Collapse the artist table into a {artist: artist_id} lookup. After this cell
# `artist_id` is a plain dict rather than a DataFrame.
keys = artist_id['artist'].tolist()
values = artist_id['artist_id'].tolist()
artist_id = {name: ident for name, ident in zip(keys, values)}

In [None]:
# Encode the first-listed artist of every row as an integer id and store it in
# df3['spotify_id'].
#
# The 'spotify_artist' values are handled as text: the surrounding "['" / "']"
# is stripped and the remainder is split on "', '".
# NOTE(review): rstrip/lstrip remove *sets of characters*, so a name that
# itself starts or ends with a quote or bracket would lose those characters
# too — confirm this cannot happen in the data.
arr = df3['spotify_artist'].to_numpy().astype(str)
arr = np.char.split(np.char.lstrip(np.char.rstrip(arr, "']"), "['"), "', '")

artist = []
for a_list in arr:
    first = a_list[0]
    # Artists missing from the lookup get the next id (current size + 1) and
    # are remembered, so later rows with the same artist reuse that id.
    if first not in artist_id:
        artist_id[first] = len(artist_id) + 1
    artist.append(artist_id[first])

df3['spotify_id'] = artist

In [None]:
# Feature matrix and binary target taken positionally from df3: columns from
# position 6 onward are used, X is all of those except the last two, and y is
# the last one.
# NOTE(review): this relies on the column order of df3 (including where
# 'spotify_id' sits) — confirm the last column really is the target.
Xy = df3.iloc[:, 6:].to_numpy()
X = Xy[:, :-2]
y = Xy[:, -1]

# Binarise the target: values <= 0 become 0, values > 0 become 1.
label = y.copy()
label[y <= 0] = 0
label[y > 0] = 1
y = label

# Fixed random_state so the 70/30 hold-out split is the same on every run;
# without it the split (and anything fitted on it) changed per execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Score every candidate with 10-fold cross-validation on (X, y), print the mean
# of each metric, and remember the classifier with the best mean F1 in
# `max_test` / `max_score`.
max_score = 0
max_test = ""
scoring = ['f1', 'precision', 'recall', 'accuracy', 'roc_auc']
for clf in classifiers:
    print("="*30)
    name = clf.__class__.__name__
    print(name)
    score = cross_validate(clf, X, y, scoring=scoring, n_jobs = -1, cv = 10)
    mean_scores = {metric: np.mean(score['test_' + metric]) for metric in scoring}

    print("Accuracy: {:.4%}".format(mean_scores['accuracy']))
    print("F1: {:.4%}".format(mean_scores['f1']))
    print("Recall: {:.4%}".format(mean_scores['recall']))
    print("Precision: {:.4%}".format(mean_scores['precision']))
    print("AUC: {:.4%}".format(mean_scores['roc_auc']))

    # Compare and store the SAME metric. Previously the test used mean F1 but
    # the stored value was mean ROC-AUC, so later candidates were compared
    # against an AUC figure and the "best" model could be wrong.
    if mean_scores['f1'] > max_score:
        max_score = mean_scores['f1']
        max_test = name

print("="*30)

In [13]:
def _encode_primary_artist(raw_artists, artist_lookup):
    """Return one integer id per row for the first-listed artist.

    Parameters
    ----------
    raw_artists : pandas.Series
        Values are converted to text, stripped of the surrounding "['" / "']"
        and split on "', '"; only the first piece of each row is used.
    artist_lookup : dict
        Maps artist name to id. Mutated in place: a name that is not present
        is assigned ``len(artist_lookup) + 1``.

    Returns
    -------
    list
        Ids in the same order as ``raw_artists``.
    """
    # NOTE(review): rstrip/lstrip remove sets of characters, so names that
    # begin or end with a quote/bracket would be trimmed as well — confirm.
    names = np.char.rstrip(raw_artists.to_numpy().astype(str), "']")
    names = np.char.lstrip(names, "['")
    ids = []
    for a_list in np.char.split(names, "', '"):
        first = a_list[0]
        if first not in artist_lookup:
            artist_lookup[first] = len(artist_lookup) + 1
        ids.append(artist_lookup[first])
    return ids


# Train one RandomForest per year (1980-2021) and collect, per year, the mean
# 10-fold cross-validated accuracy/recall/precision (-> ml_df) and the feature
# importances of the forest fitted on the 70% training split (-> f_importances).
SEED = 42  # fixed so the split and the forests are reproducible across runs
score_columns = ['accuracy', 'recall', 'precision']
score_rows = []
f_importances = []

# The artist table is identical for every year: read it once. Each year still
# starts from a fresh copy of the lookup, so ids handed to unseen artists are
# the same as when the file was re-read inside the loop.
artist_table = pd.read_csv('csv_files/artist_id.csv')
base_artist_id = dict(zip(artist_table['artist'], artist_table['artist_id']))

for year in range(1980, 2022):
    year = str(year)
    df = pd.read_csv('csv_files/years/'+year+'.csv')
    df2 = pd.read_csv('csv_files/random_sample/'+year+'.csv')
    # DataFrame.append was removed in pandas 2.0; concat keeps the same rows,
    # order and row labels.
    df3 = pd.concat([df, df2])

    artist_id = dict(base_artist_id)
    df3['spotify_id'] = _encode_primary_artist(df3['spotify_artist'], artist_id)

    # Positional layout: columns from position 6 on; X drops the last two of
    # those, y is the last one, binarised to 0 (<= 0) / 1 (> 0).
    # NOTE(review): depends on df3's column order — confirm the last column
    # is the intended target.
    Xy = df3.iloc[:, 6:].to_numpy()
    X = Xy[:, :-2]
    y = Xy[:, -1]
    label = y.copy()
    label[y <= 0] = 0
    label[y > 0] = 1
    y = label

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=SEED
    )
    clf = RandomForestClassifier(random_state=SEED)
    clf.fit(X_train, y_train)
    score = cross_validate(clf, X, y, scoring=['f1', 'precision', 'recall', 'accuracy', 'roc_auc'], n_jobs = -1, cv = 10)

    # Collect plain records and build the frame once after the loop instead of
    # growing ml_df row by row.
    score_rows.append(
        {metric: np.mean(score['test_' + metric]) for metric in score_columns}
    )
    f_importances.append(clf.feature_importances_)
    # pickle.dump(clf, open('ml_models/' + year + '.pkl', 'wb'))

# One row per year in loop order. Rows are now labelled 0..n-1 (the appended
# version labelled every row 0).
ml_df = pd.DataFrame(score_rows, columns=score_columns)

In [15]:
# Show the collected feature importances as a table: one row per model fitted
# in the per-year loop (in loop order), one column per importance value.
pd.DataFrame(f_importances)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.562258,0.042053,0.039802,0.016702,0.034593,0.003699,0.033659,0.061694,0.044147,0.037957,0.032074,0.032602,0.05876
1,0.401008,0.07098,0.055335,0.02311,0.052666,0.005388,0.057247,0.068145,0.066355,0.046484,0.040283,0.047687,0.065311
2,0.356993,0.06509,0.050011,0.02336,0.05738,0.007292,0.052101,0.093704,0.074765,0.060352,0.043783,0.049226,0.065943
3,0.292794,0.096447,0.068251,0.02612,0.059468,0.006293,0.050817,0.117444,0.074357,0.045997,0.0402,0.046687,0.075126
4,0.251259,0.076205,0.070162,0.024743,0.064171,0.005399,0.086289,0.137497,0.065839,0.036873,0.042716,0.048069,0.090777
5,0.297178,0.066272,0.067885,0.028813,0.067455,0.00466,0.066587,0.104026,0.05813,0.046692,0.044982,0.039832,0.107487
6,0.297082,0.073138,0.064867,0.024634,0.057155,0.00409,0.060145,0.098532,0.055949,0.050831,0.049571,0.047656,0.116349
7,0.277675,0.077671,0.062035,0.022557,0.067303,0.005389,0.043586,0.105059,0.06583,0.049322,0.050591,0.049339,0.123645
8,0.260838,0.084589,0.073907,0.023127,0.064286,0.005084,0.061053,0.118202,0.045281,0.065486,0.042299,0.057619,0.098228
9,0.304701,0.056216,0.06755,0.023932,0.063149,0.006103,0.055488,0.119478,0.048376,0.055479,0.045587,0.056568,0.097373
