In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.utils import shuffle


Preparing the dataset. In this step we make sure that only New Music Friday and No Playlist songs remain in the dataset.

In [None]:
# Load the labelled song dataframe.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
dfc = pd.read_pickle("dataframe_w_labels.pkl")

# Keep the two groups this notebook compares, selected by playlist label.
dfd_newmf = dfc.loc[dfc['PlaylistTitle'].eq('New Music Friday')]
dfd_nop = dfc.loc[dfc['PlaylistTitle'].eq('No Playlist')]


In [None]:
# Shuffle the "No Playlist" songs with a fixed seed so that the sampled subset
# (and every result derived from it) is reproducible after a kernel restart.
dfd_nop = shuffle(dfd_nop, random_state=42)

# Balance the classes: keep as many "No Playlist" songs as there are
# "New Music Friday" songs (the original notebook hard-coded this as 291).
dfd_nop = dfd_nop.head(len(dfd_newmf))
len(dfd_nop)


In [None]:
# Stack the sampled "No Playlist" rows on top of the "New Music Friday" rows.
combined_frames = [dfd_nop, dfd_newmf]
new_music_friday_or_none = pd.concat(objs=combined_frames, axis=0)

In [None]:
# Drop the row labels carried over from the source frame and renumber rows from 0.
new_music_friday_or_none = new_music_friday_or_none.reset_index(drop=True)


Defining the pipeline with the necessary preprocessing steps.

In [None]:
# Preprocessing + classifier pipeline.
pipeline = Pipeline([

    # Column-wise preprocessing: one-hot encode `mode` and min-max scale the
    # remaining audio features. Every feature column is listed explicitly, so
    # `remainder="passthrough"` only matters if extra columns are passed in.
    ('onehot_categories', ColumnTransformer([
        ("onehot", OneHotEncoder(), ["mode"]),
        ("normalize", MinMaxScaler(), ['danceability', 'energy', 'loudness','acousticness',
                                        'instrumentalness','valence', 'tempo', 'duration_ms', 'NoFeaturing'])
    ], remainder="passthrough")),

    # Despite the step name, this is a classifier (SVC), not a regressor; the name
    # is kept so existing `named_steps['regression']` lookups keep working.
    # `probability=True` fits an internal cross-validated calibration that shuffles
    # the data, so `random_state` is fixed to make `predict_proba` reproducible.
    ('regression', SVC(kernel = 'poly', probability=True, gamma='scale', C=1.0, degree=3,
                       random_state=42)),
])

Splitting the dataset into X_train, X_test, y_train and y_test.

In [None]:
# Feature columns fed to the pipeline; the target is the playlist label.
feature_columns = [
    'danceability', 'mode', 'energy', 'loudness', 'acousticness',
    'instrumentalness', 'valence', 'tempo', 'duration_ms', 'NoFeaturing',
]
X = new_music_friday_or_none[feature_columns]
y = new_music_friday_or_none['PlaylistTitle']

# Hold out 33% of the songs for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

Fitting the model to the training data.

In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)


Calculating the model's score (mean accuracy) on the test set.

In [None]:
# Score the fitted pipeline on the held-out test split.
pipeline.score(X=X_test, y=y_test)

In [None]:
# Classification report (per-class precision / recall / F1) on the held-out test split.
# Note: this is a single train/test evaluation, not cross-validation.
# `classification_report` is imported in the notebook's top import cell.
y_pred1 = pipeline.predict(X_test)
report = classification_report(y_test, y_pred1)
print(report)  # the report is a pre-formatted string, so print() keeps its layout

In the following two cells we create a dataframe in which each song has one column per playlist, holding the predicted probability that the song belongs to that playlist. The last two columns are "Classified playlist", the playlist that the model selects for a given song, and "Max value", the predicted probability (in percent) that the song belongs to that classified playlist.

In [None]:
# Per-class probabilities for every test song.
# Column labels come from the fitted pipeline's `classes_`, which is exactly the
# column order `predict_proba` uses. Deriving them from the full dataset could
# mislabel columns if a class were missing from the training split.
playlists = pipeline.classes_
prob = pipeline.predict_proba(X_test)
pro_df = pd.DataFrame(data=prob, columns=playlists)

In [None]:
# Work on the probability columns only. Taking the row-wise max over the whole
# frame would include the string 'Classified playlist' column (a TypeError on
# pandas >= 2.0) and, when the cell is re-run, the already-scaled 'Max value' column.
proba_only = pro_df.drop(columns=['Classified playlist', 'Max value'], errors='ignore')

# Most probable playlist per song; exact ties resolve deterministically to the
# first column instead of an unseeded random pick.
# NOTE(review): for SVC the argmax of predict_proba may occasionally disagree with
# pipeline.predict(); confirm which of the two this column is meant to reflect.
pro_df['Classified playlist'] = proba_only.idxmax(axis=1)

# Probability of the classified playlist, expressed in percent.
pro_df['Max value'] = proba_only.max(axis=1) * 100
pro_df.head(10)

In the block below we create a histogram showing how many songs fall into each range of predicted probability (in percent) for their classified playlist.

In [None]:
# Distribution of the model's confidence in its own prediction ('Max value' is the
# predicted probability, in percent, of the classified playlist). This is not a
# measure of correctness, so the x-axis label says so explicitly.
plt.hist(pro_df['Max value'], bins = [0,20,40,60,80,100], edgecolor='k', color='dodgerblue')
plt.ylabel('Number of songs')
plt.xlabel('Predicted probability of the classified playlist (%)')
# Uncomment to export the figure:
# plt.savefig("Hist2.pdf")

Lastly, we create a dataframe in which we compare the actual playlist that a song is on with the playlist predicted by the model.

In [None]:
# Lift pandas' row/column display limits for this notebook (global setting).
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Build the comparison in a new frame instead of overwriting `y_test`: the earlier
# score / classification-report cells need `y_test` to remain a label Series, and
# rebinding it to a DataFrame would break them when they are re-run.
comparison_df = (
    y_test.to_frame()                                   # column keeps the Series name, 'PlaylistTitle'
    .assign(Prediction=pipeline.predict(X_test))        # positional: same row order as X_test / y_test
    .assign(equal=lambda df: df["Prediction"] == df["PlaylistTitle"])  # True where the model is right
)

comparison_df.head(10)