In [None]:
#Final project ----- Pier Luca Anania

#libraries
import pandas as pd
import seaborn as sns
import plotly.express as xp
import plotly.graph_objects as go
import numpy as np
from datetime import datetime
import missingno
import yaml
from collections import Counter
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV,ShuffleSplit
from sklearn.manifold import TSNE
from sklearn.linear_model import RidgeClassifier
from sklearn.impute import SimpleImputer

# Widen pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Three-colour palette reused by the plots below; palplot renders a preview strip.
palette = ['#000000',"#13e600","#fc0000"]
sns.palplot(palette)

#import data
test = pd.read_csv('../input/my-musical-preferences-data/test.csv')
train = pd.read_csv('../input/my-musical-preferences-data/train.csv')
# Open the YAML description inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
# NOTE(review): FullLoader can build more than plain dicts/lists; if the file
# only holds simple mappings, yaml.safe_load would be the safer choice.
with open("../input/description/Description.yaml", 'r') as description_file:
    description = yaml.load(description_file, Loader=yaml.FullLoader)

# Stack train on top of test so every preprocessing step is applied to both.
music = pd.concat([train,test]).reset_index(drop=True)
# True for rows that carry a Category value, i.e. the rows coming from `train`
# (assumes test.csv has no Category values — TODO confirm).
tr_mask = ~music.Category.isna()
music.head()

In [None]:
# Column dtypes and non-null counts: train first, a divider, then test and the
# combined frame. pandas reports string columns as dtype "object".
divider = '_' * 40
train.info()
print(divider)
for frame in (test, music):
    frame.info()

In [None]:
# Bar chart of the number of non-null values in each column of `music`.
bar_options = {"color": palette, "figsize": (30, 2)}
missingno.bar(music, **bar_options)

**Data Preparation**

In [None]:
#Code to split in one hot  (nxn) with 0-1 elements
def split_to_onehot(df, col):
    """
    Convert a '|'-separated multi-label column into one-hot indicator columns.

    Each distinct non-empty token found in ``df[col]`` becomes a 0/1 column.
    Tokens that carry no usable signal are then dropped:

    * tokens that occur exactly once in the whole frame,
    * tokens that never occur in rows whose Category is "none"
      (present only in the labelled rows),
    * tokens that never occur in rows whose Category is "like" or "dislike"
      (present only in the unlabelled rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and a "Category" column whose values include
        "like", "dislike" and "none".
    col : str
        Name of the column holding the '|'-separated tokens. Missing values
        are treated as "no tokens".

    Returns
    -------
    pandas.DataFrame
        "Category" followed by the surviving indicator columns (int8), in
        alphabetical token order, indexed like ``df``.

    Raises
    ------
    KeyError
        If "none", "like" or "dislike" never occurs in ``df["Category"]``.
    """
    # str.get_dummies splits on the separator, ignores every empty token
    # (trailing '|' or '||') and returns columns in sorted order, so the
    # result does not depend on set iteration order.
    tokens = df[col].fillna("")
    indicators = tokens.str.get_dummies(sep="|")

    # Occurrences of each token per Category value.
    per_category = indicators.groupby(df["Category"]).sum()
    total = per_category.sum()
    in_unlabelled = per_category.loc["none"]
    in_labelled = per_category.loc[["like", "dislike"]].sum()

    # Dropping unnecessary values
    drop_mask = (total == 1) | (in_unlabelled == 0) | (in_labelled == 0)
    kept = indicators.loc[:, ~drop_mask].astype(np.int8)

    return pd.concat([df.loc[:, ["Category"]], kept], axis=1)

**Preprocessing data**

In [None]:
# Turn the numeric target into readable labels: 0 -> "dislike", 1 -> "like",
# and missing values -> "none".
print(music['Category'].unique())

category_filled = music["Category"].fillna("none")
music["Category"] = category_filled.replace({0:"dislike",1:"like"})

music['Category'].unique()

In [None]:
# Look at the raw Version column and how many of its entries are non-null.
version_column = music['Version']
print(version_column)
print(version_column.count())

In [None]:
# Missing versions become the explicit string category "NA" so that
# LabelEncoder sees a single, uniformly typed column.
music["Version"] = music["Version"].fillna("NA")

# Map each distinct version string to an integer code.
label_encoder = LabelEncoder()
music.Version = label_encoder.fit_transform(music.Version)
# Show the resulting codes (the original cell repeated this line twice).
music['Version'].unique()

In [None]:
# Album_type has missing values; replace NaN with the explicit category "NA"
# and map every distinct value to an integer code.
music["Album_type"] = music["Album_type"].fillna("NA")
label_encoder = LabelEncoder()
music.Album_type = label_encoder.fit_transform(music.Album_type)
# Print the codes explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(music['Album_type'].unique())
# After the fill every row is non-null, so this is the total row count.
print(music['Album_type'].count())


In [None]:
# One-hot encode Key: one 0/1 column per distinct key value.
key_encoder = OneHotEncoder()
key_onehot = key_encoder.fit_transform(music[["Key"]]).toarray()
# Name the new columns from the encoder's own category list. The encoder emits
# its output columns in `categories_` order, whereas iterating a set() gives an
# arbitrary order, which could attach a key's name to another key's column.
key_columns = list(key_encoder.categories_[0])
music[key_columns] = key_onehot
# The original text column is no longer needed once the indicators exist.
music = music.drop("Key", axis=1)
music.head()  # to check


In [None]:
# Map the 'Vocal ' column (note the trailing space in its name) to integer
# codes. Missing values are first turned into the placeholder string "NAN"
# so that they can be mapped like any other category.
vocal_codes = {"M": 1, 'F': 2, 'F|M': 3, "N": 4,"NAN":0}
music['Vocal '] = music['Vocal '].fillna("NAN")
music.loc[:,'Vocal '] = music.loc[:,'Vocal '].replace(vocal_codes)
music['Vocal '].unique()  # to check

In [None]:
# Print the description of each numeric audio feature, then show their dtypes
# and non-null counts.
feature_names = ["Energy","Happiness","Dancebility","BPM"]
for k in feature_names:
    print(f"{k}:{description[k]}")

music.loc[:, feature_names].info()

In [None]:
# The SimpleImputer class provides basic strategies for imputing missing values
# (its default strategy replaces NaN with the column mean).
numeric_cols = ['Energy', 'Happiness', 'Dancebility', 'BPM']
imputer = SimpleImputer()
dataclean = pd.DataFrame(
    imputer.fit_transform(music[numeric_cols]),
    columns=numeric_cols,
    index=music.index,
)
# Write the imputed values back. The original cell computed `dataclean`, never
# used it, and filled the gaps with 0 instead.
music[numeric_cols] = dataclean
# Every row should now have a value; compare against the actual row count
# rather than a hard-coded number.
print(music['Energy'].count() == len(music))
print(music['Happiness'].count() == len(music))
print(music['Dancebility'].count() == len(music))

In [None]:
# Artists_Genres: look at the raw '|'-separated strings before encoding them.
music['Artists_Genres']

In [None]:
# Build the 0/1 genre indicator columns and swap them in for the raw
# Artists_Genres text column.
matrix = split_to_onehot(music, 'Artists_Genres').drop("Category", axis=1)

music = pd.concat([music, matrix], axis=1).drop("Artists_Genres", axis=1)

music.head()  # first 5 rows to check

In [None]:
# Integer-encode the remaining text feature columns (codes run from 0 to
# n_classes-1). One encoder per column is kept under its own name.
track_encoder, country_encoder, artists_encoder, album_encoder, label_encoder = (
    LabelEncoder() for _ in range(5)
)

# Track is encoded as-is, without a missing-value fill.
music["Track"] = track_encoder.fit_transform(music["Track"])

# For the other columns, missing values first become the category "NA".
for column, encoder in (
    ("Country", country_encoder),
    ("Artists", artists_encoder),
    ("Album", album_encoder),
    ("Labels", label_encoder),
):
    music[column] = music[column].fillna("NA")
    music[column] = encoder.fit_transform(music[column])

music     # check that the encoded columns are now numeric


**Model Selection**

In [None]:
#Classifier using Ridge regression.
#This classifier first converts the target values into {-1, 1} and then
#treats the problem as a regression task (multi-output regression in the multiclass case).
from sklearn.model_selection import cross_val_score

# Labelled rows form the training set, unlabelled rows the submission set.
# NOTE(review): iloc[:, 2:] skips the first two columns by position —
# presumably an id column and "Category"; confirm against the CSV layout.
x, y = music.loc[tr_mask].iloc[:,2:], music.loc[tr_mask,"Category"]
deploy = music.loc[~tr_mask].iloc[:,2:]

rc = RidgeClassifier()
grid = {"alpha" : [0.007,0.008,0.0085],"solver": ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
# Random-permutation cross-validator: 10 reshuffle/split iterations;
# random_state=0 makes the splits reproducible across runs.
cv = ShuffleSplit(n_splits=10,random_state=0)
# Search over alpha and solver; with the default refit, the best parameter
# combination is refitted on all of (x, y) and exposed as best_estimator_.
clf = GridSearchCV(rc, grid, cv=cv)
clf.fit(x,y)

best_rc = clf.best_estimator_
print(best_rc)

# NOTE(review): this directory name differs from the one train.csv/test.csv
# were read from ('../input/my-musical-preferences-data/'); confirm which is correct.
sample = pd.read_csv('../input/mymusicalpreferencesdata/sample_submition.csv')
# Assumes the rows of the sample file are in the same order as test.csv — TODO confirm.
sample["Category"] = clf.predict(deploy)
sample["Category"] = (sample["Category"]=="like").astype(int)
sample.to_csv("deploy.csv", index=False)

#Score of the tuned Ridge Classifier on the training data. The original cell
#scored a separately fitted default-parameter `rc`, i.e. not the model that
#produced the submission.
score = best_rc.score(x, y)
print('Score Rc:' , score)

#Cross-validated score of the tuned model on the same shuffle splits.
cv_scores = cross_val_score(best_rc, x, y, cv=cv)
print("CV average score: %.2f" % cv_scores.mean())

final = pd.read_csv('./deploy.csv')
final