In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two CSV files of the Spotify songs dataset into dataframes.
# `playlists` is loaded here but not referenced again in the cells shown in this notebook.
genres = pd.read_csv(filepath_or_buffer='../input/dataset-of-songs-in-spotify/genres_v2.csv')
playlists = pd.read_csv(filepath_or_buffer='../input/dataset-of-songs-in-spotify/playlists.csv')

In [None]:
##This function was pulled from offline to let me see where any NA values to figure out how I'd deal with them
def assess_NA(data):
    """
    Summarise the missing values of a dataframe, column by column.

    Parameters
    ----------
    data: dataframe
        The dataframe to inspect.

    Returns
    -------
    dataframe
        One row per column of `data` that contains at least one NA value, indexed by
        column name and sorted by descending NA count, with two columns:
        'Number of NA' (count of NA values) and 'Percent NA' (that count as a
        percentage of the number of rows, rounded to 2 decimals).
        A dataframe without rows yields an empty report.
    """
    # NA count per column, largest first; the percentages are derived from this
    # already-sorted series so both columns share one row order
    total = data.isnull().sum().sort_values(ascending=False)

    n_rows = len(data.index)
    if n_rows:
        percent = (total / n_rows * 100).round(2)
    else:
        # No rows: avoid 0/0 = NaN, which the old "!= 0" filter treated as missing data
        percent = total.astype(float)

    # concatenate along the columns to create the complete dataframe
    df_NA = pd.concat([total, percent], axis=1, keys=['Number of NA', 'Percent NA'])

    # keep only the columns that actually have missing data
    return df_NA[df_NA['Number of NA'] > 0]

In [None]:
assess_NA(genres)
#Display the NA report for `genres` to see which columns have missing values and how many

Omit the 3 columns with 50% NaN values. Separate the predictors from the class (genre).

In [None]:
# Columns left out of the predictors: the first three are dropped for having too many NAs
# to be useful, and 'genre' is the target class itself
om = ['title', 'Unnamed: 0', 'song_name', 'genre']

# Predictors: every column of `genres` that is not listed in `om`
X = genres.drop(columns=om)
# Target class, kept as a one-column dataframe
y = genres[['genre']]

In [None]:
from sklearn import preprocessing

In [None]:
# Encoder that maps every genre label to a numeric code
gen_trans = (
    preprocessing.OrdinalEncoder()
    .fit(X=y)
)

In [None]:
# Target column with the genre labels replaced by their numeric codes
trans_y = pd.DataFrame(data=gen_trans.transform(X=y))

In [None]:
X.columns #Display the names of the predictor columns

Checking the diversity of each column. If the values in a column are all the same, or all different, then the column is not good for predictions. Homogeneous or overly unique columns will be omitted.

In [None]:
# Diversity check: for every predictor column record
# (number of distinct values, distinct values as a fraction of the row count).
# Columns whose values are all the same, or (nearly) all different, are not good for
# prediction and are omitted later.
n_rows = len(X)  # taken from the data instead of the hard-coded 42305
div_check = {}
for col in X:
    # nunique(dropna=False) counts NaN as one value; set() over a column can count
    # every NaN separately because NaN != NaN
    n_unique = X[col].nunique(dropna=False)
    div_check[col] = (n_unique, n_unique / n_rows if n_rows else 0.0)
    

In [None]:
div_check #Display the (distinct count, distinct fraction) pair recorded for each column

In [None]:
# Sort the predictor columns into "omit" and "categorical" using the diversity check
om = []   # more than half of the rows are distinct values, or only one value exists
cat = []  # kept columns with fewer than 50 distinct values

for name, (n_distinct, distinct_frac) in div_check.items():
    if n_distinct == 1 or distinct_frac > .5:
        om.append(name)
    elif n_distinct < 50:
        cat.append(name)

In [None]:
# Drop the columns that the diversity check marked for omission
X_clean = X.drop(columns=om)

In [None]:
# Categorical and continuous predictors get different preprocessing, so split them up
categ = X_clean.loc[:, cat]
contin = X_clean.drop(columns=cat)

In [None]:
# Standard scaler fitted on the continuous predictors.
# NOTE(review): it is fitted on all rows, before the train/test split further down, so the
# test rows contribute to the scaling statistics.
scaler = (
    preprocessing.StandardScaler()
    .fit(X=contin)
)

In [None]:
# Apply the fitted scaler and put the result back into a dataframe with the original column names
contin_scaled = pd.DataFrame(
    data=scaler.transform(X=contin),
    columns=contin.columns,
)

In [None]:
# One-hot encoder fitted on the categorical predictors
oh_enc = (
    preprocessing.OneHotEncoder()
    .fit(X=categ)
)


In [None]:
# Names of the one-hot encoded columns, used to label the encoded dataframe.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so use whichever this installation provides. The original
# column names are passed in so the generated names read "<column>_<category>".
if hasattr(oh_enc, 'get_feature_names_out'):
    oh_names = oh_enc.get_feature_names_out(list(categ.columns))
else:
    oh_names = oh_enc.get_feature_names(list(categ.columns))

In [None]:
# One-hot encode the categorical predictors; the encoder output is converted to a dense
# array before being wrapped in a dataframe
cat_encoded = pd.DataFrame(
    data=oh_enc.transform(X=categ).toarray(),
    columns=oh_names,
)

In [None]:
# Standard scaler fitted on the one-hot encoded categorical predictors
cat_scale = (
    preprocessing.StandardScaler()
    .fit(X=cat_encoded)
)

In [None]:
# Scale the encoded categorical predictors, keeping the one-hot column names
cat_enc_scale = pd.DataFrame(
    data=cat_scale.transform(X=cat_encoded),
    columns=oh_names,
)

In [None]:
# Recombine the processed predictors side by side: categorical columns first, then continuous
a = [cat_enc_scale, contin_scaled]
X_clean_tran = pd.concat(objs=a, axis='columns')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_clean_tran,
    trans_y,
    test_size=.2,
    random_state=345,
)

In [None]:
#Display the training predictors as a quick check of the preprocessed data
X_train

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression on the training data; the one-column target frame is flattened
# to a 1-D array for the estimator
clf = (
    LogisticRegression(random_state=987)
    .fit(X_train, np.ravel(y_train))
)

In [None]:
# Predicted class codes for the test set from the logistic regression
y_hat_clf = pd.DataFrame(data=clf.predict(X=X_test))

In [None]:
# Give predictions and actual labels the same 0..n-1 index so they line up when concatenated
y_hat_clf = y_hat_clf.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Decode the numeric codes back to genre names, for the actual labels and the logistic predictions
y_act = pd.DataFrame(data=gen_trans.inverse_transform(X=y_test))
y_clf = pd.DataFrame(data=gen_trans.inverse_transform(X=y_hat_clf))

In [None]:
# Logistic predictions and actual labels side by side; both inputs carry a single column
# named 0, which is renamed before joining
results_df = pd.concat(
    [
        y_hat_clf.rename(columns={0: 'clf'}),
        y_test.rename(columns={0: 'actual'}),
    ],
    axis=1,
)

In [None]:
# Confusion matrix built by hand with crosstab: rows are predicted classes, columns are actual classes
conf_mat = pd.crosstab(
    index=results_df['clf'],
    columns=results_df['actual'],
    rownames=['Predicted'],
    colnames=['Actual'],
)

In [None]:
#Display the logistic regression confusion matrix (rows: predicted, columns: actual)
conf_mat

In [None]:
import plotly.express as px

In [None]:
# Heat map of the raw confusion matrix. Perfect classification would show bright cells along
# the diagonal and dark cells everywhere else.
# Caveat: cells with higher raw counts look brighter even when their share of correct
# predictions is lower, so a scaled version is drawn afterwards.
px.imshow(img=conf_mat)

In [None]:
# Standardise every column (actual class) of the confusion matrix so that classes with many
# samples do not dominate the heat map. The class labels of both axes are carried over, so
# the plot stays correctly labelled even if some class is never predicted.
scaled_conf_mat = pd.DataFrame(
    preprocessing.StandardScaler().fit_transform(conf_mat),
    index=conf_mat.index,
    columns=conf_mat.columns,
)

In [None]:
# Scaled heat map: bright cells mark the most frequent prediction within each column.
# It is now clear that the most easily classified classes are 8 and up. Class 1 is also fairly
# easy to classify, but a lot of items from classes 0 to 6 are falsely classified as class 7.
# Class 7 itself is mostly classified correctly as well.
scale_log_map = px.imshow(img=scaled_conf_mat)

In [None]:
# Display the scaled heat map of the logistic regression confusion matrix
scale_log_map

In [None]:
# Table of the class labels by encoded number: the position of a label in the encoder's
# fitted categories is its code, so no hard-coded class count is needed
pd.DataFrame(gen_trans.categories_[0])

In [None]:
# Fraction predicted correctly for every actual class (diagonal / column total).
# Rows (predicted) are aligned with the columns (actual) first: crosstab leaves out any class
# that was never predicted, which would make the matrix non-square and shift the diagonal.
conf_mat_aligned = conf_mat.reindex(index=conf_mat.columns, fill_value=0)
correct_class = list(np.diag(conf_mat_aligned))
actual_sums = list(conf_mat.sum(axis=0))
corr_per = pd.DataFrame(np.divide(correct_class, actual_sums)).transpose()

In [None]:
# Confusion matrix with the per-class fraction correct appended as an extra bottom row
pd.concat(objs=[conf_mat, corr_per], axis='index')

Categories 6 through 14 are the most easily classified. They correspond to:
6	Trap Metal
7	Underground Rap
8	dnb
9	hardstyle
10	psytrance
11	techhouse
12	techno
13	trance
14	trap


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# The same confusion matrix via sklearn.metrics, without axis names. The call uses sklearn's
# (y_true, y_pred) argument order and is transposed so that rows are predictions and columns
# are actual labels, matching the crosstab version.
pd.DataFrame(confusion_matrix(y_act, y_clf).T)

In [None]:
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import balanced_accuracy_score

In [None]:
# Class probabilities for the test set from the logistic regression, needed for the ROC AUC score
y_score = clf.predict_proba(X=X_test)

In [None]:
# Balanced accuracy of the logistic regression, computed for comparison
balanced_accuracy_score(y_true=y_test, y_pred=y_hat_clf)

In [None]:
# One-vs-rest ROC AUC score of the logistic regression. This is a great score, well classified
roc_auc_score(y_true=y_test, y_score=y_score, multi_class='ovr')

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA on the processed predictors, to see whether the categories could be displayed visually
# on a scatterplot
pca = (
    PCA()
    .fit(X=X_clean_tran)
)

In [None]:
# Share of the total variance explained by each principal component
pca_rat = list(
    pca.explained_variance_ratio_
)

In [None]:
import itertools

In [None]:
# Cumulative version of the explained variance ratios (running sum over the components)
cum_rat = list(itertools.accumulate(pca_rat, lambda running, ratio: running + ratio))

In [None]:
# Bar chart of the cumulative explained variance. The first couple of principal components
# are not enough to scatterplot easily distinguishable classes, because too little of the
# variability is captured in them: the first 3 components only account for 22% of it, so
# things would still be jumbled together. It would take 15 components to show the classes
# comfortably, and there is no easy way to draw a 15-dimensional graph.
px.bar(cum_rat)

This shows that 15 of the PCA components are required to explain 80% of the variance, so I won't be able to clearly visualize the distinction between classes by plotting only a few PCA components.

In [None]:
#Boosting
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Fit a gradient boosting model for comparison with the logistic regression. Boosting was
# chosen over random forest because the better model may be able to differentiate pop somehow.
# max_features='sqrt' makes the fit stochastic (a random feature subset is considered at each
# split), so random_state is fixed to keep the reported scores reproducible across runs.
boost_fit = GradientBoostingClassifier(
    max_depth=4,
    max_features='sqrt',
    n_estimators=500,
    random_state=987,
).fit(X_train, np.ravel(y_train))

In [None]:
# Class probabilities for the test set from the boosting model, needed for the ROC AUC score
boost_score = boost_fit.predict_proba(X=X_test)

In [None]:
# Predicted class codes from the boosting model, used for its confusion matrix and heat map
y_hat_boost = pd.DataFrame(data=boost_fit.predict(X=X_test))

In [None]:
# One-vs-rest ROC AUC score of the boosting model. It is even better than the logistic regression one.
roc_auc_score(y_true=y_test, y_score=boost_score, multi_class='ovr')

In [None]:
# Add the boosting predictions to the results dataframe for comparison of the models.
# A column left over from an earlier run of this cell (still named 0, or already renamed to
# 'boost') is dropped first, so re-running the cell does not create duplicate columns.
results_df = pd.concat(
    [y_hat_boost, results_df.drop(columns=[0, 'boost'], errors='ignore')],
    axis=1,
)


In [None]:
# Give the boosting prediction column (currently named 0) a descriptive name
results_df = results_df.rename(columns={0: 'boost'})

In [None]:
# Confusion matrix for the boosting model: rows are predicted classes, columns are actual classes
boost_conf_mat = pd.crosstab(
    index=results_df['boost'],
    columns=results_df['actual'],
    rownames=['Predicted'],
    colnames=['Actual'],
)

In [None]:
#Display the boosting confusion matrix (rows: predicted, columns: actual)
boost_conf_mat

In [None]:
# Fraction predicted correctly for every actual class by the boosting model
# (diagonal / column total). Rows (predicted) are aligned with the columns (actual) first:
# crosstab leaves out any class that was never predicted, which would make the matrix
# non-square and shift the diagonal.
boost_conf_mat_aligned = boost_conf_mat.reindex(index=boost_conf_mat.columns, fill_value=0)
correct_class = list(np.diag(boost_conf_mat_aligned))
actual_sums = list(boost_conf_mat.sum(axis=0))
boost_corr_per = pd.DataFrame(np.divide(correct_class, actual_sums)).transpose()

In [None]:
# Boosting confusion matrix with the per-class fraction correct appended as an extra bottom row
pd.concat(objs=[boost_conf_mat, boost_corr_per], axis='index')

In [None]:
#The fraction correct for each class from the logistic regression, shown again for comparison.
#It can be seen that the boosting scores are almost universally better. The only thing that's worse is the class 7 score.
#The class 8 score of the boosting model is almost perfect.
corr_per

In [None]:
# Heat map of the raw (unscaled) boosting confusion matrix
px.imshow(img=boost_conf_mat)

In [None]:
# Standardise every column (actual class) of the boosting confusion matrix for the heat map.
# The class labels of both axes are carried over, so the plot stays correctly labelled even
# if some class is never predicted.
scaled_boost_con = pd.DataFrame(
    preprocessing.StandardScaler().fit_transform(boost_conf_mat),
    index=boost_conf_mat.index,
    columns=boost_conf_mat.columns,
)

In [None]:
# Scaled heat map for the boosting model. As before, the bright cells should lie along the diagonal
scale_boost_map = px.imshow(img=scaled_boost_con)

In [None]:
#Display the scaled heat map of the boosting confusion matrix
scale_boost_map

In [None]:
# Scaled heat map of the logistic regression, shown again for comparison. While it is somewhat
# hard to see, on the boosting heat map the diagonal is a bit brighter and the upper and lower
# triangles are a bit darker. There is more contrast, which is indicative of the higher
# accuracy of the boosting model.
px.imshow(img=scaled_conf_mat)

In [None]:
# Save the predictions of both models together with the actual labels (all as encoded class numbers)
results_df.to_csv(path_or_buf='genre_predictions.csv')