In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder,StandardScaler
from seaborn import countplot
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#Models to be used for experimentation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# Seed NumPy's global random number generator so repeated runs start from the same state.
# NOTE(review): estimators created without an explicit random_state presumably draw
# from this global generator -- confirm against the scikit-learn documentation.
from numpy.random import seed
seed(1)

# 1. Load the data file

In [None]:
# Read the training CSV, then list its columns, its rating labels and the
# columns that will be used as features.
raw_df = pd.read_csv(
    '../input/video-games-rating-by-esrb/Video_games_esrb_rating.csv',
    sep=',',
)

columns = raw_df.columns.values
print('Column names: ', columns)
print('-----\n')

esrb_rating_labels = raw_df['esrb_rating'].unique()
print('Rating Labels: ', esrb_rating_labels)
print('-----\n')

# Features are every column except the first and the last one.
descriptors = columns[1:-1]
print('Independent Variables: ', descriptors)

In [None]:
# Feature matrix: the descriptor columns as a plain NumPy array.
X = raw_df.loc[:, descriptors].to_numpy()

# Encode the rating labels as integer class ids. `le` stays in scope so the
# same mapping can be applied to other label arrays later.
le = LabelEncoder()
le.fit(raw_df['esrb_rating'].to_numpy())
y = le.transform(raw_df['esrb_rating'].to_numpy())
y

# 2. Let's look at the correlations between the variables

In [None]:
# Draw the descriptor-to-descriptor correlation matrix as a colour grid on a 10x10 figure.
f = plt.figure(figsize=(10,10))
# fignum=f.number draws into the figure created above rather than a new one.
plt.matshow(raw_df[descriptors].corr(),fignum=f.number)
cb = plt.colorbar()
plt.title('Correlation Matrix')

In [None]:
# Same correlation matrix, shown as a table shaded with the 'coolwarm' colour map.
print('We don\'t see much correlation between the variables other than themselves.' )
raw_df[descriptors].corr().style.background_gradient(cmap='coolwarm')

# 2. Let's look at the rating class distribution


In [None]:
# Bar chart of how many rows carry each ESRB rating.
print('It seems that the classes are almost balanced hence we won\'t try to up or down sample the classes')
# Pass the series by keyword: recent seaborn releases accept only `data`
# positionally, so a bare positional series is no longer read as `x`.
countplot(x=raw_df['esrb_rating'])

# 3. Now let's see whether the rating classes separate into clusters when PCA keeps only the fewest components

In [None]:
# Project the descriptor matrix onto two principal components.
pca = PCA(n_components=2)
pca_X = pca.fit_transform(X)

In [None]:
# Scatter the two PCA coordinates, coloured by the encoded rating class in `y`.
plt.scatter(pca_X[:,0],pca_X[:,1],c=y)
plt.title('PCA with n_components=2')
plt.xlabel('PCA-1')
plt.ylabel('PCA-2')
print('NOTE:- We don\'t see much separation between the classes of ratings when we try to use least number of components. Hence PCA is not suitable')

# 4. Earlier, PCA with n_components=2 did not reveal any clusters. Let's see why

In [None]:
# Fit one PCA per component count and record how much variance each one explains.
# The upper bound is capped by the data shape so PCA is never asked for more
# components than it can produce.
max_components = min(31, min(X.shape))
n_components_range = list(range(2, max_components + 1))

exp_var_ratio = []
components_list = []
for k in n_components_range:
    pca_k = PCA(n_components=k)
    pca_k.fit(X)
    exp_var_ratio.append(pca_k.explained_variance_ratio_)
    components_list.append(pca_k.components_)

# Total explained-variance ratio for each component count.
exp_var_per = [np.sum(v) for v in exp_var_ratio]

# Plot against the real component counts. Plotting the bare list would put
# 0, 1, 2, ... on an axis labelled n_components, although the first entry
# belongs to n_components=2.
plt.plot(n_components_range, exp_var_per)
plt.title('Data distribution across number of components')
plt.xlabel('n_components')
plt.ylabel('Data ratio')
print('This suggest us that we have to use almost all the variables instead of few. About 90% of data is in 23-25 components')

In [None]:
"""
#Splitting the data into train and test sets
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

#Standardize the data first
sc = StandardScaler()
std_x_train = sc.fit_transform(x_train)
std_x_test = sc.transform(x_test)
"""

# 5. Let's experiment with several classifiers and see which one might do best

In [None]:
# Candidate classifiers, each built with its default arguments, keyed by a short tag.
models = dict(
    LR=LogisticRegression(),
    SVM=SVC(),
    RF=RandomForestClassifier(),
    ADA=AdaBoostClassifier(),
    GBM=GradientBoostingClassifier(),
    DT=DecisionTreeClassifier(),
    KNN=KNeighborsClassifier(),
    QDA=QuadraticDiscriminantAnalysis(),
    NB=GaussianNB(),
    GDA=LinearDiscriminantAnalysis(),
)

In [None]:
def run_experiment(model_list,X,y):
    """Cross-validate every classifier in ``model_list`` and collect accuracy scores.

    Each classifier is wrapped in a pipeline with its own ``StandardScaler`` so
    the scaler is fitted on the training part of every fold only, instead of on
    the full data set before splitting.

    Parameters
    ----------
    model_list : dict
        Maps a short model name to an unfitted classifier.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.

    Returns
    -------
    model_names : list
        Model names in evaluation order.
    mean_score_list : list of tuple
        ``(name, 'mean acc: <value>', 'std acc: <value>')`` per model.
    cv_results : list
        Per-fold accuracy scores of each model, in the same order.
    """
    # Imported here so the function works without touching the import cell.
    from sklearn.pipeline import make_pipeline

    folds = KFold(n_splits=10, random_state=1, shuffle=True)
    print(folds)

    model_names = []
    cv_results = []
    mean_score_list = []
    # Iterate over the argument, not the notebook-level `models` variable:
    # that global is rebound to a plain list further down the notebook.
    for name, estimator in model_list.items():
        scaled_model = make_pipeline(StandardScaler(), estimator)
        scores = cross_val_score(scaled_model, X, y, scoring='accuracy', cv=folds, n_jobs=-1)
        mean_acc = np.mean(scores)
        std_acc = np.std(scores)
        mean_score_list.append((name, 'mean acc: ' + str(mean_acc), 'std acc: ' + str(std_acc)))
        print('model name: %s -- mean accuracy: %0.3f || std accuracy: %.3f ' % (name, mean_acc, std_acc))
        cv_results.append(scores)
        model_names.append(name)
    return model_names, mean_score_list, cv_results

In [None]:
# Cross-validate every candidate classifier on X, y and keep the per-model scores.
model_names, mean_score_list, cv_results = run_experiment(models,X,y)

In [None]:
def boxplot_comparison(model_names, cv_results):
    """Show one box per model summarising its cross-validation scores.

    Parameters
    ----------
    model_names : sequence
        Tick labels, in the same order as ``cv_results``.
    cv_results : sequence
        One collection of scores per model.
    """
    figure, axes = plt.subplots()
    figure.suptitle('BoxPlot Model Comparison')
    axes.boxplot(cv_results)
    axes.set_xticklabels(model_names)
    plt.show()
    
# Plot the per-fold accuracies collected by run_experiment, one box per model.
boxplot_comparison(model_names,cv_results)
# NOTE(review): the ranking below is fixed text, not derived from cv_results --
# re-check it against the plot after any re-run.
print('BEST MODELS:\n-----\n\tLR, SVM, RF, GBM, DT\n\nWORST MODELS:\n-----\n\tADA, KNN, QDA, GaussianNB, GDA')

# 6. Optimize best algorithms for better evaluation

In [None]:
# Short names of the models kept for hyper-parameter tuning.
# NOTE: this rebinds `models` from the dict of all candidates to a list of
# names; cells above that expect the dict must not be re-run after this one.
models = ['LR', 'SVM', 'RF', 'GBM', 'DT']

# Unfitted estimators, in the same order as `models`.
clfs = [
    LogisticRegression(),
    SVC(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
]

# Search grid per model name. The logistic-regression grid is split in two
# because the 'lbfgs' solver does not support the 'l1' penalty: a single
# crossed grid would schedule invalid (l1, lbfgs) fits that can only fail.
param_space = {
    models[0]: [
        {'penalty': ['l2'], 'C': [0.001, 0.01], 'solver': ['lbfgs', 'saga'], 'max_iter': [100, 200]},
        {'penalty': ['l1'], 'C': [0.001, 0.01], 'solver': ['saga'], 'max_iter': [100, 200]},
    ],
    models[1]: {'C': [0.001, 0.003, 1, 10], 'kernel': ['rbf', 'linear']},
    models[2]: {'n_estimators': [100, 150], 'criterion': ['gini', 'entropy']},
    models[3]: {'learning_rate': [0.1, 0.01], 'n_estimators': [100, 150]},
    models[4]: {'criterion': ['gini', 'entropy']},
}

In [None]:
def grid_optimization(models,clfs,param_space,X,y):
    """Grid-search each estimator with 5-fold CV and print its best setting.

    Every estimator is placed behind a ``StandardScaler`` inside a pipeline so
    the scaler is fitted on the training part of each fold only.

    Parameters
    ----------
    models : sequence
        Model names, used as keys into ``param_space``.
    clfs : sequence
        Unfitted estimators, in the same order as ``models``.
    param_space : dict
        Maps a model name to a parameter grid (a dict, or a list of dicts)
        written with the estimator's own parameter names.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.

    Returns
    -------
    dict
        Maps each model name to its fitted ``GridSearchCV`` object.
    """
    # Imported here so the function works without touching the import cell.
    from sklearn.pipeline import Pipeline

    step_prefix = 'clf__'
    searches = {}
    for name, estimator in zip(models, clfs):
        print(name)

        # Accept a single grid or a list of grids, and address every parameter
        # to the classifier step of the pipeline.
        raw_grid = param_space[name]
        grids = [raw_grid] if isinstance(raw_grid, dict) else list(raw_grid)
        prefixed_grids = [
            {step_prefix + key: values for key, values in grid.items()}
            for grid in grids
        ]

        pipeline = Pipeline([('scaler', StandardScaler()), ('clf', estimator)])
        # refit takes a boolean; the former string 'True' only worked by being truthy.
        search = GridSearchCV(pipeline, prefixed_grids, scoring='accuracy', refit=True, cv=5)
        search.fit(X, y)

        # Report parameters under the estimator's own names, without the step prefix.
        best_params = {key[len(step_prefix):]: value for key, value in search.best_params_.items()}
        print('best params: ', best_params)
        print('best scores: %.3f' % search.best_score_)
        print('-----\n')
        searches[name] = search
    return searches

In [None]:
# Tune the shortlisted models on X, y; the best setting of each is printed by the call.
grid_optimization(models,clfs,param_space,X,y)
# NOTE(review): the conclusion below is fixed text, not derived from the search
# results -- re-check it after any re-run.
print('Best models after optimisation are:\n-----\n\tSVM, GBM, RF')

# 7. Final evaluation on Test Data

In [None]:
# Read the held-out test CSV and list its columns and rating labels.
test_raw_df = pd.read_csv('../input/video-games-rating-by-esrb/test_esrb.csv',delimiter=',')
columns = test_raw_df.columns.values
print('Column names: ',columns)
print('-----\n')

esrb_rating_labels = test_raw_df.esrb_rating.unique()
print('Rating Labels: ',esrb_rating_labels)
print('-----\n')

# Keep the feature list derived from the training file instead of re-slicing
# the test file by position: selecting by the training names guarantees the
# test features line up with the columns the models were trained on.
missing_descriptors = [col for col in descriptors if col not in test_raw_df.columns]
if missing_descriptors:
    raise ValueError('Test file is missing training feature columns: %s' % missing_descriptors)
print('Independent Variables: ',descriptors)

In [None]:
# Build the test feature matrix and encode the test labels.
X_test = test_raw_df[descriptors].to_numpy()
y_test = test_raw_df['esrb_rating']
# Reuse the already-fitted `le` (transform only, no re-fit) so the class ids
# match the ones produced for the training labels.
y_test_en = le.transform(y_test)
print('Test X shape: ',X_test.shape)
print('Test y shape: ',y_test_en.shape)

In [None]:
# Declare the scaler and the three models used for the final evaluation.
# NOTE(review): the hyper-parameters are hard-coded; presumably they were copied
# from the grid-search output -- confirm after re-running the search.
sc = StandardScaler()
SVM_opt = SVC(C=1,kernel='rbf')
GBM_opt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=150) 
RF_opt = RandomForestClassifier(criterion='entropy', n_estimators=100)

In [None]:
# Scaling: fit the scaler on the training features only, then apply it to the test features.
X_train = sc.fit_transform(X)
y_train = y

# Transform straight from the raw test frame rather than from `X_test` itself:
# `X_test = sc.transform(X_test)` would scale already-scaled data whenever this
# cell is executed a second time.
X_test = sc.transform(test_raw_df[descriptors].to_numpy())
y_test = y_test_en

In [None]:
# Fit each final model on the scaled training data, then predict the scaled test set.
SVM_model = SVM_opt.fit(X_train,y_train)
SVM_preds =SVM_model.predict(X_test)

GBM_model = GBM_opt.fit(X_train,y_train)
GBM_preds = GBM_model.predict(X_test)

RF_model = RF_opt.fit(X_train,y_train)
RF_preds = RF_model.predict(X_test)

In [None]:
# Report accuracy, macro precision, macro recall and macro F1 on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# reversed, the figure printed as precision is really recall and vice versa.
test_predictions = [('SVM', SVM_preds), ('GBM', GBM_preds), ('RF', RF_preds)]
test_metrics = [
    ('accuracy', accuracy_score, {}),
    ('precision', precision_score, {'average': 'macro'}),
    ('recall', recall_score, {'average': 'macro'}),
    ('f1', f1_score, {'average': 'macro'}),
]

for position, (metric_name, metric_fn, metric_kwargs) in enumerate(test_metrics):
    # Separator line between metric groups, but not before the first one.
    if position:
        print('-----')
    for model_name, model_preds in test_predictions:
        print('%s %s score: ' % (model_name, metric_name), metric_fn(y_test, model_preds, **metric_kwargs))

# Conclusion: Random Forest is the best model

In [None]:
import seaborn as sns

#Finally let's see where model is getting confused through a ConfusionMatrix
# Class names in encoder order; encoded id i corresponds to labels[i].
labels = le.classes_

# Rows are the actual classes, columns the predicted ones. Pinning `labels`
# keeps one row/column per known class even if a class never shows up in the
# test labels or the predictions, so the tick labels below always line up.
rf_cm = confusion_matrix(y_test, RF_preds, labels=np.arange(len(labels)))

# Turn each row into percentages of that actual class. A class with no test
# samples keeps an all-zero row instead of triggering a division by zero.
row_totals = rf_cm.sum(axis=1)[:, np.newaxis]
rf_cm = rf_cm.astype('float') / np.maximum(row_totals, 1)
rf_cm = rf_cm * 100

sns.heatmap(rf_cm, annot=True, xticklabels=labels, yticklabels=labels)
# Heatmap columns run along x (predicted) and rows along y (actual).
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Random Forest Classifier')
plt.show()