# Predict the ESRB rating of video games
## *Compare 13 algorithms*
  
![video games](https://i.imgur.com/4TMUrfq.png)

# Table of contents

[<h3>1. Data Description</h3>](#1)

[<h3>2. Data Preprocessing</h3>](#2)

[<h3>3. Model comparison</h3>](#3)

[<h3>4. Prediction metrics of the best model</h3>](#4)

## Load the libraries:

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression,PassiveAggressiveClassifier,RidgeClassifier,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import LinearSVC, SVC,NuSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from time import perf_counter
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import Markdown, display

def printmd(string):
    """Render *string* as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

import warnings
# Install a global "ignore" filter with no message/category/module restriction:
# from this point on every warning raised by any library is suppressed.
# NOTE(review): this also hides deprecation and convergence warnings —
# presumably intended to keep the notebook output short; confirm.
warnings.filterwarnings(action='ignore')

# 1. Data Description<a class="anchor" id="1"></a>

- The data covers 1895 games, each described by 34 ESRB rating content features, with the game's name and console also given as features.

- A single data point is represented as a binary value 0-1 for Console and a binary vector for the features of ESRB content.

- The RP, EC, and A ratings are not provided in the current version of the data; they might be included in future updates.

## File descriptions:
- Video_games_esrb_rating.csv - the training set

- test_esrb.csv - the test set

## Load the datasets and visualize it
Let's have a look at some of the columns: each row holds the title of the game, its content features such as *Blood* or *Mature_Humor*, and its rating in *esrb_rating*.

**ESRB rating description:**


In [None]:
# Load the training and test CSV files into DataFrames
train_df = pd.read_csv(
    '../input/video-games-rating-by-esrb/Video_games_esrb_rating.csv'
)
test_df = pd.read_csv(
    '../input/video-games-rating-by-esrb/test_esrb.csv'
)

# Preview the first three rows: the first six columns next to the last column
head_rows = train_df.iloc[:3]
pd.concat([head_rows.iloc[:, :6], head_rows.iloc[:, -1]], axis=1)

In [None]:
# Bar chart of how many training rows carry each ESRB rating
rating_counts = train_df['esrb_rating'].value_counts()
rating_counts.plot.bar(figsize=(10, 5),
                       color=['#6b6b6b', '#b8b8b8', 'grey', 'r'])
plt.title('Repartition of the ESRB ratings\nin the training set', fontsize=15)
# Horizontal tick labels, same font size on both axes
plt.xticks(rotation=0, fontsize=14)
plt.yticks(fontsize=14)
plt.show()

# 2. Data Preprocessing<a class="anchor" id="2"></a>

In [None]:
def preprocessing(df):
    """Split a raw ESRB DataFrame into a feature matrix and a target vector.

    The ``title`` column is dropped, the rows are shuffled with a fixed seed
    (``random_state=0``) so the order is reproducible, and the index is
    renumbered from 0. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column and an ``esrb_rating`` column; every
        other column is treated as a feature.

    Returns
    -------
    X : pandas.DataFrame
        All columns except ``title`` and ``esrb_rating``.
    y : pandas.Series
        The ``esrb_rating`` column, row-aligned with ``X``.

    Raises
    ------
    KeyError
        If ``title`` or ``esrb_rating`` is missing from ``df``.
    """
    # drop() and sample() each return a new object, so the caller's frame is
    # never mutated and no explicit copy or DataFrame re-wrap is required.
    shuffled = (
        df.drop(columns='title')
        .sample(frac=1.0, random_state=0)
        .reset_index(drop=True)
    )

    X = shuffled.drop(columns='esrb_rating')
    y = shuffled['esrb_rating']

    return X, y

# Split both datasets into features and target first
X_train, y_train = preprocessing(train_df)
X_test, y_test = preprocessing(test_df)

# Standardize the features: the scaler is fitted on the training features only,
# and the same fitted scaler is then applied to the test features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 3. Model comparison<a class="anchor" id="3"></a>

In [None]:
# Estimator classes to compare; each one is instantiated with its default
# hyper-parameters and registered under its own class name
classifiers = (
    PassiveAggressiveClassifier,
    RidgeClassifier,
    SGDClassifier,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    ExtraTreeClassifier,
    LinearSVC,
    SVC,
    NuSVC,
    MLPClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
models = {cls.__name__: {"model": cls()} for cls in classifiers}

# Fit every model, time the fit, score it on the test set and store both
# figures next to the estimator
for name, entry in models.items():
    clf = entry['model']

    start = perf_counter()
    clf.fit(X_train, y_train)
    duration = round(perf_counter() - start, 3)

    test_acc = clf.score(X_test, y_test)
    entry.update({'test_acc': test_acc, 'Training time (sec)': duration})
    print(f"{name:27} trained - test set acc: {test_acc*100:.2f}% - trained in {duration} sec")

In [None]:
# Collect one [name, accuracy, training time] row per model
models_result = [
    [name, v['test_acc'], v['Training time (sec)']]
    for name, v in models.items()
]

# Build the results table, highest test accuracy first, with a fresh index
df_results = (
    pd.DataFrame(models_result,
                 columns=['model', 'test_acc', 'Training time (sec)'])
    .sort_values(by='test_acc', ascending=False)
    .reset_index(drop=True)
)
df_results

In [None]:
# Bar chart: test-set accuracy of every model in df_results
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='test_acc')
plt.xlabel('Model', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
# Accuracy is a fraction, so pin the y-axis to the full 0-1 range
plt.ylim(0, 1)
# Model names are long: rotate them so they do not overlap
plt.xticks(rotation=90, fontsize=12)
plt.title('Accuracy on the test set\nComparison of the different models', fontsize=15)
plt.show()

In [None]:
# Bar chart: measured fit time (seconds) of every model in df_results
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='Training time (sec)')
plt.xlabel('Model', fontsize=15)
plt.ylabel('Training time (sec)', fontsize=15)
# Model names are long: rotate them so they do not overlap
plt.xticks(rotation=90, fontsize=12)
plt.title('Training time for each model in sec', fontsize=15)
plt.show()

# 4. Prediction metrics of the best model<a class="anchor" id="4"></a>

In [None]:
from sklearn.metrics import confusion_matrix

# First row of df_results, i.e. the model ranked first in that table.
best_model = df_results.iloc[0]

# Read the fields by column label: positional access such as best_model[0] on a
# string-labelled Series is deprecated in pandas 2.x and removed in pandas 3.
best_name = best_model['model']
best_acc = best_model['test_acc']
best_time = best_model['Training time (sec)']

pred = models[best_name]['model'].predict(X_test)

# Format the percentage inside the f-string (same ':.2f' format as the training
# log) instead of round(x, 2) * 100, which can print artefacts such as
# 56.99999999999999.
printmd(f'## Best Model: {best_name} with {best_acc * 100:.2f}% accuracy on the test set')
printmd(f'## Trained in: {best_time} sec')

# Display a confusion matrix
# Use one explicit label list for both the matrix and the tick labels. It covers
# every class present in y_test or in the predictions, so rows/columns and
# ticks always line up even if a predicted class is absent from y_test.
labels = sorted(set(y_test) | set(pred))
cf_matrix = confusion_matrix(y_test, pred, labels=labels, normalize='true')
plt.figure(figsize = (7,5))
sns.heatmap(cf_matrix, annot=True, xticklabels = labels, yticklabels = labels, cbar=False)
plt.title('Normalized Confusion Matrix', fontsize = 23)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()