# PS4 games rating predictions
## *Comparing 8 regression algorithms*

![video games](https://i.imgur.com/9U3sXgz.png)

# Table of contents

[<h3>1. Data Description</h3>](#1)

[<h3>2. Data Preprocessing</h3>](#2)

[<h3>3. Model comparison</h3>](#3)

[<h3>4. Prediction metrics of the best model using the test set</h3>](#4)

[<h3>5. Visualization of the result</h3>](#5)

## Context
This dataset includes all PlayStation 4 games released up to the present.
I used the TrueTrophies website to create this dataset.

## Content
There is 1 dataset:
games_data.csv: contains an up-to-date list of PlayStation 4 (PS4) games, with each game's name and some details such as its score and rating.

This dataset includes information on 1584 games.



# Load the libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from time import perf_counter
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

def printmd(string):
    """Render *string* as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

import warnings
warnings.filterwarnings(action='ignore')

# 1. Data Description<a class="anchor" id="1"></a>

In [None]:
# Load the PS4 games table, using the first CSV column as the index
data_path = '../input/ps4-games/games_data.csv'
df = pd.read_csv(data_path, index_col=0)
df.head()

In [None]:
# Show the distribution of the target column ('rating') in three ways,
# side by side on one figure
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Histogram. No `by=` argument is passed: the data is already the single
# 'rating' Series, so there is no other column to group it by.
df['rating'].plot.hist(ax=axes[0], color='#ff8c8e')
axes[0].set_title("Rating's histogram", fontsize=15)

# Box plot
df['rating'].plot.box(ax=axes[1])
axes[1].set_title("Rating's Boxplot", fontsize=15)

# Violin plot
sns.violinplot(ax=axes[2], y='rating', data=df, color='#ff8c8e')
axes[2].set_title("Rating's distribution (violinplot)", fontsize=15)

plt.show()

In [None]:
# Report the number of rows of the loaded table as a Markdown heading
printmd(f'### Number of rows in the dataset: {df.shape[0]}')

# 2. Data Preprocessing<a class="anchor" id="2"></a>

In [None]:
def preprocessing(df):
    """Split the raw games table into a feature matrix and the target.

    The input frame is left untouched. The columns 'game' and 'url' are
    removed, the rows are shuffled with a fixed seed and the index is
    reset to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'game', 'url' and 'rating'.

    Returns
    -------
    X : pandas.DataFrame
        Every remaining column except 'rating'.
    y : pandas.Series
        The 'rating' column, row-aligned with ``X``.

    Raises
    ------
    KeyError
        If 'game', 'url' or 'rating' is missing from ``df``.
    """
    # drop() and sample() both return new objects, so the caller's frame is
    # never modified and no explicit copy is needed.
    shuffled = (
        df.drop(columns=['game', 'url'])
        .sample(frac=1.0, random_state=0)
        .reset_index(drop=True)
    )

    X = shuffled.drop(columns='rating')
    y = shuffled['rating']
    return X, y

# Build the feature matrix and the target vector
X, y = preprocessing(df)

# Hold out 20% of the rows as the test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Standardise the features: the scaler is fitted on the training split
# only and then re-used, unchanged, on the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Peek at the first two scaled rows and their ratings
X_train[:2], y_train[:2]

# 3. Model comparison<a class="anchor" id="3"></a>

In [None]:
# The eight regressors to compare, each created with its default settings.
# The loop below adds the cross-validation results to every entry under the
# keys 'mean_RMSE' and 'Training time (sec)'.
models = {
    "LinearRegression": {"model": LinearRegression()},
    "Lasso": {"model": Lasso()},
    "Ridge": {"model": Ridge()},
    "DecisionTreeRegressor": {"model": DecisionTreeRegressor()},
    "RandomForestRegressor": {"model": RandomForestRegressor()},
    "MLPRegressor": {"model": MLPRegressor()},
    "GradientBoostingRegressor": {"model": GradientBoostingRegressor()},
    "AdaBoostRegressor": {"model": AdaBoostRegressor()},
}

# Run k-fold cross-validation on the training set for every model and record
# the mean validation RMSE and the mean fit time per fold
k = 10
for name, m in models.items():
    model = m['model']
    # The score is the negated MSE, hence the sign flip below
    result = cross_validate(model, X_train, y_train, cv=k,
                            scoring='neg_mean_squared_error')

    # RMSE of each fold (square root of that fold's MSE), averaged over folds
    fold_RMSE = np.sqrt(-result['test_score'])
    mean_RMSE = round(fold_RMSE.mean(), 4)
    mean_fit_time = round(result['fit_time'].mean(), 4)

    # Store the results next to the model they belong to
    m['mean_RMSE'] = mean_RMSE
    m['Training time (sec)'] = mean_fit_time

    # Display the result
    print(f"{name:27} mean RMSE for {k}-fold CV: {mean_RMSE} - mean training time {mean_fit_time} sec")

In [None]:
# Collect one [name, RMSE, fit time] row per model ...
models_result = [
    [name, v['mean_RMSE'], v['Training time (sec)']]
    for name, v in models.items()
]

# ... and turn the rows into a table ordered from lowest to highest RMSE
df_results = (
    pd.DataFrame(models_result,
                 columns=['model', 'RMSE', 'Training time (sec)'])
    .sort_values(by='RMSE', ascending=True)
    .reset_index(drop=True)
)
df_results

In [None]:
# Bar chart of the cross-validated mean RMSE, one bar per model
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='RMSE')
plt.xlabel('Model', fontsize=15)
plt.ylabel('RMSE', fontsize=15)
plt.xticks(rotation=90, fontsize=12)
plt.title(f'{k}-fold mean RMSE for each Model\nSmaller is better', fontsize=15)
plt.show()

In [None]:
# Bar chart of the mean fit time per fold, one bar per model
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='Training time (sec)')
plt.xlabel('Model', fontsize=15)
plt.ylabel('Training time (sec)', fontsize=15)
plt.xticks(rotation=90, fontsize=12)
plt.title('Training time for each Model in sec\nSmaller is better', fontsize=15)
plt.show()

# 4. Prediction metrics of the best model using the test set<a class="anchor" id="4"></a>

In [None]:
# Pick the model with the lowest cross-validated RMSE
# (df_results is sorted by RMSE in ascending order, so it is the first row)
best_model = df_results.iloc[0]
# Access the row by column label: positional integer keys on a
# string-labelled Series are deprecated in recent pandas versions
best_name = best_model['model']

# Refit that model on the whole training set
model = models[best_name]['model']
model.fit(X_train, y_train)

# Predict the ratings of the test set
pred = model.predict(X_test)

# Root mean squared error on the test set.
# NOTE: the variable keeps its existing name `MSRE`; the metric is the RMSE.
MSRE = mean_squared_error(y_test, pred)**0.5
MSRE = round(MSRE, 2)

# Display the results
printmd(f'### Best Model: {best_name} with an RMSE of {MSRE} on the test set')
printmd(f"### Trained in: {best_model['Training time (sec)']} sec")

# 5. Visualization of the result<a class="anchor" id="5"></a>

In [None]:
# Put the true test-set ratings next to the predicted ones.
# Both Series get a fresh 0..n-1 index so they line up row by row.
y_test_s = y_test.reset_index(drop=True)
pred_s = pd.Series(pred)

df_result = pd.DataFrame({
    'Real Rating': y_test_s,
    'Predicted Rating': pred_s,
})
df_result.head(5)

In [None]:
# Compare the spread of the real and the predicted ratings
df_result.plot(kind='box')
plt.title('Boxplot Real Rating VS Predicted Rating', fontsize=15)
plt.show()

# Plot each prediction against the real rating it belongs to
df_result.plot(kind='scatter', x='Real Rating', y='Predicted Rating')
plt.title('Scatterplot Real Rating VS Predicted Rating', fontsize=15)
plt.show()