# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import r2_score, mean_squared_error
import math
from sklearn.linear_model import BayesianRidge, LinearRegression

# Load the raw sales table from the Kaggle input directory
data = pd.read_csv('/kaggle/input/videogames-sales-dataset/Video_Games_Sales_as_at_22_Dec_2016.csv')
# Remove the columns considered unnecessary for sale prediction
data.drop(columns=['Year_of_Release', 'Developer', 'Publisher', 'Platform'], inplace=True)

# Target: column 6 of the trimmed frame, sliced so it stays 2-D (n_samples, 1)
y = data.iloc[:, 6:7].values

# Features: every remaining column except the target (index 6) and the
# game-name column (index 0), which the prediction model does not need
X = np.delete(data.values, [0, 6], axis=1)

# Column positions inside X after the two removals above
mean_filled_cols = [5, 6, 7, 8]    # NaN -> column mean
constant_filled_cols = [0, 9]      # NaN -> 'NA', then one-hot encoded

# Missing values: the mean for the numeric-looking columns, the literal
# string 'NA' for the columns that are one-hot encoded below.
# NOTE(review): both imputers (and the encoder) are fitted on the full data
# set before any train/test split, so test rows contribute to the imputed
# means - confirm this is acceptable for the experiment.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_const = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='NA')

X[:, mean_filled_cols] = imp_mean.fit_transform(X[:, mean_filled_cols])
X[:, constant_filled_cols] = imp_const.fit_transform(X[:, constant_filled_cols])

# One-hot encode the two constant-filled columns; all other columns pass
# through unchanged
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), constant_filled_cols)],
    remainder='passthrough',
)
X = ct.fit_transform(X)



# Regressors under comparison: a decision tree, a linear-kernel SVR and a
# Bayesian ridge model
clf = tree.DecisionTreeRegressor(max_depth=200)
regr = svm.SVR(kernel='linear')
clf_2 = BayesianRidge(compute_score=True)

# Metric histories, one (r2, RMSE) pair of lists per model; each evaluation
# round appends one value to every list
r2_dec_list, error_dec_list = [], []
r2_svr_list, error_svr_list = [], []
r2_bay_list, error_bay_list = [], []

def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split and score it on the test split.

    Returns a ``(y_pred, r2, rmse)`` tuple: the predictions for ``X_test``,
    their R2 score against ``y_test`` and the root mean squared error.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    return y_pred, r2, rmse


# Three evaluation rounds, each on a split with a randomly drawn test fraction.
# The loop variable stays named `x` so that any code reading it after the loop
# keeps working.
for x in range(3):
    # NOTE(review): the upper bound lets the test fraction get arbitrarily
    # close to 1.0, leaving almost no training rows; train_test_split raises
    # if the training set would end up empty. Consider a tighter upper bound.
    randomize = np.random.uniform(0.05, 1.0)
    # Splitting the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=randomize, random_state=0)
    # Every estimator is trained on the same 1-D view of the target column
    y_train_flat = y_train.ravel()

    # Decision Tree Regression
    y_pred_1, r2_score_1, rmse = _fit_and_score(clf, X_train, X_test, y_train_flat, y_test)
    r2_dec_list.append(r2_score_1)
    error_dec_list.append(rmse)

    # Support Vector Regression
    y_pred_2, r2_score_2, rmse_2 = _fit_and_score(regr, X_train, X_test, y_train_flat, y_test)
    r2_svr_list.append(r2_score_2)
    error_svr_list.append(rmse_2)

    # Bayesian Ridge Regressor
    y_pred_3, r2_score_3, rmse_3 = _fit_and_score(clf_2, X_train, X_test, y_train_flat, y_test)
    r2_bay_list.append(r2_score_3)
    error_bay_list.append(rmse_3)



# Decision Tree
# Scatter the predicted values against the actual ones. y_test and y_pred_1
# hold whatever the evaluation loop assigned last, so only the final split
# is drawn.
fig, ax = plt.subplots()
ax.set_title('Comparison of predicted and actual values in Decision Tree Regression', y=1.01, size=16)
ax.scatter(y_test, y_pred_1)
# Dashed diagonal: where a point lies when prediction equals measurement
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, 'k--', lw=2)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_xlim([0, 45])
ax.set_ylim([0, 45])
fig.text(.01, .00008, 'Figure 1: Visual representation of comparison between predicted and actual sales when using '
                      'Decision Tree Regression')
plt.show()

# Average scores. The iteration count comes from the number of recorded
# scores rather than from a loop variable left over after the loop.
dec_runs = len(r2_dec_list)
r2_dec_avg = np.average(r2_dec_list)
print(f'Average r2 score for Decision Tree Regression over {dec_runs} iterations: ', r2_dec_avg)
error_dec_avg = np.average(error_dec_list)
print(f'Average Root Mean Square Error score for Decision Tree Regression over {dec_runs} iterations: ', error_dec_avg)


# SVR
# Scatter the predicted values against the actual ones. y_test and y_pred_2
# hold whatever the evaluation loop assigned last, so only the final split
# is drawn.
fig2, ax = plt.subplots()
ax.set_title('Comparison of predicted and actual values in Support Vector Regression', y=1.01, size=16)
ax.scatter(y_test, y_pred_2)
# Dashed diagonal: where a point lies when prediction equals measurement
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, 'k--', lw=2)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_xlim([0, 45])
ax.set_ylim([0, 45])
fig2.text(.01, .00008, 'Figure 2: Visual representation of comparison between predicted and actual sales when using '
                       'Support Vector Regression')
plt.show()

# Average scores. The iteration count comes from the number of recorded
# scores rather than from a loop variable left over after the loop.
svr_runs = len(r2_svr_list)
r2_svr_avg = np.average(r2_svr_list)
print(f'Average r2 score for Support Vector Regression over {svr_runs} iterations: ', r2_svr_avg)
error_svr_avg = np.average(error_svr_list)
print(f'Average Root Mean Square Error score for Support Vector Regression over {svr_runs} iterations: ', error_svr_avg)

# Bayesian Ridge
# Scatter the predicted values against the actual ones. y_test and y_pred_3
# hold whatever the evaluation loop assigned last, so only the final split
# is drawn.
fig3, ax = plt.subplots()
ax.set_title('Comparison of predicted and actual values in Bayesian Ridge Regression', y=1.01, size=16)
ax.scatter(y_test, y_pred_3)
# Dashed diagonal: where a point lies when prediction equals measurement
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, 'k--', lw=2)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_xlim([0, 45])
ax.set_ylim([0, 45])
fig3.text(.01, .00008, 'Figure 3: Visual representation of comparison between predicted and actual sales when using '
                       'Bayesian Ridge Regression')
plt.show()

# Average scores. The iteration count comes from the number of recorded
# scores rather than from a loop variable left over after the loop.
bay_runs = len(r2_bay_list)
r2_bay_avg = np.average(r2_bay_list)
print(f'Average r2 score for Bayesian Ridge Regression over {bay_runs} iterations: ', r2_bay_avg)
error_bay_avg = np.average(error_bay_list)
print(f'Average Root Mean Square Error score for Bayesian Ridge Regression over {bay_runs} iterations: ', error_bay_avg)