In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the video game sales CSV and preview the first rows.
csv_path = '../input/videogamesales/vgsales.csv'
data_df = pd.read_csv(csv_path)
data_df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) to spot missing data.
data_df.info()

In [None]:
# Count missing values per column.
data_df.isna().sum()

In [None]:
# Discard rows that lack a Year or a Publisher, then re-check the null counts.
required_columns = ['Year', 'Publisher']
data_df = data_df.dropna(axis='index', how='any', subset=required_columns)
data_df.isna().sum()

# Sales vs. Year

In [None]:
# Total sales per release year for each regional market.
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
region_labels = ['North America Sales', 'Europe Sales', 'Japan Sales', 'Other Sales']
AnnualSalesMarket = data_df.groupby('Year')[region_columns].sum().reset_index()

# One line per region on a shared set of axes.
for column, label in zip(region_columns, region_labels):
    plt.plot(AnnualSalesMarket['Year'], AnnualSalesMarket[column], label=label)

plt.ylabel('Sales')
plt.xlabel('Years')
plt.title('Sales VS Years')
plt.legend()
plt.show()

In [None]:
# Build the feature frame by dropping Rank, Name, Year, JP_Sales, Other_Sales
# and the target column Global_Sales.
dropped_columns = ['Rank', 'Name', 'Year', 'JP_Sales', 'Other_Sales', 'Global_Sales']
data = data_df.drop(columns=dropped_columns)
data.head()

In [None]:
# Integer-encode each categorical column. The single encoder is refit for every
# column, so after this cell `le` only holds the Publisher mapping.
le = LabelEncoder()
for column in ('Platform', 'Genre', 'Publisher'):
    data[column] = le.fit_transform(data[column].astype('str'))
data.head()

In [None]:
# Regression target; read from data_df because Global_Sales was dropped from `data`.
y = data_df['Global_Sales']

In [None]:
# Split BEFORE scaling so the scaler's mean/variance are learned from the
# training rows only; fitting it on the full dataset would leak test-set
# statistics into training. A fixed random_state makes the 70/30 split (and
# every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.30, random_state=42
)

# Standardise features using training-set statistics, then apply the same
# transform to the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear Regression

In [None]:
# Fit the linear regression model on the training split and display it.
reg = LinearRegression().fit(X_train, y_train)
reg

In [None]:
# Predict the target for the held-out test rows with the fitted linear model.
y_pred = reg.predict(X_test)

In [None]:
# A regressor's .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so report it as R^2 instead of a percentage.
Acc_reg = reg.score(X_train, y_train)  # R^2 on the training split
acc_reg = reg.score(X_test, y_test)    # R^2 on the held-out split
print('Train R^2 : {:.4f}'.format(Acc_reg))
print('Test R^2 : {:.4f}'.format(acc_reg))

In [None]:
# Test-set error metrics for the linear model.
mse_reg = mean_squared_error(y_test, y_pred)
r2_reg = r2_score(y_test, y_pred)
print('Mean squared error: %.2f' % mse_reg)
print('Variance Score: %.2f' % r2_reg)

# SVR

In [None]:
# Fit a support vector regressor with default settings and display it.
model_svr = SVR().fit(X_train, y_train)
model_svr

In [None]:
# Predict the target for the held-out test rows with the fitted SVR.
pred_svr = model_svr.predict(X_test)

In [None]:
# A regressor's .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so report it as R^2 instead of a percentage.
Acc_svr = model_svr.score(X_train, y_train)  # R^2 on the training split
acc_svr = model_svr.score(X_test, y_test)    # R^2 on the held-out split
print('Train R^2 : {:.4f}'.format(Acc_svr))
print('Test R^2 : {:.4f}'.format(acc_svr))

In [None]:
# Test-set error metrics for the SVR model.
mse_svr = mean_squared_error(y_test, pred_svr)
r2_svr = r2_score(y_test, pred_svr)
print('Mean squared error: %.2f' % mse_svr)
print('Variance Score: %.2f' % r2_svr)

# KNeighborsRegressor

In [None]:
# Fit a k-nearest-neighbours regressor with default settings and display it.
neigh = KNeighborsRegressor().fit(X_train, y_train)
neigh

In [None]:
# Predict the target for the held-out test rows with the fitted KNN regressor.
pred_neigh = neigh.predict(X_test)

In [None]:
# A regressor's .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so report it as R^2 instead of a percentage.
Acc_neigh = neigh.score(X_train, y_train)  # R^2 on the training split
acc_neigh = neigh.score(X_test, y_test)    # R^2 on the held-out split
print('Train R^2 : {:.4f}'.format(Acc_neigh))
print('Test R^2 : {:.4f}'.format(acc_neigh))

In [None]:
# Test-set error metrics for the KNN model.
mse_neigh = mean_squared_error(y_test, pred_neigh)
r2_neigh = r2_score(y_test, pred_neigh)
print('Mean squared error: %.2f' % mse_neigh)
print('Variance Score: %.2f' % r2_neigh)

# Gradient Boosting Regressor

In [None]:
# Gradient boosting builds its trees with a random component; pin the seed so
# the fitted model and the scores derived from it are reproducible across runs.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the fitted gradient boosting model.
pred_gbr = gbr.predict(X_test)

In [None]:
# A regressor's .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so report it as R^2 instead of a percentage.
Acc_gbr = gbr.score(X_train, y_train)  # R^2 on the training split
acc_gbr = gbr.score(X_test, y_test)    # R^2 on the held-out split
print('Train R^2 : {:.4f}'.format(Acc_gbr))
print('Test R^2 : {:.4f}'.format(acc_gbr))

In [None]:
# Test-set error metrics for the gradient boosting model.
mse_gbr = mean_squared_error(y_test, pred_gbr)
r2_gbr = r2_score(y_test, pred_gbr)
print('Mean squared error: %.2f' % mse_gbr)
print('Variance Score: %.2f' % r2_gbr)

# Decision Tree Regressor

In [None]:
# Decision trees permute features randomly when searching for splits, so ties
# can resolve differently between runs; pin the seed for reproducible results.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the fitted decision tree.
pred_dtr = dtr.predict(X_test)

In [None]:
# A regressor's .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so report it as R^2 instead of a percentage.
Acc_dtr = dtr.score(X_train, y_train)  # R^2 on the training split
acc_dtr = dtr.score(X_test, y_test)    # R^2 on the held-out split
print('Train R^2 : {:.4f}'.format(Acc_dtr))
print('Test R^2 : {:.4f}'.format(acc_dtr))

In [None]:
# Test-set error metrics for the decision tree model.
mse_dtr = mean_squared_error(y_test, pred_dtr)
r2_dtr = r2_score(y_test, pred_dtr)
print('Mean squared error: %.2f' % mse_dtr)
print('Variance Score: %.2f' % r2_dtr)

# Random Forest Regressor

In [None]:
# Random forests bootstrap rows and sample features randomly; pin the seed so
# the fitted forest and the scores derived from it are reproducible across runs.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the fitted random forest.
pred_rfr = rfr.predict(X_test)

In [None]:
# A regressor's .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so report it as R^2 instead of a percentage.
Acc_rfr = rfr.score(X_train, y_train)  # R^2 on the training split
acc_rfr = rfr.score(X_test, y_test)    # R^2 on the held-out split
print('Train R^2 : {:.4f}'.format(Acc_rfr))
print('Test R^2 : {:.4f}'.format(acc_rfr))

In [None]:
# Test-set error metrics for the random forest model.
mse_rfr = mean_squared_error(y_test, pred_rfr)
r2_rfr = r2_score(y_test, pred_rfr)
print('Mean squared error: %.2f' % mse_rfr)
print('Variance Score: %.2f' % r2_rfr)

# Final Report

In [None]:
# Gather each model's test-set score into one comparison table.
model_names = [
    'Linear Regression',
    'SVR',
    'KNeighborsRegressor',
    'Gradient Boosting Regressor',
    'Decision Tree Regressor',
    'Random Forest Regressor',
]
test_scores = [acc_reg, acc_svr, acc_neigh, acc_gbr, acc_dtr, acc_rfr]
output = pd.DataFrame({"Model": model_names, "Accuracy": test_scores})

In [None]:
# Display the comparison table of models and their test-set scores.
output

In [None]:
# Bar chart comparing the models; the 'Accuracy' column holds the value each
# estimator's .score() returned on the test split.
sns.barplot(x='Accuracy', y='Model', data=output)