## Data Preview 

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the video game sales CSV into the working frame and preview it.
csv_path = '../input/videogamesales/vgsales.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
df.info()

In [None]:
# Descriptive summary statistics for the frame's columns.
df.describe()

In [None]:
# !pip install pandas-profiling
# The import is needed for its side effect: it is what makes
# `DataFrame.profile_report` (used below) available.
# NOTE(review): the package has been renamed upstream (ydata-profiling) --
# confirm this import still resolves in the target environment.
import pandas_profiling

# Build the automated profiling report for the whole frame.
df.profile_report()

## Data Manipulation

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Fill missing years with the column mean, then cast the column to integers
# (the cast truncates the fractional part of the imputed mean).
year_mean = df['Year'].mean()
df['Year'] = df['Year'].fillna(year_mean).astype('int')

In [None]:
# Number of rows per publisher, most frequent first.
df['Publisher'].value_counts()

In [None]:
# Fill missing publishers with the most frequent publisher (first mode),
# then re-check the per-column missing counts.
most_common_publisher = df['Publisher'].mode().iloc[0]
df['Publisher'] = df['Publisher'].fillna(most_common_publisher)
df.isna().sum()

## Data Visualization for EDA

In [None]:
# Total global sales per year, ordered from the largest total down.
year_sales = (
    df.groupby(['Year'])['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Global_Sales')
sns.barplot(x=year_sales.index, y=year_sales.values, ax=ax)

In [None]:
# Total NA_Sales per year, ordered from the largest total down.
year_na_sales = (
    df.groupby(['Year'])['NA_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('NA_Sales')
sns.barplot(x=year_na_sales.index, y=year_na_sales.values, ax=ax)

In [None]:
# Total EU_Sales per year, ordered from the largest total down.
year_eu_sales = (
    df.groupby(['Year'])['EU_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('EU_Sales')
sns.barplot(x=year_eu_sales.index, y=year_eu_sales.values, ax=ax)

In [None]:
# Total JP_Sales per year, ordered from the largest total down.
year_jp_sales = (
    df.groupby(['Year'])['JP_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('JP_Sales')
sns.barplot(x=year_jp_sales.index, y=year_jp_sales.values, ax=ax)

In [None]:
# Total Other_Sales per year, ordered from the largest total down.
year_other_sales = (
    df.groupby(['Year'])['Other_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Other_Sales')
sns.barplot(x=year_other_sales.index, y=year_other_sales.values, ax=ax)

In [None]:
# Total global sales per genre, ordered from the largest total down.
genre_sales = (
    df.groupby(['Genre'])['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Global_Sales')
sns.barplot(x=genre_sales.index, y=genre_sales.values, ax=ax)

In [None]:
# Total NA_Sales per genre, ordered from the largest total down.
genre_na_sales = (
    df.groupby(['Genre'])['NA_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('NA_Sales')
sns.barplot(x=genre_na_sales.index, y=genre_na_sales.values, ax=ax)

In [None]:
# Total EU_Sales per genre, ordered from the largest total down.
genre_eu_sales = (
    df.groupby(['Genre'])['EU_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('EU_Sales')
sns.barplot(x=genre_eu_sales.index, y=genre_eu_sales.values, ax=ax)

In [None]:
# Total JP_Sales per genre, ordered from the largest total down.
genre_jp_sales = (
    df.groupby(['Genre'])['JP_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('JP_Sales')
sns.barplot(x=genre_jp_sales.index, y=genre_jp_sales.values, ax=ax)

In [None]:
# Total Other_Sales per genre, ordered from the largest total down.
genre_other_sales = (
    df.groupby(['Genre'])['Other_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Other_Sales')
sns.barplot(x=genre_other_sales.index, y=genre_other_sales.values, ax=ax)

In [None]:
# Total global sales per platform, ordered from the largest total down.
platform_sales = (
    df.groupby(['Platform'])['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Global_Sales')
sns.barplot(x=platform_sales.index, y=platform_sales.values, ax=ax)

In [None]:
# Total NA_Sales per platform, ordered from the largest total down.
platform_na_sales = (
    df.groupby(['Platform'])['NA_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('NA_Sales')
sns.barplot(x=platform_na_sales.index, y=platform_na_sales.values, ax=ax)

In [None]:
# Total EU_Sales per platform, ordered from the largest total down.
platform_eu_sales = (
    df.groupby(['Platform'])['EU_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('EU_Sales')
sns.barplot(x=platform_eu_sales.index, y=platform_eu_sales.values, ax=ax)

In [None]:
# Total JP_Sales per platform, ordered from the largest total down.
platform_jp_sales = (
    df.groupby(['Platform'])['JP_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('JP_Sales')
sns.barplot(x=platform_jp_sales.index, y=platform_jp_sales.values, ax=ax)

In [None]:
# Total Other_Sales per platform, ordered from the largest total down.
platform_other_sales = (
    df.groupby(['Platform'])['Other_Sales']
    .sum()
    .sort_values(ascending=False)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Other_Sales')
sns.barplot(x=platform_other_sales.index, y=platform_other_sales.values, ax=ax)

In [None]:
# Ten publishers with the highest total global sales, largest first.
global_publisher = (
    df.groupby(['Publisher'])['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Global_Sales')
sns.barplot(x=global_publisher.index, y=global_publisher.values, ax=ax)


In [None]:
# Ten publishers with the highest total NA_Sales, largest first.
na_publisher = (
    df.groupby(['Publisher'])['NA_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('NA_Sales')
sns.barplot(x=na_publisher.index, y=na_publisher.values, ax=ax)


In [None]:
# Ten publishers with the highest total EU_Sales, largest first.
eu_publisher = (
    df.groupby(['Publisher'])['EU_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('EU_Sales')
sns.barplot(x=eu_publisher.index, y=eu_publisher.values, ax=ax)

In [None]:
# Ten publishers with the highest total JP_Sales, largest first.
jp_publisher = (
    df.groupby(['Publisher'])['JP_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('JP_Sales')
sns.barplot(x=jp_publisher.index, y=jp_publisher.values, ax=ax)

In [None]:
# Ten publishers with the highest total Other_Sales, largest first.
other_publisher = (
    df.groupby(['Publisher'])['Other_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# 22x10 figure; the bar heights are passed as a plain array, so the y-axis
# label is set explicitly.
fig, ax = plt.subplots(figsize=(22, 10))
ax.set_ylabel('Other_Sales')
sns.barplot(x=other_publisher.index, y=other_publisher.values, ax=ax)

In [None]:
# Annotated correlation heatmap of the numeric columns only.
# At this point the frame still holds the not-yet-encoded columns (Name,
# Platform, Genre, Publisher); DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0, so restrict the frame explicitly. Older pandas dropped
# those columns silently, so the resulting plot is unchanged there.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Display the frame as it stands after imputation and before encoding.
df

## Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced by integer codes.
feature = ['Platform', 'Genre', 'Publisher']

# Fit a separate encoder per column and keep it: re-fitting one shared
# encoder overwrites its classes_, which makes the earlier columns' codes
# impossible to map back to their labels.
encoders = {}
for col in feature:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# `le` is left bound to the encoder of the last column, as before.

# The title is dropped rather than encoded.
df = df.drop(columns='Name')

In [None]:
# Correlation matrix of the encoded frame, drawn with every cell annotated.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target.
# NOTE(review): in the Kaggle vgsales data 'Rank' is the ordering of games by
# Global_Sales, i.e. derived from the target; keeping it as a feature leaks
# the answer into the model, so it is excluded here -- confirm against the
# CSV. `errors='ignore'` keeps the cell working if the column is absent.
# NOTE(review): the regional *_Sales columns presumably sum to Global_Sales
# as well -- decide whether they should remain features.
X = df.drop(columns=['Global_Sales', 'Rank'], errors='ignore')
y = df['Global_Sales']

# 75/25 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=42
)



In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Data Modelling & Hyperparameters

 ### K-Nearest-Neighbors Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Baseline k-NN regressor with the library's default hyperparameters.
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Root-mean-squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# A regressor's score() is the coefficient of determination (R^2), not a
# classification accuracy, so it is labelled as such.
print('R^2 score:', knn.score(X_test, y_test))
print('RMSE Error:', rmse)


### K-Nearest-Neighbors Regression Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts 1..9 and two distance metrics.
# 'minkowski' is intentionally not listed: with KNeighborsRegressor's default
# p=2 it is the same distance as 'euclidean', so it only repeated one third
# of the cross-validation fits without being able to change the winner.
knn_params = {'n_neighbors': np.arange(1, 10, 1),
              'metric': ['manhattan', 'euclidean']}

knn = KNeighborsRegressor()

# Exhaustive 10-fold search; n_jobs=-1 runs the fits in parallel, matching
# the random-forest search later in the notebook.
knn_cv_model = GridSearchCV(knn, knn_params, cv=10, n_jobs=-1)

knn_cv_model.fit(X_train, y_train)

# Best combination found by the search.
print(knn_cv_model.best_params_["n_neighbors"])
print(knn_cv_model.best_params_["metric"])

 ### K-Nearest-Neighbors Regression Best Hyperparameters

In [None]:
# Rebuild the k-NN regressor from the winning grid-search settings.
best_knn = knn_cv_model.best_params_
knn_tuned = KNeighborsRegressor(
    n_neighbors=best_knn["n_neighbors"],
    metric=best_knn["metric"],
)
knn_tuned.fit(X_train, y_train)

# Held-out RMSE of the tuned model.
y_pred = knn_tuned.predict(X_test)
tuned_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(tuned_mse))

### Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Baseline decision tree with default hyperparameters and a fixed seed.
dtr = DecisionTreeRegressor(random_state=32)
dtr.fit(X_train, y_train)

y_pred = dtr.predict(X_test)

# A regressor's score() is the coefficient of determination (R^2), not a
# classification accuracy, so it is labelled as such.
print('R^2 score:', dtr.score(X_test, y_test))
print('RMSE Error:', np.sqrt(mean_squared_error(y_test, y_pred)))


### Decision Tree Regression Hyperparameters

In [None]:
# Grid over the split threshold (2..49) and the leaf budget (2..19).
dtr_params = {"min_samples_split": range(2, 50),
              "max_leaf_nodes": range(2, 20)}

# Seeded like the baseline tree so repeated runs select the same parameters.
dtr = DecisionTreeRegressor(random_state=32)

# 48 x 18 candidates with 10 folds each: run the fits in parallel, matching
# the random-forest search later in the notebook.
dtr_cv_model = GridSearchCV(dtr, dtr_params, cv=10, n_jobs=-1)

dtr_cv_model.fit(X_train, y_train)

# Best combination found by the search.
print(dtr_cv_model.best_params_["min_samples_split"])
print(dtr_cv_model.best_params_["max_leaf_nodes"])

### Decision Tree Regression Best Hyperparameters

In [None]:
# Rebuild the tree from the winning grid-search settings. The seed matches
# the baseline tree (random_state=32) so the two RMSE values are reproducible
# and differ only because of the hyperparameters.
dtr_tuned = DecisionTreeRegressor(
    min_samples_split=dtr_cv_model.best_params_["min_samples_split"],
    max_leaf_nodes=dtr_cv_model.best_params_["max_leaf_nodes"],
    random_state=32,
)

dtr_tuned.fit(X_train, y_train)

y_pred = dtr_tuned.predict(X_test)

# Held-out RMSE of the tuned model.
print(np.sqrt(mean_squared_error(y_test, y_pred)))

### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyperparameters and a fixed seed.
rf = RandomForestRegressor(random_state=10)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# A regressor's score() is the coefficient of determination (R^2), not a
# classification accuracy, so it is labelled as such.
print('R^2 score:', rf.score(X_test, y_test))
print('RMSE Error:', np.sqrt(mean_squared_error(y_test, y_pred)))


### Random Forest Regression Hyperparameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV


# Search space: tree depth 1..4, features tried per split, forest size.
rf_params = {'max_depth': list(range(1, 5)),
             'max_features': [2, 4, 8],
             'n_estimators': [100, 300, 500]}

# Seeded like the baseline forest so each candidate is fitted reproducibly.
rf = RandomForestRegressor(random_state=10)

# Only 10 of the 36 combinations are sampled; without a random_state on the
# search itself the sampled candidates (and therefore the reported best
# parameters) would change on every run.
rf_cv_model = RandomizedSearchCV(estimator=rf, param_distributions=rf_params,
                                 cv=10, n_iter=10, n_jobs=-1,
                                 random_state=10)

rf_cv_model.fit(X_train, y_train)

# Best combination found by the search.
print(rf_cv_model.best_params_["max_depth"])
print(rf_cv_model.best_params_["max_features"])
print(rf_cv_model.best_params_["n_estimators"])

### Random Forest Regression Best Hyperparameters

In [None]:
# Rebuild the forest from the winning search settings. The seed matches the
# baseline forest (random_state=10) so the two RMSE values are reproducible
# and differ only because of the hyperparameters.
rf_tuned = RandomForestRegressor(
    max_depth=rf_cv_model.best_params_["max_depth"],
    max_features=rf_cv_model.best_params_["max_features"],
    n_estimators=rf_cv_model.best_params_["n_estimators"],
    random_state=10,
)

rf_tuned.fit(X_train, y_train)

y_pred = rf_tuned.predict(X_test)

# Held-out RMSE of the tuned model.
print(np.sqrt(mean_squared_error(y_test, y_pred)))