In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Examine The Data

In [None]:
# Read the video game sales CSV into `df` and preview the first rows.
DATA_PATH = "../input/videogamesales/vgsales.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Our data has eleven columns and 16598 rows. 

In [None]:
# Print a per-column summary of the frame (dtypes and non-null counts).
df.info()

In [None]:
# Count the missing values in each column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
summary_stats = df.describe()
summary_stats.transpose()

# Data Visualization

In [None]:
# Pairwise plots of the numeric columns; kind="reg" overlays a fitted regression line on each panel.
sns.pairplot(df, kind="reg")

In [None]:
# Bar chart of NA_Sales per Year (seaborn aggregates the rows of each year into one bar).
# The former `df.Year.astype("object")` statement was removed: its result was never
# assigned, so it changed neither `df` nor the plot. seaborn already treats the
# x variable of a bar plot as categorical.
plt.figure(figsize=(18,6))
sns.barplot(x = "Year", y = "NA_Sales", data=df)
# Rotate the year labels so they do not overlap; the trailing ';' hides the repr.
plt.xticks(rotation=90);

In [None]:
# Bar chart of EU_Sales per Year on a wide figure, with vertical tick labels.
fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(data=df, x="Year", y="EU_Sales", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90);

In [None]:
# Bar chart of JP_Sales per Year on a wide figure, with vertical tick labels.
fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(data=df, x="Year", y="JP_Sales", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90);

In [None]:
# Bar chart of Global_Sales per Year on a wide figure, with vertical tick labels.
fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(data=df, x="Year", y="Global_Sales", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90);

In [None]:
# Bar chart of Global_Sales per Genre, with vertical tick labels.
genre_ax = sns.barplot(data=df, x="Genre", y="Global_Sales")
plt.setp(genre_ax.get_xticklabels(), rotation=90);

## Dropping Unnecessary Features

The features **rank** and **year** will not be used, so we can drop them.

In [None]:
# Remove the Rank and Year columns from `df` in place.
# NOTE: the drop mutates `df`, so re-running this cell without reloading the data
# raises a KeyError because the columns no longer exist.
df.drop(["Rank","Year"], axis = 1, inplace=True)

First, we pass the names of the columns to be dropped to the `drop` function. With the **axis = 1** parameter we state that the drop is made on a column basis. With the **inplace = True** parameter we ensure that the change is applied to the data set permanently. 

## Categorical Values

The dataset has 4 categorical features: name, platform, genre, and publisher. 

In [None]:
# Frequency of each distinct game name.
df["Name"].value_counts()

In [None]:
# Compute the per-platform frequencies once, print how many distinct platforms
# there are, then display the frequencies themselves.
platform_counts = df["Platform"].value_counts()
print(len(platform_counts))
platform_counts

In [None]:
# Frequency of each genre.
df["Genre"].value_counts()

In [None]:
# Print the number of distinct publishers, then display their frequencies.
# Only the last expression of a cell is displayed automatically, so the count
# has to be printed explicitly (as the Platform cell does); a bare `len(...)`
# on the first line is silently discarded.
publisher_counts = df.Publisher.value_counts()
print(len(publisher_counts))
publisher_counts

### Dropping Categorical Values

Of these categorical features we will only use **platform** and **genre**, so we will drop **name** and **publisher**.

In [None]:
# Remove the two categorical columns that are not used as features (in place).
unused_categoricals = ["Name", "Publisher"]
df.drop(columns=unused_categoricals, inplace=True)

After this process, if we look at the data, the name and publisher columns are no longer present.

In [None]:
# Preview the first rows of `df` after the Name and Publisher columns were dropped.
df.head()

### Encoding Categorical Values

We will use pandas's `get_dummies` function for this. This operation creates a separate column for each value of the variable and encodes it as 0 or 1.

In [None]:
# One-hot encode the two remaining categorical columns into indicator columns.
categorical_cols = ["Platform", "Genre"]
dms = pd.get_dummies(df[categorical_cols])
dms

After this process, a dummy column exists for every value of the platform and genre features. One column in each group is redundant, because its value can be inferred from the others, so we drop one column for each of them.

In [None]:
# Drop one indicator column per categorical feature; these act as the reference levels.
reference_levels = ["Platform_2600", "Genre_Fighting"]
dms.drop(columns=reference_levels, inplace=True)

Then we concat df and dms.

In [None]:
# Append the indicator columns to the original frame side by side (column-wise).
frames = [df, dms]
df_2 = pd.concat(frames, axis="columns")
df_2.head()

After this we no longer need the platform and genre categorical columns, so we can drop them.

In [None]:
# The original string columns are now represented by the indicator columns; remove them in place.
encoded_sources = ["Platform", "Genre"]
df_2.drop(columns=encoded_sources, inplace=True)

In [None]:
# Preview `df_2` after the original Platform and Genre columns were dropped.
df_2.head()

# Machine Learning Model

In [None]:
# Separate the target from the features; both are taken from the encoded frame.
# NOTE(review): X still contains the regional *_Sales columns, which presumably add up
# to Global_Sales — confirm whether this target leakage is intended.
target_col = "Global_Sales"
y = df_2[target_col]
X = df_2.drop(columns=target_col)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Fit a random forest on the training split.
# The forest is seeded (same seed as the train/test split) so that the score and the
# feature importances computed below are identical on every Restart & Run All.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

In [None]:
# R^2 of the fitted forest on the held-out test split.
test_predictions = rf.predict(X_test)
r2_score(y_test, test_predictions)

In [None]:
# Express each feature's importance in the fitted forest as a percentage,
# indexed by the training column names.
importance = pd.DataFrame({"Importance": rf.feature_importances_*100},
                         index=X_train.columns)

# Plot the ten MOST important features. After an ascending sort the largest values
# are at the end, so take the tail; the original `.iloc[:10, :10]` took the head and
# therefore showed the ten least important features. barh draws the last row at the
# top, so the most important feature ends up as the top bar.
top_features = importance.sort_values(by="Importance", ascending=True).tail(10)
top_features.plot(kind="barh", figsize=(14,6));

In [None]:
# Evaluate a forest trained on 3 principal components instead of the raw features.
# A fresh estimator is used on purpose: make_pipeline does not clone its steps, so
# passing the already fitted `rf` would refit that same object on the PCA output and
# silently invalidate the model (and feature importances) trained on the full feature set.
# Both steps are seeded so the printed metrics are repeatable; PCA only uses the seed
# when it selects a randomized solver.
pca_forest = RandomForestRegressor(random_state=42)
pipe = make_pipeline(PCA(n_components=3, random_state=42), pca_forest).fit(X_train, y_train)

# Predict once and reuse the result for both metrics (R^2, then RMSE).
pipe_predictions = pipe.predict(X_test)
print(r2_score(y_test, pipe_predictions))
print(np.sqrt(mean_squared_error(y_test, pipe_predictions)))

## Model Tuning

In [None]:
# Seed the forest so the cross-validated search selects the same parameters on every run.
rf = RandomForestRegressor(random_state=42)
# 3 x 3 x 3 = 27 parameter combinations.
rf_params = {"n_estimators":[100,200,500],
            "max_depth":[10,15,20],
            "min_samples_split":[2,5,10]}
# 10-fold cross-validation for each combination, run in parallel on all cores.
grid = GridSearchCV(rf, rf_params, cv=10, n_jobs=-1, verbose=2)
# The search runs on 3 principal components. make_pipeline keeps a reference to `grid`
# (it does not clone it), so the fitted search results stay available on `grid` afterwards.
# PCA is seeded as well; the seed only matters when a randomized solver is selected.
pipe = make_pipeline(PCA(n_components=3, random_state=42),grid)
pipe.fit(X_train,y_train)

In [None]:
# Score the tuned pipeline on the held-out split. The prediction is computed once and
# reused for both metrics instead of running the pipeline twice.
tuned_predictions = pipe.predict(X_test)
print(r2_score(y_test, tuned_predictions))                      # R^2
print(np.sqrt(mean_squared_error(y_test, tuned_predictions)))   # RMSE
# Best parameter combination found by the grid search (displayed as the cell output).
grid.best_params_