# Introduction

Hello everyone! This is my data analysis with the video game sales dataset. I am actually very excited to explore more about this dataset because I am myself a video game fan. So let's load the data and take a quick look!

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw sales table; every later cell reads from `full_data`.
full_data = pd.read_csv(filepath_or_buffer='/kaggle/input/videogamesales/vgsales.csv')

In [None]:
# Preview the first five rows.
full_data.head(n=5)

In [None]:
# Show the distinct column labels, then how many there are.
column_names = full_data.columns.unique()
print(column_names)
len(column_names)

In [None]:
# Print dtypes and non-null counts for every column.
full_data.info()

In [None]:
# Columns that contain at least one missing value, and how many such columns exist.
cols_with_missing = full_data.columns[full_data.isna().any()].unique()
print(cols_with_missing)
len(cols_with_missing)

OK, so we see that this dataset contains 11 columns of information.


**The columns are:**

* Rank - Ranking of overall sales, integer

* Name - The game's name, object

* Platform - Platform of the game's release (e.g. PC, PS4, etc.), object

* Year - Year of the game's release, float

* Genre - Genre of the game ,object

* Publisher - Publisher of the game, object

* NA_Sales - Sales in North America (in millions), float

* EU_Sales - Sales in Europe (in millions), float

* JP_Sales - Sales in Japan (in millions), float

* Other_Sales - Sales in the rest of the world (in millions), float

* Global_Sales - Total worldwide sales, float


We also see that two of the columns contain missing values. Let's take a quick look at these two columns.

**Year**

In [None]:
# Number of rows whose Year is missing.
full_data['Year'].isnull().sum()

In [None]:
# Percentage of ALL rows whose Year is missing.
# The mean of the boolean mask divides by the total row count; the previous
# denominator, `count()`, skips NaN and therefore gave missing / non-missing.
full_data['Year'].isna().mean() * 100

Since only about 1.6% of the rows are missing a Year, I think it is OK to fill in the missing values. So let's take a deeper look at the data and how we can impute them.

In [None]:
# Peek at a few of the rows that have no Year.
full_data[full_data['Year'].isnull()].head(5)

In [None]:
# Distribution of release years.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Years")
sns.distplot(a=full_data['Year'], ax=ax)
plt.show()

# Share (in %) of each year among the rows that do have a Year.
year_counts = full_data['Year'].value_counts()
rows_with_year = full_data['Year'].notna().sum()
year_counts / rows_with_year * 100

Since no single year is overwhelmingly common, I will not fill the missing values with the median or something similar. Instead, let's see if we can use a classifier to predict the missing values. First, let's see which columns are most correlated with Year:

In [None]:
# Correlation of every numeric column with Year.
# Restricting to numeric dtypes first keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer silently skips string columns.
year_corr = full_data.select_dtypes(include="number").corr()["Year"]

# Drop Year's self-correlation by label (not by position) and rank the rest
# by absolute strength.
year_corr.drop("Year").abs().sort_values(ascending=False)

So Rank and JP_Sales are the two numerical variables that are correlated with Year. Let's now see if there are categorical variables that are correlated:

**Platform**

In [None]:
# One point per game: release year against platform.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Platform vs. Year")
sns.scatterplot(x=full_data['Platform'], y=full_data['Year'], ax=ax)
plt.show()

The platform seems to be an indication of the year because there are platforms that didn't release games during the 1980s. So I will include this feature as an input to the model.

**Genre**

In [None]:
# One point per game: release year against genre.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Genre vs. Year")
sns.scatterplot(x=full_data['Genre'], y=full_data['Year'], ax=ax)
plt.show()

I cannot see a trend here. It seems like each year there are games with every genre created.

**Publisher**

In [None]:
# One point per game: release year against publisher.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Publisher vs. Year")
sns.scatterplot(x=full_data['Publisher'], y=full_data['Year'], ax=ax)
plt.show()

Although it is not very clear, we can see that certain publishers never released games during the 1980s. So Publisher can perhaps also be a useful variable to add to our model.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Predictors used to impute Year: the two numeric columns most correlated with
# Year, plus the two categorical columns that looked informative in the plots.
year_num_features = ['Rank', 'JP_Sales']
year_cat_features = ['Publisher', 'Platform']
year_features = year_num_features + year_cat_features

# Numeric columns: strategy="constant" with no fill_value (scikit-learn then
# fills numeric gaps with 0).
num_transformer = SimpleImputer(strategy="constant")

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen at fit time are ignored at predict time.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[("num", num_transformer, year_num_features),
                                               ("cat", cat_transformer, year_cat_features)])

# Only the target has to be present. Rows that are merely missing Publisher are
# kept, because the categorical pipeline imputes them and the final imputation
# model is fitted on every row with a known Year. A plain dropna() would
# discard those rows and was also evaluated twice.
year_labelled = full_data.dropna(subset=['Year'])
year_X = year_labelled[year_features]
year_Y = year_labelled['Year']

# Hold out a third of the labelled rows to compare candidate classifiers.
year_X_train, year_X_test, year_y_train, year_y_test = train_test_split(year_X, year_Y, test_size=0.33, random_state=42)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers for predicting Year, all with default hyper-parameters.
base_models = [
    ("Ada_model", AdaBoostClassifier(random_state=42)),
    ("RF_model", RandomForestClassifier(random_state=42)),
    ("KN_model", KNeighborsClassifier()),
]

# NOTE(review): the K-fold splitter is created but never used in this cell;
# evaluation below relies on the single hold-out split.
kfolds = 4 # 4 = 75% train, 25% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Fit each candidate behind the shared preprocessor and report hold-out accuracy.
for label, estimator in base_models:
    model_steps = Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])
    model_steps.fit(year_X_train, year_y_train)
    model_preds = model_steps.predict(year_X_test)
    holdout_accuracy = accuracy_score(year_y_test, model_preds)
    print(f"{label} accuracy: {holdout_accuracy}")

Since Random Forest has the best performance, let's fill missing values using the year it predicts

In [None]:
# Rows with a known Year train the final model; rows without one are the ones to fill.
year_known = full_data['Year'].notnull()

final_year_X_train = full_data.loc[year_known, year_features]
final_year_y_train = full_data.loc[year_known, ['Year']]

final_year_X_test = full_data.loc[~year_known, year_features]

In [None]:
# Refit the random forest on every row with a known Year, then predict the missing ones.
year_forest = RandomForestClassifier(random_state=42)
model_steps = Pipeline(steps=[('preprocessor', preprocessor), ('model', year_forest)])

# The target is a one-column frame; flatten it to the 1-D array the classifier expects.
model_steps.fit(final_year_X_train, final_year_y_train.values.ravel())

final_model_preds = model_steps.predict(final_year_X_test)

In [None]:
# Write each prediction back to the exact row it was predicted for.
# Addressing rows through final_year_X_test's index (instead of recomputing the
# NaN mask from the column being overwritten) keeps predictions aligned with
# their rows and makes the cell safe to re-run: a second run rewrites the same
# values rather than failing on an empty mask.
full_data.loc[final_year_X_test.index, 'Year'] = final_model_preds

In [None]:
# Count the Year gaps that remain after imputation.
full_data['Year'].isnull().sum()

Now we have successfully filled the missing values in Year using predictions from a Random Forest Classifier.

**Publisher**

In [None]:
# Number of rows whose Publisher is missing.
full_data['Publisher'].isnull().sum()

In [None]:
# Distinct publisher values (NaN counts as one of them): how many, then the values.
publisher_values = full_data['Publisher'].unique()
print(len(publisher_values))
print(publisher_values)

In [None]:
# Show up to twenty rows that have no Publisher.
full_data[full_data['Publisher'].isnull()].head(20)

I don't want to drop data, so here I will just give them a Publisher value of 'Unknown'

In [None]:
# Label missing publishers as 'Unknown' rather than dropping the rows.
full_data['Publisher'] = full_data['Publisher'].fillna('Unknown')

In [None]:
# Count the Publisher gaps that remain after filling.
full_data['Publisher'].isnull().sum()

# Univariate Data Analysis

Now that we have taken a quick look at the data, let's examine the distributions of the individual variables related to video game sales. (Rank and Name obviously don't need to be examined individually.)

**Platform**

In [None]:
# Games per platform, most common first.
platform_counts = full_data['Platform'].value_counts()

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Platforms")
sns.countplot(x=full_data['Platform'], order=platform_counts.index, ax=ax)
plt.show()

# Same counts as a percentage of all rows.
print(platform_counts / len(full_data) * 100)

So we see that the most popular platforms are DS, PS2, PS3, PC, etc. This matches our expectations.

**Year**

In [None]:
# Games per release year, most common first.
games_per_year = full_data['Year'].value_counts()

fig, ax = plt.subplots(figsize=(35, 10))
ax.set_title("Year")
sns.countplot(x=full_data['Year'], order=games_per_year.index, ax=ax)
plt.show()

# Same counts as a percentage of all rows.
print(games_per_year / len(full_data) * 100)

We see that most games were released around 2010, which I think is partly due to the time this dataset was collected and partly because that time period was when devices like computer or game station really gained popularity.

**Genre**

In [None]:
# Games per genre, most common first.
genre_counts = full_data['Genre'].value_counts()

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Genre")
sns.countplot(x=full_data['Genre'], order=genre_counts.index, ax=ax)
plt.show()

# Same counts as a percentage of all rows.
print(genre_counts / len(full_data) * 100)

This shows that the most popular genres are action and sports. Again, this kinda follows our expectations. Action games are usually the most popular kind.

**Publisher**

Since there are too many publishers, let's just focus on the top 10 most popular publishers.

In [None]:
# The ten publishers with the most titles (value_counts sorts by count, descending).
top_ten = full_data['Publisher'].value_counts().iloc[:10]

In [None]:
# Title counts for the ten most prolific publishers only.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Publisher")
sns.countplot(x=full_data['Publisher'], order=top_ten.index, ax=ax)
plt.show()

# Their share of all rows, in percent.
total_rows = full_data.shape[0]
print(top_ten / total_rows * 100)

So basically the companies that sell the most amount of games are like EA, Activision, NBG, Ubisoft, and others. These companies are indeed the most famous ones out there.

**Sales in North America**

In [None]:
# Histogram of North American sales, followed by summary statistics.
na_sales = full_data['NA_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("North America Sales")
sns.distplot(a=na_sales, kde=False, ax=ax)
plt.show()

print(na_sales.describe())

In [None]:
# Box plot of North American sales, followed by summary statistics.
na_sales = full_data['NA_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("North America Sales")
sns.boxplot(x=na_sales, ax=ax)
plt.show()

print(na_sales.describe())

OK, so we see that sales in North America are highly right-skewed. Most games will not even reach a million sales in NA, but there is one game that sold 41.49 million copies! I wonder what game that is. Let's check it out.

In [None]:
# Row(s) with the highest North American sales.
# The maximum is taken from the column rather than compared against a literal
# copied from describe(), so the lookup still works if the data changes.
full_data.loc[full_data['NA_Sales'] == full_data['NA_Sales'].max()]

Wii Sports it is. Also, this is actually the #1 game in the rank! No wonder why the game is this popular in North America.

**Sales in Europe**

In [None]:
# Histogram of European sales, followed by summary statistics.
eu_sales = full_data['EU_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Europe Sales")
sns.distplot(a=eu_sales, kde=False, ax=ax)
plt.show()

print(eu_sales.describe())

In [None]:
# Box plot of European sales, followed by summary statistics.
eu_sales = full_data['EU_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Europe Sales")
sns.boxplot(x=eu_sales, ax=ax)
plt.show()

print(eu_sales.describe())

Again, the distribution is highly right skewed. This again indicates that the video game market is somewhat an oligopoly. Most companies will only have a small share in the market, but there are a few that will sell exponentially more. Also, let's check which game has the most sales in Europe. 

In [None]:
# Row(s) with the highest European sales.
# The maximum is taken from the column rather than compared against a literal
# copied from describe(), so the lookup still works if the data changes.
full_data.loc[full_data['EU_Sales'] == full_data['EU_Sales'].max()]

Well, it is again Wii Sports.

**Sales in Japan**

In [None]:
# Histogram of Japanese sales, followed by summary statistics.
jp_sales = full_data['JP_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Japan Sales")
sns.distplot(a=jp_sales, kde=False, ax=ax)
plt.show()

print(jp_sales.describe())

In [None]:
# Box plot of Japanese sales, followed by summary statistics.
jp_sales = full_data['JP_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Japan Sales")
sns.boxplot(x=jp_sales, ax=ax)
plt.show()

print(jp_sales.describe())

Still right-skewed, and the game that has the most sale is:

In [None]:
# Row(s) with the highest Japanese sales.
# The maximum is taken from the column rather than compared against a literal
# copied from describe(), so the lookup still works if the data changes.
full_data.loc[full_data['JP_Sales'] == full_data['JP_Sales'].max()]

Wow! This time it is not Wii Sports! It is actually Pokemon Red/Blue! This is a surprise to me. I would have thought that since Wii Sports is the number 1 game and is from Japan, it would be number 1 in Japan too. I guess this is the beauty of data analysis.

**Sales in the rest of the world**

In [None]:
# Histogram of rest-of-world sales, followed by summary statistics.
other_sales = full_data['Other_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Other Sales")
sns.distplot(a=other_sales, kde=False, ax=ax)
plt.show()

print(other_sales.describe())

In [None]:
# Box plot of rest-of-world sales, followed by summary statistics.
other_sales = full_data['Other_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Other Sales")
sns.boxplot(x=other_sales, ax=ax)
plt.show()

print(other_sales.describe())

In [None]:
# Row(s) with the highest rest-of-world sales.
# The maximum is taken from the column rather than compared against a literal
# copied from describe(), so the lookup still works if the data changes.
full_data.loc[full_data['Other_Sales'] == full_data['Other_Sales'].max()]

OK, now for the other parts of the world, GTA San Andreas is actually the best-selling game. Again, an interesting thing to learn.

**Global Sales**

In [None]:
# Histogram of global sales, followed by summary statistics.
global_sales = full_data['Global_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Global Sales")
sns.distplot(a=global_sales, kde=False, ax=ax)
plt.show()

print(global_sales.describe())

In [None]:
# Box plot of global sales, followed by summary statistics.
global_sales = full_data['Global_Sales']

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Global Sales")
sns.boxplot(x=global_sales, ax=ax)
plt.show()

print(global_sales.describe())

In [None]:
# Row(s) with the highest global sales.
# The maximum is taken from the column rather than compared against a literal
# copied from describe(), so the lookup still works if the data changes.
full_data.loc[full_data['Global_Sales'] == full_data['Global_Sales'].max()]

Now the global sales are shown. Again, they are visualized, and the game with the most sales is Wii Sports, which is indeed ranked #1 in our dataset.

# Define the Question

Now we have looked at the data, I begin to be curious about what factors will affect the global sales of a game. Obviously when predicting about this, we cannot use any of the region sale information or the rank as parameters for our model because that is data leakage. Therefore, we are left to only the Platform, Year, Genre, and Publisher columns. So I say let's now work on making a prediction of the global sales based on these four columns and see how accurate the prediction is.

# Other Parameters vs. Global Sales

Now since we are curious in how other parameters may affect the final global sales, let's actually visualize the associations between global sales and other parameters we are interested in. 

**Year vs. Global Sales**

In [None]:
# One point per game: global sales against release year.
fig, ax = plt.subplots(figsize=(35, 10))
ax.set_title("Year")
sns.scatterplot(x=full_data['Year'], y=full_data['Global_Sales'], ax=ax)
plt.show()

There isn't really an association between the year and the global sales of games. Most games make only minimal global sales regardless of the year. So global sales don't really depend on time.

**Platform vs. Global Sales**

In [None]:
# Bar plot of global sales per platform.
fig, ax = plt.subplots(figsize=(35, 10))
ax.set_title("Platform")
sns.barplot(x=full_data['Platform'], y=full_data['Global_Sales'], ax=ax)
plt.show()

# Median global sales for each platform, in order of first appearance.
for platform_name in full_data['Platform'].unique():
    on_platform = full_data['Platform'] == platform_name
    median_sales = full_data.loc[on_platform, 'Global_Sales'].median()
    print(platform_name, " ", median_sales)

So now there seems to be a slight trend. Games on platforms like Wii, NES, and GB do seem to have higher global sales compared to games on platforms like SAT. It is very likely that since devices like the Wii are more popular, people are more likely to spend money on games for these platforms.

**Genre vs. Global Sales**

In [None]:
# Bar plot of global sales per genre.
fig, ax = plt.subplots(figsize=(35, 10))
ax.set_title("Genre")
sns.barplot(x=full_data['Genre'], y=full_data['Global_Sales'], ax=ax)
plt.show()

# Median global sales for each genre, in order of first appearance.
for genre_name in full_data['Genre'].unique():
    in_genre = full_data['Genre'] == genre_name
    median_sales = full_data.loc[in_genre, 'Global_Sales'].median()
    print(genre_name, " ", median_sales)

Again, small association between the genre of the game and the global sales. However, still not a significant association.

**Publisher vs. Global Sales**

In [None]:
# Total global sales per publisher, keeping the 20 largest as a two-column frame
# (Publisher, Global_Sales) ready for plotting.
sale_pbl = (
    full_data
    .groupby('Publisher')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)

In [None]:
# Total global sales of the 20 best-selling publishers.
plt.figure(figsize=(15, 10))
# Give the figure a title like every other plot in this notebook.
plt.title("Top 20 Publishers by Total Global Sales")
sns.barplot(x='Publisher', y='Global_Sales', data=sale_pbl)
# Publisher names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
# Render explicitly so the cell does not echo the return value of xticks().
plt.show()

The publisher also seems to have an effect on the global sales. Essentially, companies like Nintendo or EA are more likely to make games that have high sales. 

In conclusion, among these four parameters, it seems like the Year doesn't have a huge impact on the global sale, but the other three parameters all have an impact to certain extent. 

# Model

In [None]:
# Predictors for Global_Sales. No numeric predictors are used (the list is
# empty), so only the three categorical columns feed the model.
num_features = []
cat_features = ['Platform', 'Genre', 'Publisher']
features = num_features + cat_features

X = full_data[features]
y = full_data["Global_Sales"]

# Numeric branch (currently given no columns).
num_transformer = SimpleImputer(strategy="constant")

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode, ignoring categories that were not seen during fitting.
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=cat_steps)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

# Default hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline regressors, all with default hyper-parameters.
base_models = [
    ("DT_model", DecisionTreeRegressor(random_state=42)),
    ("RF_model", RandomForestRegressor(random_state=42,n_jobs=-1)),
    ("GB_model", GradientBoostingRegressor(random_state=42)),
    ("Ada_model", AdaBoostRegressor(random_state=42)),
    ("KNN_model", KNeighborsRegressor(n_jobs=-1)),
]

# NOTE(review): the K-fold splitter is created but never used in this cell;
# evaluation below relies on the single hold-out split.
kfolds = 4 # 4 = 75% train, 25% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Fit each baseline behind the shared preprocessor and report hold-out MSE.
for label, estimator in base_models:
    model_steps = Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])
    model_steps.fit(X_train, y_train)
    model_preds = model_steps.predict(X_test)
    holdout_mse = mean_squared_error(y_test, model_preds)
    print(f"{label} mean squared error result: {holdout_mse}")

So these are the performances of baseline models. From the results we can see Random Forest is again the best baseline model.

In [None]:
# Hyper-parameter search space for the random forest regressor.
# max_features=1.0 (use every feature) replaces the string 'auto', which meant
# the same thing for regressors but is rejected by current scikit-learn.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 8, None],
    'max_features': [1.0, 'sqrt', 'log2']
}

In [None]:
# Final model: a random forest with the settings that performed best.
# max_features=1.0 considers every feature at each split. That is what 'auto'
# meant for regressors, but the string is rejected by current scikit-learn,
# while the float is accepted by old and new versions alike.
final_RF_model = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=None, max_features=1.0)

model_steps = Pipeline(steps=[('preprocessor', preprocessor), ('model', final_RF_model)])

model_steps.fit(X_train, y_train)

# Score on the held-out split.
model_preds = model_steps.predict(X_test)

print("Mean Squared Error: ", mean_squared_error(y_test, model_preds))

After tuning the parameters, I found that the default parameters actually have the lowest error. So the final performance of our model is a mean squared error of 3.17 when predicting global sales for a video game.