# Video Games Sales Analysis and Predictive Modeling

![img](https://cdn.wallpapersafari.com/87/43/siqdob.jpg)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LOAD DATASET

In [None]:
# Read the video game sales CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/videogamesales/vgsales.csv')
# A bare expression on the last line of a notebook cell is rendered as the cell output.
df

## Removing duplicate and missing values.

In [None]:
# Drop rows with missing values and the "Rank" column, then keep only the
# rows whose Year is before 2017.
# `df` is rebound instead of mutated in place, and `errors="ignore"` lets the
# cell be re-run after "Rank" has already been removed.
df = df.dropna().drop(columns="Rank", errors="ignore")
# Take an explicit copy of the filtered rows so that later column assignments
# operate on an independent frame and do not raise SettingWithCopyWarning.
df = df[df["Year"] < 2017.0].copy()
df

In [None]:
# Draw missingno's matrix plot for df to check visually for remaining missing values.
missingno.matrix(df)

In [None]:
# Count the null entries in each column.
df.isnull().sum()

In [None]:
# Count the rows that repeat an earlier row exactly (they are counted here, not removed).
df.duplicated().sum()

In [None]:
# Cast the Year column to integers.
# Assigning through `df.loc[:, 'Year'] = ...` writes the values into the
# existing float column in place on recent pandas versions, which leaves the
# dtype unchanged; `astype` with a column mapping returns a new frame whose
# Year column really is an integer column.
df = df.astype({'Year': int})

In [None]:
# Print a summary of df: column names, non-null counts and dtypes.
df.info()

## Casted year column into int, furthermore we found that year ranges from 1980 to 2016.

In [None]:
# Distinct values of the Year column in ascending order.
years = np.sort(df['Year'].unique())
years

# Dataset Profile Report

In [None]:
# Build a pandas-profiling report object for df with the given title.
games_profile = ProfileReport(df, title='Video Games Sales Profiling')

In [None]:
# Display the profiling report as the cell output.
games_profile

## Gaming Platforms

In [None]:
# List the distinct values found in the Platform column.
df['Platform'].unique()

# Correlations

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# The numeric columns are selected explicitly because df also carries
# non-numeric columns (presumably text such as Platform and Publisher), and
# calling corr() on a mixed-dtype frame raises on pandas 2.0 and later.
matrix = df.select_dtypes(include="number").corr()
plt.figure(figsize=(16, 12))
g = sns.heatmap(matrix, annot=True, cmap="YlGn")

## Pairplot

In [None]:
# Plot the pairwise relationships between the columns of df in a grid of panels.
sns.pairplot(df)

## Relationship between North America Sales and Global Sales

In [None]:
# Scatter of North America sales against global sales with a fitted regression line.
plt.figure(dpi=125)
sns.regplot(data=df, x='NA_Sales', y='Global_Sales').set(
    xlabel='North America Sales',
    ylabel='Global Sales',
    title='Relationship between North America Sales and Global Sales',
)
plt.show()

## Relationship between Europe Sales and Global Sales

In [None]:
# Scatter of Europe sales against global sales with a fitted regression line.
plt.figure(dpi=125)
sns.regplot(data=df, x='EU_Sales', y='Global_Sales').set(
    xlabel='Europe Sales',
    ylabel='Global Sales',
    title='Relationship between Europe Sales and Global Sales',
)
plt.show()

## Relationship between Japan Sales and Global Sales

In [None]:
# Scatter of Japan sales against global sales with a fitted regression line.
plt.figure(dpi=125)
sns.regplot(data=df, x='JP_Sales', y='Global_Sales').set(
    xlabel='Japan Sales',
    ylabel='Global Sales',
    title='Relationship between Japan Sales and Global Sales',
)
plt.show()

## Best Publisher in Top 100 Video Games

In [None]:
# Bar chart of how many of the first 100 rows belong to each publisher.
# NOTE(review): "top 100" relies on the CSV rows being ordered by rank — confirm.
fig, ax = plt.subplots(figsize=(12, 8))
# The column is passed as `x=`: seaborn's plotting functions accept only
# `data` positionally, so a bare Series would not be used as the x variable.
sns.countplot(x=df.head(100)['Publisher'], ax=ax)
plt.xlabel('Publisher Name')
# Publisher names are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
plt.ylabel('Count')
plt.title('Best Publisher in Top 100 Video Games')
plt.show()

## Sales Distribution

In [None]:
# Histogram of each numeric column, followed by the frame's shape and column labels.
df.hist(figsize=(15, 15))
plt.show()
print(df.shape, df.columns, sep='\n')

## Sales by Year

In [None]:
# Total the numeric columns for each value of Year.
# `numeric_only=True` restricts the sum to numeric columns; without it,
# pandas 2.0 and later also "sum" (concatenate) the text columns, and the
# resulting frame can no longer be cast to integers.
df_by_year = df.groupby(by='Year').sum(numeric_only=True)
df_by_year

In [None]:
# Cast every column of the yearly totals to int (fractional parts are truncated).
df_by_year = df_by_year.astype("int")

In [None]:
# Print a summary of the yearly totals: column names, non-null counts and dtypes.
df_by_year.info()

# Sales Analysis

In [None]:
# Line chart of the yearly totals, one line per column.
# `grid` is a boolean switch, so True is passed rather than the string "on"
# (any non-empty string, including "off", would also have enabled the grid).
df_by_year.plot.line(figsize=(10, 10), grid=True)
# NOTE(review): the label says dollars; confirm the sales columns are not unit counts.
# The trailing semicolon keeps the notebook from echoing the returned Text object.
plt.ylabel("Sales in million $");

In [None]:
# Density estimate of the columns of the yearly totals, drawn on a single large axes.
fig ,ax = plt.subplots(figsize= (16,12))
sns.kdeplot(data=df_by_year, ax=ax)

In [None]:
# Move Year out of the index and back into an ordinary column.
df_by_year = df_by_year.reset_index()
df_by_year

In [None]:
# One point per year showing the global total, with the year labels turned vertical.
fig, ax = plt.subplots(figsize=(12, 8))
p = sns.stripplot(data=df_by_year, x='Year', y='Global_Sales', ax=ax)
p.set_xticklabels(labels=df_by_year['Year'], rotation=90)

In [None]:
# 3x2 grid with one density panel per sales column; the unused sixth slot is removed.
fig, axs = plt.subplots(3, 2, figsize=(16, 16))
fig.set_facecolor("white")
fig.delaxes(axs[2, 1])


def jointplot(cplot, data, region):
    """Draw a 2-D density of Year against one sales column on a grid panel.

    Args:
        cplot: Zero-based panel number; panels of the module-level ``axs``
            grid are filled row by row, two per row.
        data: Frame holding a ``'Year'`` column and the column named by
            ``region``.
        region: Name of the column plotted on the y axis; also used as the
            panel title.
    """
    row, col = divmod(cplot, 2)
    # Year is read from `data` itself rather than from the global frame, so
    # the function plots whichever frame the caller passes in.
    sns.kdeplot(ax=axs[row, col], data=data, x='Year', y=region)
    axs[row, col].set_title(region)


region = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
for cplot, column_name in enumerate(region):
    jointplot(cplot, df_by_year, column_name)

fig.tight_layout()
# Render the finished figure.
plt.show()

# Predictive Modeling

In [None]:
# Features: Year plus the four regional sales columns. Target: Global_Sales.
# NOTE(review): Global_Sales looks like the sum of the regional columns, which
# would make this target almost trivially predictable — confirm.
feature_columns = ['Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
X = df[feature_columns]
y = df['Global_Sales']

## Multiple Linear Regression (scikit-learn)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Fit a linear regression on the training rows (fit() returns the estimator itself).
reg = LinearRegression().fit(X_train, y_train)

# Predictions for both partitions, then the R^2 score on the held-out rows.
y_pred_train = reg.predict(X_train)
y_pred = reg.predict(X_test)
R_squared = reg.score(X_test, y_test)
R_squared

## R2 Score : 0.9999860496809578

In [None]:
# Display the linear model's predictions for the test rows.
y_pred

In [None]:
# Display the linear model's predictions for the training rows.
y_pred_train

## kNN Regressor

In [None]:
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor

# Candidate neighbour counts: 1 through 14.
kRange = range(1, 15)

# Test-set R^2 for each candidate k, in the same order as kRange.
scores_list = []
for i in kRange:
    regressor_knn = KNeighborsRegressor(n_neighbors=i).fit(X_train, y_train)
    pred = regressor_knn.predict(X_test)
    scores_list.append(r2_score(y_test, pred))

In [None]:
# Refit the kNN regressor with k = 3 and print its R^2 on the held-out rows.
regressor_knn = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
pred = regressor_knn.predict(X_test)
r2_knn = r2_score(y_test, pred)
print(r2_knn)

## R2 Score : 0.9538189722969099

![go](https://www.itl.cat/pngfile/big/297-2974093_game-over-video-games-retro-games-distortion-wallpapers.jpg)