# INTRODUCTION



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
from collections import Counter
from plotly.offline import init_notebook_mode, iplot # plotly offline mode
init_notebook_mode(connected=True) 
import plotly.graph_objs as go # plotly graphical object

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings("ignore")
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))
#plt.style.use('ggplot')
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the sales table and drop the 'Rank' column right away.
df = pd.read_csv("/kaggle/input/videogamesales/vgsales.csv").drop(columns='Rank')

# Missing years are replaced by the sentinel 0 so the column can be cast to int.
# (The former bare `df.Year.isnull().any()` line was removed: its result was
# discarded, so it neither checked nor displayed anything.)
df['Year'] = df['Year'].fillna(0).astype(int)

df.head()


In [None]:
len(df)

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df['Name'].value_counts()

## Variable Description
* Rank          int64  
* Name          object 
* Platform      object 
* Year          float64
* Genre         object 
* Publisher     object 
* NA_Sales      float64
* EU_Sales      float64
* JP_Sales      float64
* Other_Sales   float64
* Global_Sales  float64

### Categorical Analysis
* Name          object 
* Platform      object 
* Genre         object 
* Publisher     object 

In [None]:
def bar_plot(attribute, data=None):
    """Draw a bar chart of how often each distinct value of a column occurs.

    Parameters
    ----------
    attribute : str
        Name of the column whose values are counted.
    data : pandas.DataFrame, optional
        Table to read the column from. Defaults to the notebook-level ``df``
        so existing ``bar_plot(column)`` calls keep working.
    """
    if data is None:
        data = df

    # value_counts() yields the distinct values as index with their counts.
    count = data[attribute].value_counts()

    plt.figure(figsize = (8,5))
    plt.bar(count.index, count)
    plt.xticks(count.index, count.index.values, rotation = 90)
    # Label the axis with the plotted column so successive charts can be told apart.
    plt.xlabel(attribute)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Text columns whose value frequencies are charted, one bar_plot figure each.
attr_list = ['Name','Platform','Genre','Publisher']
for item in attr_list:
    bar_plot(item)

### Numerical Analysis
* Rank          int64  
* NA_Sales      float64
* EU_Sales      float64
* JP_Sales      float64
* Other_Sales   float64
* Global_Sales  float64

In [None]:
def histogram(data, attribute):
    """Show a 50-bin histogram of one column of ``data``.

    Parameters
    ----------
    data : table indexable by column name
        Source of the values, accessed as ``data[attribute]``.
    attribute : str
        Column to plot; also used as x-axis label and in the red title.
    """
    _, axis = plt.subplots(figsize=(10, 9))
    axis.hist(data[attribute], bins=50)
    axis.set(xlabel=attribute, ylabel="Frequency")
    axis.set_title(attribute + " Frequency", color="red")
    plt.show()

In [None]:
# Sales columns to plot as histograms. NOTE: despite the variable name, these
# are the numeric (float) sales columns, not categorical ones.
categorical = ["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]
for each in categorical:
    histogram(df, each)

### Outlier Detection

In [None]:
def outlier(data, attributes):
    """Return index labels of rows that are outliers in more than two columns.

    For every column in ``attributes`` a row counts as an outlier when its
    value lies more than 1.5 * IQR below the 25th percentile or above the
    75th percentile of that column. Labels are returned in the order in
    which they were first flagged.
    """
    hits = Counter()

    for column in attributes:
        values = data[column]
        lower_q = np.percentile(values, 25)
        upper_q = np.percentile(values, 75)
        fence = 1.5 * (upper_q - lower_q)

        flagged = (values < lower_q - fence) | (values > upper_q + fence)
        # Counter.update() adds one hit per flagged row label.
        hits.update(data[flagged].index)

    return [label for label, times in hits.items() if times > 2]
    
    

In [None]:
df.loc[outlier(df, ["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"])]

Outlier detection is not a good method for this dataset. Almost all rows are outliers.

### Basic Data Analysis

In this section, I analyse the data along several of its attributes.

In [None]:
df.info()

In [None]:
# Mean North-America sales per genre, lowest first.
na_sales = (
    df[['NA_Sales', 'Genre']]
    .groupby('Genre', as_index=False)
    .mean()
    .sort_values(by='NA_Sales', ascending=True)
)
plt.scatter(na_sales['Genre'], na_sales['NA_Sales'])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean Europe sales per genre, lowest first.
eu_sales = (
    df[['EU_Sales', 'Genre']]
    .groupby('Genre', as_index=False)
    .mean()
    .sort_values(by='EU_Sales', ascending=True)
)
plt.scatter(eu_sales['Genre'], eu_sales['EU_Sales'])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean Japan sales per genre, lowest first.
jp_sales = (
    df[['JP_Sales', 'Genre']]
    .groupby('Genre', as_index=False)
    .mean()
    .sort_values(by='JP_Sales', ascending=True)
)
plt.scatter(jp_sales['Genre'], jp_sales['JP_Sales'])
plt.xticks(rotation=90)
plt.show()


In [None]:
# Mean global sales per genre, lowest first.
g_sales = (
    df[['Global_Sales', 'Genre']]
    .groupby('Genre', as_index=False)
    .mean()
    .sort_values(by='Global_Sales', ascending=True)
)
plt.scatter(g_sales['Genre'], g_sales['Global_Sales'])
plt.xticks(rotation=90)
plt.show()

This analysis shows which genres are preferred in each region and lets us compare them with global sales.

In [None]:
# Mean North-America sales per platform, lowest first. Only the sales column
# and the grouping key are selected: carrying the text column 'Genre' into
# mean() raises TypeError on pandas >= 2.0 (older versions silently dropped it).
nap_sales = (
    df[['NA_Sales', 'Platform']]
    .groupby('Platform', as_index=False)
    .mean()
    .sort_values(by='NA_Sales', ascending=True)
)
# Column 0 is the group key (Platform), column 1 the mean sales.
plt.scatter(nap_sales.iloc[:, 0], nap_sales.iloc[:, 1])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean Europe sales per platform, lowest first. Only the sales column and the
# grouping key are selected: carrying the text column 'Genre' into mean()
# raises TypeError on pandas >= 2.0 (older versions silently dropped it).
eup_sales = (
    df[['EU_Sales', 'Platform']]
    .groupby('Platform', as_index=False)
    .mean()
    .sort_values(by='EU_Sales', ascending=True)
)
# Column 0 is the group key (Platform), column 1 the mean sales.
plt.scatter(eup_sales.iloc[:, 0], eup_sales.iloc[:, 1])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean Japan sales per platform, lowest first. Only the sales column and the
# grouping key are selected: carrying the text column 'Genre' into mean()
# raises TypeError on pandas >= 2.0 (older versions silently dropped it).
jpp_sales = (
    df[['JP_Sales', 'Platform']]
    .groupby('Platform', as_index=False)
    .mean()
    .sort_values(by='JP_Sales', ascending=True)
)
# Column 0 is the group key (Platform), column 1 the mean sales.
plt.scatter(jpp_sales.iloc[:, 0], jpp_sales.iloc[:, 1])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean global sales per platform, lowest first. Only the sales column and the
# grouping key are selected: carrying the text column 'Genre' into mean()
# raises TypeError on pandas >= 2.0 (older versions silently dropped it).
gb_sales = (
    df[['Global_Sales', 'Platform']]
    .groupby('Platform', as_index=False)
    .mean()
    .sort_values(by='Global_Sales', ascending=True)
)
# Column 0 is the group key (Platform), column 1 the mean sales.
plt.scatter(gb_sales.iloc[:, 0], gb_sales.iloc[:, 1])
plt.xticks(rotation=90)
plt.show()

In this basic analysis we can see the different platforms for different regions.

## Visualization

In [None]:
df.describe()

In [None]:
# Correlate only the numeric columns: on pandas >= 2.0 DataFrame.corr() no
# longer skips text columns (Name, Platform, ...) and raises instead.
# `linewidths` is the keyword seaborn documents for the cell separator width.
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    annot=True,
    fmt='.1',
    linewidths=.7,
    linecolor='red',
)
plt.show()

In [None]:
df.info()

In [None]:
# Number of rows per game title, as labels (index) and counts (values).
games = df['Name'].value_counts()
game_names, game_counts = games.index, games.values

plt.figure(figsize=(10, 10))
sns.barplot(
    x=game_names,
    y=game_counts,
    # one cubehelix colour per distinct title
    palette=sns.cubehelix_palette(len(game_names)),
)
plt.show()

## Machine Learning
* Linear Regression
* Multiple Linear Regression
* Decision Tree
* Random Forest

### Linear Regression

In [None]:
# Quick visual check of how NA sales relate to global sales.
plt.figure(figsize=(5, 5))
plt.scatter(df['NA_Sales'], df['Global_Sales'])
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Both feature and target are reshaped into (n_samples, 1) column vectors.
x = df['NA_Sales'].to_numpy().reshape(-1, 1)
y = df['Global_Sales'].to_numpy().reshape(-1, 1)

# Ordinary least squares: Global_Sales as a function of NA_Sales.
linear = LinearRegression()
linear.fit(x, y)

In [None]:
from sklearn.metrics import r2_score

# Evenly spaced feature values spanning the observed range, used to draw the
# fitted line. ndarray.min()/.max() return scalars; the builtin min()/max() on
# a 2-D array return 1-element arrays, which np.arange only accepts with a
# deprecation warning on NumPy >= 1.25.
x_ = np.arange(x.min(), x.max(), 0.1).reshape(-1,1)
predicted = linear.predict(x_)

# R^2 of the fit, computed on the same data the model was trained on.
yhead = linear.predict(x)
print("r score: ", r2_score(y,yhead))

In [None]:
# Scatter the observed (x, y) points and overlay the model's predictions on
# the x_ grid as a red line.
plt.figure( figsize = (5,5))
plt.scatter(x, y)
plt.plot(x_, predicted, color = 'red')
plt.show()

There is a linear relationship between NA_Sales and Global_Sales; as we saw earlier, the two are strongly related.

In [None]:
# Second single-feature regression: Global_Sales as a function of EU_Sales.
x2 = df.EU_Sales.values.reshape(-1,1)
y2 = df.Global_Sales.values.reshape(-1,1)

linear2 = LinearRegression()
# Fit on the EU data defined above (the original fitted on x, y, i.e. it
# re-trained the NA_Sales model under a new name).
linear2.fit(x2, y2)

In [None]:
from sklearn.metrics import r2_score

# Grid over the observed EU_Sales range for drawing the fitted line. The upper
# bound must come from the feature x2 (the original used max(y2), stretching
# the line over the much larger Global_Sales range). ndarray.min()/.max() are
# used so np.arange receives scalars rather than 1-element arrays.
x_2 = np.arange(x2.min(), x2.max(), 0.1).reshape(-1,1)
predicted2 = linear2.predict(x_2)

# R^2 of the fit, computed on the same data the model was trained on.
yhead = linear2.predict(x2)
print("r score: ", r2_score(y2,yhead))

In [None]:
# Scatter the observed (x2, y2) points and overlay the model's predictions on
# the x_2 grid as a red line.
plt.figure( figsize = (5,5))
plt.scatter(x2, y2)
plt.plot(x_2, predicted2, color = 'red')
plt.show()

In [None]:
# Two-feature regression: Global_Sales from NA_Sales and EU_Sales. The columns
# are selected by name; the original positional iloc[:, [5, 6]] picked the same
# two columns but would silently change meaning if the column order shifted.
x_mul = df[['NA_Sales', 'EU_Sales']]
y_mul = df.Global_Sales.values.reshape(-1,1)

mult = LinearRegression()
mult.fit(x_mul, y_mul)

In [None]:
mult.intercept_

In [None]:
mult.coef_

In [None]:
from sklearn.metrics import r2_score

# Predictions for two hand-picked feature rows. They are printed explicitly:
# as a bare expression in the middle of the cell the result was discarded.
print(mult.predict(np.array([[10,100],[50,70]])))

# R^2 of the fit, computed on the same data the model was trained on.
yhead = mult.predict(x_mul)
print("r score: ", r2_score(y_mul,yhead))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Feature and target as (n_samples, 1) column vectors.
x = df['NA_Sales'].to_numpy().reshape(-1, 1)
y = df['Global_Sales'].to_numpy().reshape(-1, 1)

# Regression tree with default settings: Global_Sales from NA_Sales.
tree = DecisionTreeRegressor()
tree.fit(x, y)

In [None]:
tree.predict(np.array([[40],[50]]))

In [None]:
# Fine grid (step 0.01) over the observed feature range for drawing the tree's
# prediction curve. ndarray.min()/.max() return scalars; the builtin
# min()/max() on a 2-D array return 1-element arrays, which np.arange only
# accepts with a deprecation warning on NumPy >= 1.25.
x_ = np.arange(x.min(), x.max(), 0.01).reshape(-1,1)
pred_tree = tree.predict(x_)

plt.scatter(x, y)
plt.plot(x_, pred_tree, color = 'red')
plt.show()

In [None]:
from sklearn.metrics import r2_score
# Score the tree on x, y — no separate test split is made in this cell, so
# this measures fit to whatever data x and y currently hold.
yhead = tree.predict(x)
print("r score: ", r2_score(y,yhead))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Feature as an (n_samples, 1) matrix; y is kept as a column vector because
# later cells reuse it.
x = df.NA_Sales.values.reshape(-1,1)
y = df.Global_Sales.values.reshape(-1,1)

# 100 trees, fixed seed for reproducible results.
rf = RandomForestRegressor(n_estimators = 100, random_state = 50)
# The forest expects a 1-D target for a single output; passing the column
# vector triggers a DataConversionWarning (hidden by the global warnings
# filter), so it is flattened explicitly here.
rf.fit(x, y.ravel())

In [None]:
rf.predict(np.array([[50]]))

In [None]:
# Fine grid (step 0.01) over the observed feature range for drawing the
# forest's prediction curve. ndarray.min()/.max() return scalars; the builtin
# min()/max() on a 2-D array return 1-element arrays, which np.arange only
# accepts with a deprecation warning on NumPy >= 1.25.
x_ = np.arange(x.min(), x.max(), 0.01).reshape(-1,1)
pred_rf = rf.predict(x_)

plt.scatter(x, y)
plt.plot(x_, pred_rf, color = 'red')
plt.show()

In [None]:
from sklearn.metrics import r2_score

# Score the forest on x, y — no separate test split is made in this cell, so
# this measures fit to whatever data x and y currently hold.
yhead = rf.predict(x)
print("r score: ", r2_score(y,yhead))