In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from scipy import stats
from scipy.stats import norm

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and dataframe calls below -- confirm that is intended.
warnings.filterwarnings('ignore')

## A look at our data
The dataset has 9 columns of interest:
1. Name of the game
2. Platform the game was released on
3. Year of release
4. Genre of the game
5. Its publisher
6. Sales of the game, in millions, in
    - a. Europe
    - b. Japan
    - c. North America
    - d. Other parts of the world

In [None]:
# Load the video-game sales CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/videogamesales/vgsales.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

## Info about the columns (dtypes, non-null counts and missing values)

In [None]:
# Print a per-column summary of df (column names, non-null counts, dtypes).
df.info()

It has four object-type columns and 6 numerical columns,
and has **missing** **values** in the **Year** and **Publisher** columns.

## Let's look into some statistics of our dataset

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# column of df becomes one row of the displayed table.
df.describe().transpose()

- Most of the sales columns are skewed: there is a huge gap between the 3rd quartile of each sales column and its maximum value.
- **We have data until the year 2017, but the maximum value of Year is 2020**

## We will see how many entries have Year as 2020 and will delete those entries

In [None]:
# Number of rows whose Year equals 2020 (a single .loc selection instead of
# two chained [] lookups).
df.loc[df['Year'] == 2020, 'Year'].count()

**Only one entry !! We will delete this entry**
**as well as we will drop Rank column**

In [None]:
# Remove the Rank column and every row whose Year is 2020.
#
# Both steps are plain reassignments rather than inplace mutation, and the
# drop tolerates an already-missing 'Rank' column, so re-running this cell is
# a no-op instead of raising KeyError on the second execution.
df = df.drop(columns=['Rank'], errors='ignore')
# Rows with a missing Year are kept: NaN != 2020 evaluates to True.
df = df[df['Year'] != 2020]

## Distribution of Numerical Features

In [None]:
# Numeric columns to visualise, one per panel of a 2x3 grid.
num_cols = [
    'Year',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
    'Global_Sales',
]

# Create the grid up front and draw each distribution (with a fitted normal
# curve overlaid) onto its own axes.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept
# here so the cell behaves exactly as before.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
for ax, col in zip(axes.flat, num_cols):
    sns.distplot(df[col], fit=norm, ax=ax)

## Correlation Matrix

In [None]:
plt.figure(figsize=(10,8))
# Correlate the numeric columns only. df still contains object columns
# (Name, Platform, Genre, Publisher); calling df.corr() on the whole frame
# raises on recent pandas versions, which no longer silently drop
# non-numeric columns. Selecting the numeric dtypes explicitly works on both
# old and new pandas.
numeric_df = df.select_dtypes(include='number')
# Annotated heatmap of the pairwise (Pearson, the pandas default) correlations.
sns.heatmap(numeric_df.corr(), annot=True)

**All sales columns are highly correlated with our Target Variable (Global_Sales)!**

## Categorical Variables EDA

In [None]:
# Names of the categorical (object-typed) columns.
# NOTE(review): this list is not referenced by any other cell visible here.
cat_cols=['Name','Publisher','Genre','Platform']

### Checking for unique entries in the Genre, Publisher and Platform columns

In [None]:
# Distinct platform labels, in order of first appearance.
df.Platform.unique()

In [None]:
# Distinct genre labels, in order of first appearance.
df.Genre.unique()


### Now we will examine Genre and Platform columns

## Genre

#### Checking the count of each genre in our dataset

In [None]:
plt.figure(figsize=(10,8))
# Bar order: genres from most to least frequent. value_counts() counts rows
# per genre (exactly what countplot draws) and sorts descending by default.
genre_order = df['Genre'].value_counts().index
# Pass the column by keyword. A positional Series is interpreted as `data`
# by newer seaborn releases (and only triggered a deprecation warning in
# older ones), so the original positional call is not portable.
sns.countplot(x='Genre', data=df, order=genre_order)

### Which genre contributes max to Global Sales?!

In [None]:
# Total global sales per genre, as a flat (Genre, Global_Sales) table sorted
# from the best-selling genre downwards.
data_genre = (
    df.groupby('Genre')[['Global_Sales']]
      .sum()
      .reset_index()
      .sort_values(by='Global_Sales', ascending=False)
)

In [None]:
# Wide canvas so every genre label fits on the x axis.
plt.figure(figsize=(20, 10))
# One bar per genre, height = summed global sales.
sns.barplot(x='Genre', y='Global_Sales', data=data_genre)

### It's Action games!!!

## **Platform**

### Now we will do EDA for Platform feature to get some insight!!

* ### checking the counts of each Platform in our data

In [None]:
# Number of (non-null) game names per platform, most common platform first.
data_plat = (
    df.groupby('Platform')[['Name']]
      .count()
      .sort_values(by='Name', ascending=False)
      .reset_index()
)

plt.figure(figsize=(20, 10))
# Enlarge fonts; sns.set changes global styling, so it also affects later plots.
sns.set(font_scale=1.2)
sns.barplot(x='Platform', y='Name', data=data_plat)

### DS and PS2 are in competition here!!!

* ### Checking the effect of each platform on Global Sales


In [None]:
# Summed global sales per platform, best-selling platform first.
data_plat_sales = (
    df.groupby('Platform')[['Global_Sales']]
      .sum()
      .sort_values(by='Global_Sales', ascending=False)
      .reset_index()
)

plt.figure(figsize=(20, 10))
# Enlarge fonts; sns.set changes global styling, so it also affects later plots.
sns.set(font_scale=1.2)
sns.barplot(x='Platform', y='Global_Sales', data=data_plat_sales)

### as expected for PS2!!! but here DS goes down :( !

## Year

* ### Now we will do some EDA on Year!!
* - Let's see what we get here!!

In [None]:
# Number of (non-null) game names per release year, busiest year first.
data_year = (
    df.groupby('Year')['Name']
      .count()
      .sort_values(ascending=False)
      .reset_index()
)

plt.figure(figsize=(20, 10))
# Shrink fonts so the many year labels fit; this is a global seaborn setting.
sns.set(font_scale=0.8)
sns.barplot(x='Year', y='Name', data=data_year)


### So most of the games in our data were released in 2008 and 2009!!

### Let's find out which Year has maximum Global Sales

In [None]:
# Summed global sales per release year, best year first.
data_year_sale = (
    df.groupby('Year')[['Global_Sales']]
      .sum()
      .sort_values(by='Global_Sales', ascending=False)
      .reset_index()
)

plt.figure(figsize=(20, 10))
# Shrink fonts so the many year labels fit; this is a global seaborn setting.
sns.set(font_scale=0.8)
sns.barplot(x='Year', y='Global_Sales', data=data_year_sale)


### As Expected!! It's 2008 and 2009!!!

## Let's check for the variation or linearity of different Sales with Global Sales

* ### NA_SALES

In [None]:
# Scatter of NA_Sales against Global_Sales with a fitted regression line.
sns.lmplot(data=df, x='Global_Sales', y='NA_Sales')

* ### EU_SALES

In [None]:
# Scatter of EU_Sales against Global_Sales with a fitted regression line.
sns.lmplot(data=df, x='Global_Sales', y='EU_Sales')

* ### JP_SALES

In [None]:
# Scatter of JP_Sales against Global_Sales with a fitted regression line.
sns.lmplot(data=df, x='Global_Sales', y='JP_Sales')

* ### Other_SALES

In [None]:
# Scatter of Other_Sales against Global_Sales with a fitted regression line.
sns.lmplot(data=df, x='Global_Sales', y='Other_Sales')

### Here we see that almost all the sales are linearly correlated with Global Sales
**-JP_Sales is slightly less linear**