In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_files = (
    os.path.join(root, fname)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Video Game Sales Analysis

In [None]:
# Load the video game sales CSV into a DataFrame and preview its first ten rows.
csv_path = '../input/videogamesales/vgsales.csv'
df = pd.read_csv(csv_path)
df.head(n=10)

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing entries in each column.
df.isna().sum(axis=0)

In [None]:
# Show pandas' default descriptive statistics for the frame.
df.describe()

In [None]:
# Total number of missing cells across the whole frame (a row that lacks two
# fields contributes two to this count).
missing_vals = df.isnull().sum().sum()

# Rows are dropped as a whole when cleaning, so the relevant figure is the
# share of rows that have at least one missing field. Dividing the cell count
# by the row count would count such a row once per missing field and
# overstate the loss.
rows_with_missing = df.isnull().any(axis=1).sum()
print('% of rows with missing values : ', rows_with_missing/len(df.index)*100)

Because the missing values are very few compared with the total number of values, we can drop the affected rows without significant loss of data.

In [None]:
# Discard every row that contains at least one missing value, then re-check
# the frame summary.
df = df.dropna(axis=0, how='any')
df.info()

In [None]:
# Store the Year column as integers.
df = df.astype({'Year': 'int'})

In [None]:
# Re-print the frame summary to confirm the dtype of Year after the cast.
df.info()

In [None]:
# Rows per platform, followed by the number of distinct platforms.
platform_counts = df['Platform'].value_counts()
print(platform_counts)
print('\n\n')
platforms = df['Platform'].nunique()
print(f'Total no of platforms: {platforms}')

There are 31 different platforms, with DS, PS2, PS3, Wii, and X360 being the most popular (by number of titles).

In [None]:
# Rows per genre, followed by the number of distinct genres.
genre_counts = df['Genre'].value_counts()
print(genre_counts)
print('\n\n')
genres = df['Genre'].nunique()
print(f'Total no of genres: {genres}')

There are 12 genres. Action, Sports, Misc, Role-Playing, and Shooter are the most common.

Using a **heatmap** to find the correlations among the variables

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Correlation heatmap of the numeric columns.
# The frame also holds text columns (the notebook groups by Platform, Genre,
# Publisher and Name). pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr() and raises instead, so restrict the frame to
# numeric columns explicitly; this behaves the same on older versions.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(8,8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='Blues')
plt.show()

## As can be seen from above heatmap<br>
* Rank has almost zero correlation with other variables
* The 'Year' variable doesn't make sense here
* Global sales are highly dependent on North American and European sales: as sales in North America and Europe increase, global sales also tend to increase
* Also, global sales depend fairly on Japanese sales and other sales


## Pairplot<br>
Excluding 'Year' and 'Rank'

In [None]:
# Pairwise plots of the columns that remain once Year and Rank are removed.
pairplot_data = df.drop(columns=['Year', 'Rank'])
sns.pairplot(pairplot_data)
plt.show()

## Year-wise game releases

In [None]:
# Number of games released per year.
# A bare groupby selection is only a lazy SeriesGroupBy object (the cell would
# display its repr, not data); counting the names in each group produces the
# year-wise release counts this section is about.
df.groupby('Year')['Name'].count()

## Year-wise Global Sales

In [None]:
# Total global sales for each year.
df['Global_Sales'].groupby(df['Year']).sum()

From the above Series, we see that the data jumps directly from 2017 to 2020. It is very unlikely that no video games were released in 2018 and 2019, so we can conclude that the game whose 'Year' is 2020 was recorded incorrectly.

In [None]:
# Show the rows whose Year is 2020.
df.loc[df['Year'].eq(2020)]

This game's year is recorded incorrectly. Changing its year to 2009.

In [None]:
# Overwrite every Year value of 2020 with 2009.
df.loc[df['Year'] == 2020, 'Year'] = 2009

In [None]:
# Re-run the Year == 2020 filter to check the outcome of the correction.
df.query('Year == 2020')

In [None]:
# Bar chart of total global sales for each year.
yearly_sales = df.groupby('Year')['Global_Sales'].sum()
fig, ax = plt.subplots(figsize=(16, 8))
yearly_sales.plot.bar(ax=ax)
ax.set_ylabel('Sales (Millions)')
plt.show()

## Genre-wise Global Sales

In [None]:
# Bar chart of total global sales per genre, highest first.
genre_sales = df.groupby('Genre')['Global_Sales'].sum()
genre_sales = genre_sales.sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(10, 5))
genre_sales.plot.bar(ax=ax)
ax.set_ylabel('Sales (Millions)')
plt.show()

People mostly prefer **Action** games over games of any other genre

## Platform wise global Sales

In [None]:
# Bar chart of total global sales for the ten best-selling platforms.
platform_sales = df.groupby('Platform')['Global_Sales'].sum()
top_platforms = platform_sales.sort_values(ascending=False).iloc[:10]
fig, ax = plt.subplots(figsize=(10, 6))
top_platforms.plot.bar(ax=ax)
ax.set_ylabel('Sales (Millions)')
plt.show()

Games released on the **PS2** platform have higher global sales than those on any other platform

## Publisher-wise global sales

In [None]:
# Bar chart of total global sales for the ten best-selling publishers.
publisher_sales = df.groupby('Publisher')['Global_Sales'].sum()
top_publishers = publisher_sales.sort_values(ascending=False).iloc[:10]
fig, ax = plt.subplots(figsize=(10, 6))
top_publishers.plot.bar(ax=ax)
ax.set_ylabel('Sales (Millions)')
plt.show()

**Nintendo** has the highest global sales of any publisher

# What's the most popular Game, Genre, Publisher and Platform of all time?

In [None]:
# For every sales region, display the single entry of each aspect (platform,
# genre, publisher, game title) with the highest summed sales.
regions = ['NA_Sales','EU_Sales','JP_Sales']
aspects = ['Platform','Genre','Publisher','Name']
for region in regions:
    for aspect in aspects:
        totals = df.groupby(aspect)[region].sum()
        display(totals.sort_values(ascending=False).iloc[:1])

From the above results, it can be concluded that:<br>
* In North America\
**X360** Platform is most popular\
**Action genre** is most popular along with Nintendo publisher\
**Wii Sports** is the most popular game


* In Europe\
The **PS3** platform is the most popular; the most popular genre, publisher, and game are the same as in North America


* In Japan\
**DS** platform is the most popular\
**Role-Playing Games (RPGs)** are the most popular genre\
**Nintendo** is the most popular publisher along with **Pokemon Red/Pokemon Blue** being the most popular game title

# Top 5 games from each region

## For North America

In [None]:
# Five titles with the highest summed North American sales.
na_totals = df.groupby('Name')['NA_Sales'].sum()
na_totals.sort_values(ascending=False).iloc[:5]

## For Europe

In [None]:
# Five titles with the highest summed European sales.
eu_totals = df.groupby('Name')['EU_Sales'].sum()
eu_totals.sort_values(ascending=False).iloc[:5]

## For Japan

In [None]:
# Five titles with the highest summed Japanese sales.
jp_totals = df.groupby('Name')['JP_Sales'].sum()
jp_totals.sort_values(ascending=False).iloc[:5]