## Description

This dataset contains a list of video games with sales greater than 100,000 copies. It was generated by a scrape of vgchartz.com.

Fields include

    * Name - The game's name

    * Platform - Platform of the game's release (e.g. PC, PS4, etc.)

    * Year - Year of the game's release

    * Genre - Genre of the game

    * Publisher - Publisher of the game

    * NA_Sales - Sales in North America (in millions)

    * EU_Sales - Sales in Europe (in millions)

    * JP_Sales - Sales in Japan (in millions)

    * Other_Sales - Sales in the rest of the world (in millions)

    * Global_Sales - Total worldwide sales (in millions)

## import packages and themes

In [None]:
# data manipulation
import pandas as pd 
import numpy as np
import os

# data vizualisation 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# dataprep
!pip install dataprep
from dataprep.eda import *
from dataprep.datasets import load_dataset
from dataprep.eda import create_report
from dataprep.eda import plot
from dataprep.eda import plot_correlation
from dataprep.eda.missing import plot_missing

# Default theme: apply the seaborn look first, then layer the Matplotlib
# figure-size and font-size defaults on top of it in a single update.
sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
matplotlib.rcParams.update({
    'figure.figsize': [15, 10],
    'font.size': 15,
})




In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the plotting
# and data libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

# 1. data analysis

In [None]:
# Load the sales table; the CSV's 'Rank' column becomes the row index.
df = pd.read_csv('../input/videogamesales/vgsales.csv',index_col='Rank')
# Bare last expression: the notebook renders the frame as the cell output.
df

I set the "Rank" column as the index so that it is not kept as a separate data column.

In [None]:
# Report the table dimensions: rows first, then columns.
n_rows, n_cols = df.shape
print('numebr of rows is :', n_rows, ' and number of columns is : ', n_cols)

In [None]:
# Show the column labels of the loaded table.
df.columns

In [None]:
# Print the per-column dtype and non-null summary.
df.info()

In [None]:
# Number of columns per dtype, computed once and reused for both panels.
dtype_counts = df.dtypes.value_counts()

fig, axarr = plt.subplots(1, 2, figsize=(20, 8))

# Right panel: share of each dtype. `explode` needs one offset per wedge, so it
# is sized from the data instead of assuming exactly two dtypes.
dtype_counts.plot.pie(explode=[0.1] * len(dtype_counts), autopct='%1.1f%%', shadow=True, ax=axarr[1])
# Title each panel through its own Axes: plt.title() only targets the *current*
# axes, so calling it twice would title one panel and leave the other bare.
axarr[1].set_title('type of our data')

# Left panel: the same counts as bars.
dtype_counts.plot(kind='bar', ax=axarr[0])
axarr[0].set_title('type of our data');

In [None]:
# Hand the per-dtype column counts to dataprep's `plot`.
# NOTE(review): `plot` presumably expects a DataFrame, while this passes a
# Series — confirm it renders as intended.
plot(df.dtypes.value_counts())

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns, not just the numeric ones.
df.describe(include='all')

From the `describe` output we can see several things:
    * the data covers the period between 1980 and 2020
    * the most frequent platform is DS (2163 entries)
    * the most frequent genre is Action
    * the most frequent publisher is Electronic Arts

# Data correlation

In [None]:
# Pairwise correlation of the numeric columns only. The frame also holds text
# columns (Name, Platform, Genre, Publisher); recent pandas versions raise when
# DataFrame.corr() meets them instead of silently dropping them, so select the
# numeric columns explicitly. This works on old and new pandas alike.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(13,10))
sns.heatmap(numeric_df.corr(), cmap = "Blues", annot=True, linewidth=3)

# Data visualization

# A. Games Names

In [None]:
matplotlib.rcParams['figure.figsize'] =[15,10]

# value_counts() counts how many rows carry each game name; no sales figure is
# involved, so the title says "entries" rather than "sales".
name_counts = df.Name.value_counts().head(100)
ax = name_counts.plot(kind='bar')
ax.set_title('top 100 game names by number of entries')

# B. platforms

In [None]:
# Pairwise relationship grid over the table, colored by platform.
sns.pairplot(df, hue='Platform')

In [None]:
# Distinct platform values present in the data.
df.Platform.unique()

In [None]:
# Number of rows per platform, computed once and reused for both panels.
platform_counts = df.Platform.value_counts()

fig, axarr = plt.subplots(1, 2, figsize=(20, 8))

# Right panel: share of each platform.
platform_counts.plot.pie(shadow=True, ax=axarr[1])

# Left panel: the same counts as bars. The title is set on this Axes directly,
# because plt.title() targets the current axes and not the axes just drawn on.
platform_counts.plot(kind='bar', ax=axarr[0])
axarr[0].set_title('all the platforms');

### The most common platforms are DS and PS2, each with more than 2,000 entries

### Which platform has the highest sales?


In [None]:
# Total sales per platform for each market, ordered from largest to smallest.
# Each result is a two-column frame: 'Platform' plus the summed sales column.
data_NA_Sales = (
    df.groupby(by=['Platform'])['NA_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['NA_Sales'], ascending=False)
)

data_EU_Sales = (
    df.groupby(by=['Platform'])['EU_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['EU_Sales'], ascending=False)
)

data_JP_Sales = (
    df.groupby(by=['Platform'])['JP_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['JP_Sales'], ascending=False)
)

data_platform = (
    df.groupby(by=['Platform'])['Global_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['Global_Sales'], ascending=False)
)


In [None]:
fig, axarr = plt.subplots(2, 2, figsize=(30, 15))

# One panel per market: (target axes, per-platform totals, sales column).
# Driving the four plots from this table replaces four copy-pasted calls.
platform_panels = [
    (axarr[0][0], data_platform, "Global_Sales"),
    (axarr[1][0], data_NA_Sales, "NA_Sales"),
    (axarr[0][1], data_EU_Sales, "EU_Sales"),
    (axarr[1][1], data_JP_Sales, "JP_Sales"),
]

for ax, totals, sales_col in platform_panels:
    sns.barplot(x="Platform", y=sales_col, data=totals, ax=ax)
    ax.set_title("platfrom have the highest " + sales_col + " price", fontsize=18)

# Extra vertical room so the lower titles do not collide with the upper tick labels.
plt.subplots_adjust(hspace=.3)

# seaborn is already imported at the top of the notebook, so no re-import is needed here.
sns.despine()

In [None]:
# Keep only the platform label and the four regional sales columns.
comp_platform = df[['Platform', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']]
comp_platform.head()

In [None]:
# Reshape wide -> long: the region name goes into 'Sale_Area' and its value into
# 'Sale_Price', so the region can be used as the hue in the next plot.
comp_table = pd.melt(comp_platform, id_vars=['Platform'], value_vars=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'], var_name='Sale_Area', value_name='Sale_Price')
comp_table.head()

In [None]:
# Grouped bars: one group per platform, one bar per sales region.
# NOTE(review): comp_table was melted from un-aggregated rows, so each bar is
# seaborn's per-group estimate (presumably the mean) rather than a total —
# confirm that is the intended quantity.
plt.figure(figsize=(30, 15))
sns.barplot(x='Platform', y='Sale_Price', hue='Sale_Area', data=comp_table)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

# C. Genre

In [None]:
# Pairwise relationship grid over the table, colored by genre.
sns.pairplot(df, hue='Genre')

In [None]:
# Distinct genre values present in the data.
df.Genre.unique()

In [None]:
# Number of rows per genre, computed once and reused for both panels.
genre_counts = df.Genre.value_counts()

fig, axarr = plt.subplots(1, 2, figsize=(20, 8))

# Right panel: share of each genre.
genre_counts.plot.pie(shadow=True, ax=axarr[1])

# Left panel: the same counts as bars. The title is set on this Axes directly,
# because plt.title() targets the current axes and not the axes just drawn on.
genre_counts.plot(kind='bar', ax=axarr[0])
axarr[0].set_title('all the games Genre');

### Action and Sports are the most common game genres in our data

### Which game genre has the highest sales?

In [None]:
# Total sales per genre for each market, ordered from largest to smallest.
# The variable names from the platform section are reused; here `data_platform`
# holds the per-genre global totals.
data_NA_Sales = (
    df.groupby(by=['Genre'])['NA_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['NA_Sales'], ascending=False)
)

data_EU_Sales = (
    df.groupby(by=['Genre'])['EU_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['EU_Sales'], ascending=False)
)

data_JP_Sales = (
    df.groupby(by=['Genre'])['JP_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['JP_Sales'], ascending=False)
)

data_platform = (
    df.groupby(by=['Genre'])['Global_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['Global_Sales'], ascending=False)
)

In [None]:
fig, axarr = plt.subplots(2, 2, figsize=(30, 15))

# One panel per market: (target axes, per-genre totals, sales column),
# listed in the order the panels are drawn.
genre_panels = [
    (axarr[0][0], data_platform, "Global_Sales"),
    (axarr[1][0], data_NA_Sales, "NA_Sales"),
    (axarr[0][1], data_EU_Sales, "EU_Sales"),
    (axarr[1][1], data_JP_Sales, "JP_Sales"),
]

for ax, totals, sales_col in genre_panels:
    sns.barplot(x="Genre", y=sales_col, data=totals, ax=ax)
    ax.set_title("game Genre who have the highest " + sales_col + " price", fontsize=18)

# Extra vertical room between the two rows of panels.
plt.subplots_adjust(hspace=.3)

sns.despine()

### Sales comparison by genre

In [None]:
# Keep only the genre label and the four regional sales columns.
comp_genre = df[['Genre', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']]
comp_genre


In [None]:
# Sum each regional sales column per genre; the result is indexed by 'Genre'.
comp_map = comp_genre.groupby(by=['Genre']).sum()
comp_map

In [None]:
plt.figure(figsize=(15, 10))

# Apply the font scaling only while this figure is built. A bare
# sns.set(font_scale=1) would re-apply seaborn's default theme globally and
# silently override the palette and font settings chosen at the top of the
# notebook for every cell that runs afterwards.
with sns.plotting_context('notebook', font_scale=1):
    # Genre x region table of summed sales, annotated with one decimal place.
    sns.heatmap(comp_map, annot=True, fmt = '.1f')

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.show()

In [None]:
# Long-format view of the per-genre totals: 'Genre' comes back as a column,
# then each region becomes its own row with the region name in 'Sale_Area'
# and the summed value in 'Sale_Price'.
comp_table = pd.melt(
    comp_map.reset_index(),
    id_vars=['Genre'],
    value_vars=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'],
    var_name='Sale_Area',
    value_name='Sale_Price',
)
comp_table.head(20)

In [None]:
# Grouped bars: one group per genre, one bar per sales region, drawn from the
# per-genre totals in comp_table.
plt.figure(figsize=(20, 10))
sns.barplot(x='Genre', y='Sale_Price', hue='Sale_Area', data=comp_table)

### Action, Sports and Shooter have far more sales than the other genres, and North America (NA_Sales) consistently has the highest sales. It looks like North American players love video games the most.

# D. Publisher

In [None]:
# Distinct publisher values present in the data.
df.Publisher.unique()

In [None]:
# Row counts for the 30 most frequent publishers, computed once for both panels.
top_publisher_counts = df.Publisher.value_counts().head(30)

fig, axarr = plt.subplots(1, 2, figsize=(20, 8))

# Right panel: share of each of those publishers.
top_publisher_counts.plot.pie(shadow=True, ax=axarr[1])

# Left panel: the same counts as bars. The title is set on this Axes directly,
# because plt.title() targets the current axes and not the axes just drawn on.
top_publisher_counts.plot(kind='bar', ax=axarr[0])
axarr[0].set_title('all the Publisher');

PS: we show only the top 30 publishers because there are more than 578 of them, which would make the chart hard to read.

As we can see, the 5 most frequent publishers are:

    * Electronic Arts                 1351
    * Activision                       975
    * Namco Bandai Games               932
    * Ubisoft                          921
    * Konami Digital Entertainment     832

In [None]:
# PS: we have around 203 rows whose Publisher is "Unknown"
df.loc[df.Publisher =="Unknown"]

In [None]:
# Keep the publisher label, the four regional sales columns and the global total.
comp_publisher = df[['Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
comp_publisher.head()

In [None]:
# Summed sales per publisher, keeping the 20 publishers with the largest global
# totals. This is built from `df` directly rather than from the previous value
# of `comp_publisher`, so the cell can be re-run safely: the melted result
# below no longer has the 'Global_Sales' column the sort needs.
publisher_totals = (
    df[['Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
    .groupby(by=['Publisher'])
    .sum()
    .reset_index()
    .sort_values(by=['Global_Sales'], ascending=False)
    .head(20)
)

# Long format for plotting: the region name goes into 'Sale_Area' and its
# summed value into 'Sale_Price'.
comp_publisher = pd.melt(publisher_totals, id_vars=['Publisher'], value_vars=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'], var_name='Sale_Area', value_name='Sale_Price')


In [None]:
# Grouped bars: one group per publisher, one bar per sales region.
plt.figure(figsize=(30, 15))
sns.barplot(x='Publisher', y='Sale_Price', hue='Sale_Area', data=comp_publisher)
# Publisher names are long, so the x tick labels are rotated to vertical.
plt.xticks(fontsize=14, rotation=90)
plt.yticks(fontsize=14)
plt.show()

### Top publisher by number of entries each year

In [None]:
# Number of rows per (Year, Publisher) pair.
top_publisher =  df[['Year', 'Publisher']]
top_publisher_df = top_publisher.groupby(by=['Year', 'Publisher']).size().reset_index(name='Count')

# Mark the rows whose count equals the largest count within their year. The
# aggregation is named by string ('max'): passing the Python builtin `max`
# triggers a deprecation warning in recent pandas versions.
top_publisher_idx =  top_publisher_df.groupby(by=['Year'])['Count'].transform('max') == top_publisher_df['Count']
top_publisher_count = top_publisher_df[top_publisher_idx].reset_index(drop=True)

# When several publishers tie for a year, keep only the last one listed.
top_publisher_count  = top_publisher_count.drop_duplicates(subset=["Year", "Count"], keep='last').reset_index(drop=True)

# Publisher names aligned row-for-row with top_publisher_count (used as bar labels).
publisher= top_publisher_count['Publisher']

In [None]:
plt.figure(figsize=(30, 15))
g = sns.barplot(x='Year', y='Count', data=top_publisher_count)

# Write "<publisher>----<count>" vertically just above each bar; bars sit at
# integer positions 0, 1, 2, ... in the same order as the table rows.
bar_counts = top_publisher_count['Count'].values
for bar_pos, (name, count) in enumerate(zip(publisher, bar_counts)):
    g.text(bar_pos, count + 5, str(name + '----' + str(count)), color='#000', size=14, rotation= 90, ha="center")

plt.xticks(rotation=90)
plt.show()

# E. Sales

In [None]:
fig, axarr = plt.subplots(3, 2, figsize=(20,15))

# Grid slot (row, col) for each sales column, in drawing order.
# Slot (2, 1) is left unused.
histogram_slots = {
    'NA_Sales': (0, 0),
    'EU_Sales': (1, 0),
    'JP_Sales': (0, 1),
    'Other_Sales': (1, 1),
    'Global_Sales': (2, 0),
}

# Plain histogram (no KDE curve) of every sales column.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# installed version still provides it.
for sales_col, (row, col) in histogram_slots.items():
    sns.distplot(df[sales_col], kde=False, color='mediumvioletred', ax=axarr[row][col])

# Extra vertical room between the rows of panels.
plt.subplots_adjust(hspace=.5)

sns.despine()

In [None]:
from scipy import stats

In [None]:
# Work on a copy so the filtering below leaves `df` untouched.
data_hist_log = df.copy()

In [None]:
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

# Keep only the rows where none of the sales columns is zero, since log(0) is
# undefined. One combined mask replaces the five successive filters.
nonzero_everywhere = (data_hist_log[sales_columns] != 0).all(axis=1)
data_hist_log = data_hist_log[nonzero_everywhere]

plt.figure(figsize=(25,30))
# One panel per sales column on a 3x2 grid: histogram of the log-sales with a
# fitted gamma density drawn on top.
for panel_number, column in enumerate(sales_columns, start=1):
    plt.subplot(3, 2, panel_number)
    sns.distplot(np.log(data_hist_log[column]), bins=20, kde=False, fit=stats.gamma,color='mediumvioletred')


In [None]:
# Build dataprep's report for the whole table.
create_report(df)

## finding missing values

In [None]:
# Missing cells per column, as an absolute count and as a percentage of all rows.
missing_values = df.isna().sum()
percent_missing = missing_values / len(df) * 100

# Side-by-side summary table, one row per column of `df`.
value = {'missing_values ': missing_values, 'percent_missing %': percent_missing}
frame = pd.DataFrame(value)
frame


In [None]:
# dataprep's missing-value visualisation for the whole table.
plot_missing(df)

### Let's check the rows that have missing values

In [None]:
# Rows that have a missing value in at least one column.
null_data = df[df.isnull().any(axis=1)]
null_data

As we can see, 307 rows have at least one missing value.

## If you like this notebook, please give it an upvote
#### Thank You ..............