# Importing the packages

In [None]:
import pandas as pd
import matplotlib as mat
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np

In [None]:
# Notebook-wide matplotlib defaults: a large canvas and a readable base font size.
mat.rcParams['figure.figsize'] = (20, 15)
mat.rcParams['font.size'] = 14

# Reading the files

In [None]:
# Input locations (relative to the notebook's working directory).
SALES_CSV = '../input/videogamesales/vgsales.csv'
IGN_CSV = '../input/ign-dataset/ign.csv'

# `sales` is the table analysed throughout; `extra_info` is only used further
# down to recover missing release years.
sales = pd.read_csv(SALES_CSV)
extra_info = pd.read_csv(IGN_CSV)

# Preprocessing 

In [None]:
# Drop the file's 'Rank' column (a rank is rebuilt from the row order later on).
# `columns=` replaces the positional axis argument (`drop('Rank', 1)`), which
# pandas 2.0 no longer accepts; errors='ignore' keeps the cell re-runnable once
# the column is already gone.
sales = sales.drop(columns='Rank', errors='ignore')

## Missing Values

Getting the sales dataset info to check for nan values

In [None]:
# Column dtypes and non-null counts, used here to spot columns with missing values.
sales.info()

In [None]:
# Preview the first rows of the sales table.
sales.head()

Checking NaN values in NA_Sales and EU_Sales, as they showed a lower number of non-null values than JP_Sales, Other_Sales and Global_Sales

In [None]:
# Rows whose North America figure is missing.
na_sales_missing = sales['NA_Sales'].isna()
sales.loc[na_sales_missing]

In [None]:
# Rows whose Europe figure is missing.
eu_sales_missing = sales['EU_Sales'].isna()
sales.loc[eu_sales_missing]

Fixing both, as they can be inferred from the other columns (global sales minus the remaining regions)

In [None]:
# A missing regional figure is recovered as Global_Sales minus the other three
# regions. This is done column-wise instead of with a row-wise
# `apply(lambda x: x[3] - ...)`, which relied on positional integer indexing of
# a label-indexed Series (deprecated) and is fragile when no row is missing.
# Plain `+` propagates NaN, so a row missing two regions stays NaN rather than
# receiving a wrong value.
na_missing = sales['NA_Sales'].isna()
na_inferred = sales['Global_Sales'] - (sales['EU_Sales'] + sales['JP_Sales'] + sales['Other_Sales'])
sales.loc[na_missing, 'NA_Sales'] = na_inferred[na_missing]

# EU is filled second so it can use the NA values recovered just above.
eu_missing = sales['EU_Sales'].isna()
eu_inferred = sales['Global_Sales'] - (sales['NA_Sales'] + sales['JP_Sales'] + sales['Other_Sales'])
sales.loc[eu_missing, 'EU_Sales'] = eu_inferred[eu_missing]

Recovering as many of the missing game release years as possible

In [None]:
# Normalise the title columns of both tables (lower-case, then trim surrounding
# whitespace) so they can be compared when the tables are merged.
for frame, title_column in ((sales, 'Name'), (extra_info, 'title')):
    frame[title_column] = frame[title_column].str.lower().str.strip()

Getting the first chars from each word in the platform name (abbr.)

In [None]:
def _abbreviate_platform(platform_name):
    """Build an abbreviation from a full platform name.

    A space is first inserted between any word character and the capital
    letter that follows it (so "PlayStation" becomes "Play Station"). The name
    is then split on whitespace: purely alphabetic tokens contribute their
    first character, every other token (e.g. "360", "4") is kept whole.

    Examples: "PlayStation 4" -> "PS4", "Xbox 360" -> "X360",
    "Nintendo DS" -> "NDS".
    """
    spaced = re.sub(r"(\w)([A-Z])", r"\1 \2", platform_name)
    pieces = []
    for token in spaced.split():
        pieces.append(token[0] if token.isalpha() else token)
    return ''.join(pieces)


extra_info["Platform-md"] = extra_info.platform.apply(_abbreviate_platform)

Any platform name that already matches a platform in the sales dataset is kept as-is in the modified platform column of the data used for recovery

In [None]:
# Where the raw platform name is already one of the sales platform codes,
# use it unchanged instead of the generated abbreviation.
already_sales_code = extra_info.platform.isin(sales["Platform"].unique())
extra_info.loc[already_sales_code, 'Platform-md'] = extra_info.loc[already_sales_code, 'platform']

List the sales platforms that are still unmatched, to check whether their abbreviations can be set manually

In [None]:
# Sales platform codes that no generated abbreviation matches yet.
generated_codes = extra_info["Platform-md"].unique()
unmatched = ~sales["Platform"].isin(generated_codes)
sales.loc[unmatched, "Platform"].unique()

Translating the erroneous abbr. platforms

In [None]:
# Manual corrections: generated abbreviation -> platform code to use instead
# (so that it lines up with the codes in the sales table).
fixing_platform = {
    'NDS': 'DS',
    'N3D': '3DS',
    'WU': 'WiiU',
    'X': 'XB',
    'XO': 'XOne',
    'A2600': '2600',
    'SNE': 'SNES',
    'G': 'GEN',
    'D': 'DC',
    'S': 'SAT',
    'TGrafx-16': 'TG16',
}

In [None]:
# Apply each manual correction to the rows carrying that generated abbreviation.
for generated_code, sales_code in fixing_platform.items():
    needs_fix = extra_info['Platform-md'] == generated_code
    extra_info.loc[needs_fix, 'Platform-md'] = sales_code

Merging the 2 datasets

In [None]:
# Give the IGN table the same key column names as the sales table, then
# left-join so every sales row is kept even without an IGN match.
# A sales row matching several IGN rows is repeated once per match.
extra_info_keyed = extra_info.rename(columns={'title': 'Name', 'Platform-md': 'Platform'})
sales_year_recovered = sales.merge(extra_info_keyed, how="left", on=['Name', 'Platform'])

updating the years

In [None]:
# Where the sales table has no year, take the release year from the merged IGN columns.
year_missing = sales_year_recovered["Year"].isna()
sales_year_recovered.loc[year_missing, 'Year'] = sales_year_recovered.loc[year_missing, 'release_year']

In [None]:
# Column names of the (unmerged) sales table, for reference.
sales.columns

Drop any remaining rows with NaN values in the Year or Publisher columns

In [None]:
# Discard rows that still lack a year or a publisher after the recovery step.
required_columns = ['Year', 'Publisher']
sales_year_recovered = sales_year_recovered.dropna(subset=required_columns)

Remove any duplicates generated by the merge

In [None]:
# Columns that identify one row of the original sales table. 'Year' is left out
# on purpose: it may have been filled from the IGN data, so two matches for the
# same game could differ only in that value.
SALES_ROW_KEY = ['Name', 'Platform', 'Genre', 'Publisher',
                 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

# Keep the same first 15 columns as before, but decide what counts as a
# duplicate from the sales columns only. Comparing all 15 columns also compared
# columns that came from extra_info (the sales table appears to contribute only
# the first 10), so rows repeated by the merge differed there and were not
# removed, inflating the per-year totals computed later.
sales_year_recovered = (
    sales_year_recovered[sales_year_recovered.columns[:15]]
    .drop_duplicates(subset=SALES_ROW_KEY, keep='first')
)

In [None]:
# Preview the merged, de-duplicated table.
sales_year_recovered.head()

In [None]:
# Dtypes and non-null counts after the year recovery and row drops.
sales_year_recovered.info()

## Feature Engineering

Transforming categorical variables Genre and Publisher to dummy values

In [None]:
# One-hot encode Genre and Publisher and place the indicator columns next to
# the remaining sales columns.
# axis=1 is required: without it pd.concat stacks the two frames on top of each
# other (twice the rows, mostly NaN) instead of joining them side by side.
# `columns=` replaces the positional axis argument of drop(), which pandas 2.0
# no longer accepts.
sales_dummies = pd.concat(
    [
        sales.drop(columns=['Genre', 'Publisher']),
        pd.get_dummies(sales[['Genre', 'Publisher']], dummy_na=False, drop_first=True),
    ],
    axis=1,
    sort=False,
)

Calculating the percentage of game sales in each region from the total sales

In [None]:
# Share of each game's global sales made in each region, in percent.
# Columns are named explicitly instead of sliced by position
# (sales.columns[5:9] / [9]), so the result no longer depends on column order.
REGION_SALES_COLS = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
global_sales = sales['Global_Sales'].astype(float)
for col in REGION_SALES_COLS:
    sales[col + '%'] = (sales[col].astype(float) / global_sales) * 100

In [None]:
# Preview the table with the new regional percentage columns.
sales.head()

Create the rank column

In [None]:
# 1-based rank taken from the row index. This equals a sales ranking only if the
# rows are still in the file's original order — presumably descending
# Global_Sales; TODO confirm against the input file.
sales['Rank'] = sales.index + 1

Calculating the percentage of game global sales from the total global sales

In [None]:
# Each game's share of the summed global sales, in percent, rounded to 4 decimals.
total_global_sales = sales['Global_Sales'].sum()
sales['Global_Sales%'] = (sales['Global_Sales'] / total_global_sales * 100).round(4)

Creating a new column for the cumulative sum of the games' global sales percentages

In [None]:
# Cumulative share (in percent) of global sales, accumulated from the
# best-selling game downwards.
# It is computed from the raw Global_Sales values rather than by summing the
# already-rounded 'Global_Sales%' column, so rounding error does not build up
# across thousands of rows. The explicit stable sort makes the result
# independent of the current row order; assignment aligns back on the index.
ranked_sales = sales['Global_Sales'].sort_values(ascending=False, kind='stable')
sales['Global_Sales - CumSum'] = ranked_sales.cumsum() / ranked_sales.sum() * 100

sales.head()

# Exploratory Analysis

## Summary Statistics

In [None]:
# Descriptive statistics for the regional shares (percent) and raw global sales.
summary_columns = ['NA_Sales%', 'EU_Sales%', 'JP_Sales%', 'Other_Sales%', 'Global_Sales']
sales[summary_columns].describe()

- North America has the highest average for games sales about 46% and std of 34% (median of 50%)
- Japan follows with average of about 24% and std of 40%
- average sales for the game is 0.537M USD and the median (0.17M) 
- 75% of the games had under 0.47M sales

In [None]:
# Total sales per genre for every region, largest global total first.
(
    sales.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
    .sum()
    .sort_values('Global_Sales', ascending=False)
)

- Here we see that Action games have the highest total sales compared with the other genres
- Sports follows it
- Strategy games had the lowest sales.

In [None]:
# Best-selling game worldwide: sort ascending and keep the last row.
sales.sort_values(by='Global_Sales').iloc[-1:]

In [None]:
# Best-selling game in North America: sort ascending and keep the last row.
sales.sort_values(by='NA_Sales').iloc[-1:]

In [None]:
# Best-selling game in Europe: sort ascending and keep the last row.
sales.sort_values(by='EU_Sales').iloc[-1:]

In [None]:
# Best-selling game in Japan: sort ascending and keep the last row.
sales.sort_values(by='JP_Sales').iloc[-1:]

In [None]:
# Best-selling game in the remaining regions: sort ascending and keep the last row.
sales.sort_values(by='Other_Sales').iloc[-1:]

In [None]:
# Global sales summed per year, returned as a two-column frame (Year, Global_Sales).
global_sales_by_year = sales_year_recovered.groupby('Year')['Global_Sales'].sum()
sales_year = global_sales_by_year.reset_index()

In [None]:
# Top 5 years by total global sales (ascending sort, so the best year is the last row).
sales_year.sort_values(by='Global_Sales').tail(5)

In [None]:
# Turn the numeric year into a timestamp for time-series plotting.
# A float year such as 2008.0 is rendered as '2008.0', which format='%Y' cannot
# parse, so numeric years are cast to int first. The dtype guard leaves an
# already-converted datetime column untouched when the cell is re-run.
year_values = sales_year['Year']
if year_values.dtype.kind in 'iuf':
    year_values = year_values.astype(int)
sales_year['Year'] = pd.to_datetime(year_values, format='%Y')

In [None]:
# Line chart of total global sales per year.
# The axes of the figure created here are passed to pandas explicitly: without
# `ax=`, DataFrame.plot opens a figure of its own, so the requested figsize was
# ignored and an empty extra figure was produced.
plt.figure(figsize=(15, 10))
sales_year.set_index('Year').plot(grid=True, ax=plt.gca(), title='Total global sales per year')
plt.show()

- 2008 had the highest sales
- the sales were trending up before 2008 and after it started to trend down "needs more analysis in this area"

In [None]:
# Number of game rows per year, as a two-column frame (Year, Total Games).
titles_per_year = sales_year_recovered.groupby('Year')['Name'].count()
games_year = titles_per_year.reset_index().rename(columns={'Name': 'Total Games'})

In [None]:
# Top 5 years by number of games (largest first).
games_year.sort_values(by='Total Games', ascending=False).head(5)

In [None]:
# Turn the numeric year into a timestamp for time-series plotting.
# A float year such as 2008.0 is rendered as '2008.0', which format='%Y' cannot
# parse, so numeric years are cast to int first. The dtype guard leaves an
# already-converted datetime column untouched when the cell is re-run.
year_values = games_year['Year']
if year_values.dtype.kind in 'iuf':
    year_values = year_values.astype(int)
games_year['Year'] = pd.to_datetime(year_values, format='%Y')

In [None]:
# Line chart of the number of games per year.
# The axes of the figure created here are passed to pandas explicitly: without
# `ax=`, DataFrame.plot opens a figure of its own, so the requested figsize was
# ignored and an empty extra figure was produced.
plt.figure(figsize=(15, 10))
games_year.set_index('Year').plot(grid=True, ax=plt.gca(), title='Number of games per year')
plt.show()

- The data shows that 2008 had the highest number of released games
- The number of games per year follows the same pattern as the yearly sales, which suggests the two might be correlated

In [None]:
# Combine both yearly series into one frame, aligned on the Year index.
games_by_year = games_year.set_index('Year')
sales_by_year = sales_year.set_index('Year')
sales_games_year = games_by_year.join(sales_by_year)

In [None]:
# One point per year: number of games against total global sales.
scatter_ax = sales_games_year.plot.scatter(x='Total Games', y='Global_Sales', s=40)
scatter_ax.grid()

In [None]:
# Pairwise correlation between the yearly game count and yearly global sales.
sales_games_year.corr()

- High correlation between the two time series (number of games, total sales)

In [None]:
# Both yearly series on one line chart (they share a single y-axis).
sales_games_year.plot.line(grid=True)

In [None]:
# Number of games per genre among the rows dated 2008, smallest first.
released_in_2008 = sales.loc[sales['Year'].eq(2008)]
released_in_2008.groupby('Genre')['Name'].count().sort_values()

- action, Misc and Sports were the highest Genre in this year

In [None]:
# Average sales per game for each genre, by region (Genre kept as a column).
sales_value_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
sales_genre_region = sales.groupby('Genre')[sales_value_columns].mean().reset_index()

In [None]:
# Genre with the highest average North America sales per game.
sales_genre_region.sort_values(by='NA_Sales', ascending=False).iloc[:1]

- Interestingly, the highest average sales per game in North America belong to the Platform genre, where the average game brought in more than half a million.
- This is interesting given that the highest total sales in North America were in the Action genre

In [None]:
# Genre with the most games that have non-zero North America sales.
(
    sales.replace({0: np.nan})          # zeros become NaN so count() skips them
    .groupby('Genre')[['NA_Sales']]
    .count()
    .sort_values('NA_Sales')
    .tail(1)
)

In [None]:
# Genre with the largest total North America sales.
(
    sales.replace({0: np.nan})
    .groupby('Genre')[['NA_Sales']]
    .sum()
    .sort_values('NA_Sales')
    .tail(1)
)

In [None]:
# Number of Platform-genre games with non-zero North America sales.
(
    sales.replace({0: np.nan})          # zeros become NaN so count() skips them
    .groupby('Genre')[['NA_Sales']]
    .count()
    .sort_values('NA_Sales')
    .loc['Platform']
)

In [None]:
# Total North America sales of the Platform genre.
(
    sales.replace({0: np.nan})
    .groupby('Genre')[['NA_Sales']]
    .sum()
    .sort_values('NA_Sales')
    .loc['Platform']
)

- Action has the highest number of games in North America, so when its total sales are divided by that number of games, the average per game shrinks

In [None]:
# Genre with the highest average Europe sales per game.
sales_genre_region.sort_values(by='EU_Sales', ascending=False).iloc[:1]

- In Europe, the Shooter genre leads, with an average of nearly half a million per game

In [None]:
# Genre with the most games that have non-zero Europe sales.
(
    sales.replace({0: np.nan})          # zeros become NaN so count() skips them
    .groupby('Genre')[['EU_Sales']]
    .count()
    .sort_values('EU_Sales')
    .tail(1)
)

In [None]:
# Genre with the largest total Europe sales.
(
    sales.replace({0: np.nan})
    .groupby('Genre')[['EU_Sales']]
    .sum()
    .sort_values('EU_Sales')
    .tail(1)
)

In [None]:
# Number of Shooter games with non-zero Europe sales.
(
    sales.replace({0: np.nan})          # zeros become NaN so count() skips them
    .groupby('Genre')[['EU_Sales']]
    .count()
    .sort_values('EU_Sales')
    .loc['Shooter']
)

In [None]:
# Total Europe sales of the Shooter genre.
(
    sales.replace({0: np.nan})
    .groupby('Genre')[['EU_Sales']]
    .sum()
    .sort_values('EU_Sales')
    .loc['Shooter']
)

In [None]:
# Genre with the highest average Japan sales per game.
sales_genre_region.sort_values(by='JP_Sales', ascending=False).iloc[:1]

- Japan sticks with the Role-Playing genre with the highest average per game

In [None]:
# Genre with the most games that have non-zero Japan sales.
(
    sales.replace({0: np.nan})          # zeros become NaN so count() skips them
    .groupby('Genre')[['JP_Sales']]
    .count()
    .sort_values('JP_Sales')
    .tail(1)
)

In [None]:
# Total Europe sales of the Action genre.
# NOTE(review): this cell sits among the Japan cells but queries EU_Sales and
# 'Action'; it may be a copy-paste leftover (JP_Sales intended?) — confirm.
(
    sales.replace({0: np.nan})
    .groupby('Genre')[['EU_Sales']]
    .sum()
    .sort_values('EU_Sales')
    .loc['Action']
)

In [None]:
# Number of Role-Playing games with non-zero Japan sales.
(
    sales.replace({0: np.nan})          # zeros become NaN so count() skips them
    .groupby('Genre')[['JP_Sales']]
    .count()
    .sort_values('JP_Sales')
    .loc['Role-Playing']
)

In [None]:
# Genre with the largest total Japan sales.
(
    sales.replace({0: np.nan})
    .groupby('Genre')[['JP_Sales']]
    .sum()
    .sort_values('JP_Sales')
    .tail(1)
)

In [None]:
# Genre with the highest average sales per game in the remaining regions.
sales_genre_region.sort_values(by='Other_Sales', ascending=False).iloc[:1]

- Here other had the shooter with near 80,000 for the game

In [None]:
# Rebuild sales_genre_region as per-genre TOTALS (it held per-genre means above).
sales_value_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
sales_genre_region = sales.groupby('Genre')[sales_value_columns].sum().reset_index()

In [None]:
# Index by genre and give the regional columns readable labels for plotting.
# 'Global_Sales' is not in the mapping and keeps its name.
genre_region_labels = {
    'NA_Sales': 'North America Sales',
    'EU_Sales': 'Europe Sales',
    'JP_Sales': 'Japan Sales',
    'Other_Sales': 'Other Sales',
}
sales_genre_region = sales_genre_region.set_index('Genre').rename(columns=genre_region_labels)

In [None]:
# Annotated heatmap of total sales: genres as rows, regions (plus global) as columns.
sns.heatmap(
    data=sales_genre_region,
    annot=True,
    fmt=".2f",
    cmap="OrRd",
)
plt.xticks(rotation=45)  # tilt the column labels so they do not overlap
plt.show()

In [None]:
# Genre with the largest total global sales.
sales_genre_region.sort_values(by='Global_Sales', ascending=False).iloc[:1]

In [None]:
# Total sales per platform for every region (Platform kept as a column).
sales_value_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
sales_platform_region = sales.groupby('Platform')[sales_value_columns].sum().reset_index()

In [None]:
# Platform with the largest total North America sales.
sales_platform_region.sort_values(by='NA_Sales', ascending=False).iloc[:1]

In [None]:
# Platform with the largest total Europe sales.
sales_platform_region.sort_values(by='EU_Sales', ascending=False).iloc[:1]

In [None]:
# Platform with the largest total Japan sales.
sales_platform_region.sort_values(by='JP_Sales', ascending=False).iloc[:1]

In [None]:
# Platform with the largest total sales in the remaining regions.
sales_platform_region.sort_values(by='Other_Sales', ascending=False).iloc[:1]

In [None]:
# Platform with the largest total global sales.
sales_platform_region.sort_values(by='Global_Sales', ascending=False).iloc[:1]

In [None]:
# Index by platform and give the regional columns readable labels for plotting.
# 'Global_Sales' is not in the mapping and keeps its name.
platform_region_labels = {
    'NA_Sales': 'North America Sales',
    'EU_Sales': 'Europe Sales',
    'JP_Sales': 'Japan Sales',
    'Other_Sales': 'Other Sales',
}
sales_platform_region = sales_platform_region.set_index('Platform').rename(columns=platform_region_labels)

In [None]:
# Annotated heatmap of total sales: platforms as rows, regions (plus global) as columns.
sns.heatmap(
    data=sales_platform_region,
    annot=True,
    fmt=".2f",
    cmap="OrRd",
)
plt.xticks(rotation=45)  # tilt the column labels so they do not overlap
plt.show()

- Xbox 360 North America highest
- PS3 in Europe
- DS in Japan
- PS2 in Other areas

In [None]:
# Reshape to long form: one row per (game, region) pair, keeping Name, Genre
# and Platform as identifiers. The regional columns are relabelled first so the
# labels end up as the values of the melted 'variable' column.
melt_region_labels = {
    'NA_Sales': 'North America Sales',
    'EU_Sales': 'Europe Sales',
    'JP_Sales': 'Japan Sales',
    'Other_Sales': 'Other Sales',
}
sales_melted = sales.rename(columns=melt_region_labels).melt(
    id_vars=['Name', 'Genre', 'Platform'],
    value_vars=list(melt_region_labels.values()),
)

In [None]:
# Give melt's default output columns descriptive names.
melted_column_names = {'variable': 'Region', 'value': 'Sales'}
sales_melted = sales_melted.rename(columns=melted_column_names)

In [None]:
# Distribution of per-game sales in each region, split by genre, on a log y-axis.
region_box_ax = sns.boxplot(data=sales_melted, x='Region', y='Sales', hue='Genre')
region_box_ax.set_yscale('log')
region_box_ax.grid()

In [None]:
# Histogram of North America sales; bin count is the square root of the row
# count, and the y-axis is logarithmic.
na_bin_count = int(np.sqrt(len(sales)))
sales['NA_Sales'].hist(bins=na_bin_count)
plt.yscale('log')

In [None]:
# Histogram of Europe sales; bin count is the square root of the row count,
# and the y-axis is logarithmic.
eu_bin_count = int(np.sqrt(len(sales)))
sales['EU_Sales'].hist(bins=eu_bin_count)
plt.yscale('log')

In [None]:
# Histogram of global sales; bin count is the square root of the row count,
# and the y-axis is logarithmic.
global_bin_count = int(np.sqrt(len(sales)))
sales['Global_Sales'].hist(bins=global_bin_count)
plt.yscale('log')

## Pareto Analysis

In [None]:
# Split the games at the 80% mark of cumulative global sales.
# Both halves are copied so later changes do not touch `sales`.
cumulative_share = sales['Global_Sales - CumSum']
sales_pareto_80 = sales.loc[cumulative_share <= 80].copy()
sales_pareto_20 = sales.loc[cumulative_share > 80].copy()

In [None]:
# 80% of the summed global sales, for comparison with the top group's total below.
total_global_sales = sales['Global_Sales'].sum()
total_global_sales * 0.8

In [None]:
# Global sales actually covered by the games at or below the 80% cumulative mark.
sales_pareto_80['Global_Sales'].sum()

In [None]:
# Percentage of all games that fall in the top (80%-of-sales) group.
top_group_fraction = len(sales_pareto_80) / len(sales)
round(top_group_fraction * 100)

In [None]:
# Bar chart comparing the sales of the top group with the sales of the rest.
# The tick and bar labels are derived from the two groups instead of being
# hard-coded (25% / 75%, 80% / 20%), and refer to games — each row of the
# table is a game, not a customer.
top_sales = sales_pareto_80['Global_Sales'].sum()
rest_sales = sales_pareto_20['Global_Sales'].sum()
all_sales = top_sales + rest_sales
all_games = len(sales_pareto_80) + len(sales_pareto_20)
top_games_pct = round(len(sales_pareto_80) / all_games * 100)
rest_games_pct = 100 - top_games_pct

plt.bar([1, 2], [top_sales, rest_sales])
plt.xticks([1, 2], [f'Caused by {top_games_pct}% of Games',
                    f'Caused by {rest_games_pct}% of Games'])
plt.grid()
plt.gca().set_frame_on(False)
# Place each label slightly above its bar (offset of 50 in data units, as before).
plt.gca().text(1 - 0.1, top_sales + 50, f'{top_sales / all_sales:.0%} Sales')
plt.gca().text(2 - 0.1, rest_sales + 50, f'{rest_sales / all_sales:.0%} Sales')

plt.show()

In [None]:
# Total number of rows (games) in the sales table.
len(sales)

- Out of a total of 16598 games, only about a quarter were responsible for 80% of total sales

## 80% Sales 

In [None]:
# Five genres with the largest global sales within the top group.
(
    sales_pareto_80.groupby('Genre')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Five platforms with the largest global sales within the top group.
(
    sales_pareto_80.groupby('Platform')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Five years with the largest global sales within the top group.
(
    sales_pareto_80.groupby('Year')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)