# Performing some analytics on worldwide video game sales data.
* Evaluate sales distributions
* Evaluate categorical variables
* Global sales by platform and by genre
* Global sales over all years
* Sales development over time (global and per region)
* Top 10 bestsellers by genre and by region
* Bestsellers by year
* Bestsellers by platform

# PLAYER ONE - START

In [None]:
# packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the sales table and take a first glance at the first ten rows (Top 10 Games)
DATA_PATH = '../input/worldwide-video-game-sales/vgsalesGlobale.csv'
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
# dimensions of the loaded table, shown as a (rows, columns) tuple
df.shape

In [None]:
# There are quite a few rows without a release year: report how many there are.
print('Year missing: ', int(df.Year.isna().sum()))
# Remove those rows and convert the year to int.
# dropna() and astype() each return a new frame, so the conversion never writes
# into a filtered slice of the original table (the assign-after-filter form
# triggers pandas' SettingWithCopyWarning).
df = df.dropna(subset=['Year']).astype({'Year': int})

In [None]:
# Bar chart of the first ten rows of the table (the global top 10)
top10_global = df.iloc[:10]
plt.figure(figsize=(10, 6))
sns.barplot(x=top10_global.Name, y=top10_global.Global_Sales)
plt.xticks(rotation=90)
plt.grid()
plt.title('Top 10 (global sales)')
plt.show()

In [None]:
# Visual check that the table is consistently sorted by global sales:
# plot log10 of the sales against the row index.
log_global_sales = np.log10(df.Global_Sales)
plt.plot(log_global_sales)
plt.grid()
plt.title('Global Sales')
plt.show()

In [None]:
# Data from 2017 onwards looks incomplete, so keep only the rows before 2017.
is_complete_year = df.Year < 2017
df = df[is_complete_year]

In [None]:
# Interactive 3D scatter of the top games: one axis per region,
# colour ~ Year, marker size ~ Global_Sales, log scale on all three axes.
TOP_N = 100  # select only top N for plot
df4plot = df.iloc[:TOP_N]

axis_columns = dict(x='NA_Sales', y='EU_Sales', z='JP_Sales')
log_axes = dict(log_x=True, log_y=True, log_z=True)
fig = px.scatter_3d(
    df4plot,
    color='Year',
    size='Global_Sales',
    size_max=30,
    hover_data=['Name', 'Platform', 'Year'],
    opacity=0.5,
    **axis_columns,
    **log_axes,
)
fig.update_layout(title='Top Games')
fig.show()

# Evaluate sales distributions

In [None]:
# Histogram of global sales with 100 bins
plt.hist(df.Global_Sales, bins=100)
plt.grid()
plt.title('Global Sales')
plt.show()

In [None]:
# Same histogram on a log10 scale, using 20 bins
log_sales = np.log10(df.Global_Sales)
plt.hist(log_sales, bins=20)
plt.grid()
plt.title('Global Sales - log10')
plt.show()

In [None]:
# Pairwise scatter plots of the regional sales columns, with regression lines
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
df_sales = df[region_columns].copy()

regression_style = {
    'line_kws': {'color': 'magenta'},
    'scatter_kws': {'alpha': 0.25},
}
sns.pairplot(df_sales, kind='reg', plot_kws=regression_style)
plt.show()

In [None]:
# Pair plot of sales by region after a log10 transform.
# A small offset (0.001) is added first to avoid taking log(0).
LOG_OFFSET = 0.001
log_columns = []
for region in ('NA', 'EU', 'JP', 'Other'):
    log_column = region + '_log'
    df_sales[log_column] = np.log10(LOG_OFFSET + df_sales[region + '_Sales'])
    log_columns.append(log_column)

sns.pairplot(df_sales[log_columns], kind='scatter', plot_kws={'alpha': 0.25})
plt.show()

# Evaluate categorical variables (frequency)

In [None]:
# aux function for evaluation of categorical variables
def categorical_eval(df, feature, add_text):
    """Print and plot how often each value of a column occurs.

    df       -- table holding the column to inspect
    feature  -- name of the column whose values are counted
    add_text -- suffix appended to the column name to form the plot title
    """
    counts = df[feature].value_counts()
    print(counts)

    # One bar per distinct value, labelled with the value itself.
    plt.figure(figsize=(10, 6))
    sns.barplot(x=counts.index, y=counts.values)
    plt.xticks(rotation=90)
    plt.grid()
    plt.title(feature + add_text)
    plt.show()

In [None]:
# value frequencies of the Platform column
categorical_eval(df, feature='Platform', add_text=' - Frequencies')

In [None]:
# value frequencies of the Genre column
categorical_eval(df, feature='Genre', add_text=' - Frequencies')

In [None]:
# value frequencies of the Year column
categorical_eval(df, feature='Year', add_text=' - Frequencies')

# Global sales by Platform

In [None]:
# total global sales per platform
sales_by_platform = df.groupby('Platform')['Global_Sales'].sum()

In [None]:
# Bar chart: one bar per platform, height = summed global sales
platform_names = sales_by_platform.index
platform_totals = sales_by_platform.values
plt.figure(figsize=(10, 6))
sns.barplot(x=platform_names, y=platform_totals)
plt.xticks(rotation=90)
plt.grid()
plt.title('Global Sales by Platform')
plt.show()

# Global Sales by Genre

In [None]:
# total global sales per genre
sales_by_genre = df.groupby('Genre')['Global_Sales'].sum()

In [None]:
# Bar chart: one bar per genre, height = summed global sales
genre_names = sales_by_genre.index
genre_totals = sales_by_genre.values
plt.figure(figsize=(10, 6))
sns.barplot(x=genre_names, y=genre_totals)
plt.xticks(rotation=90)
plt.grid()
plt.title('Global Sales by Genre')
plt.show()

# Global sales over all years

In [None]:
# Total sales per region over all years; the global figure is taken as the
# sum of the four regional totals.
NA_total = df['NA_Sales'].sum()
EU_total = df['EU_Sales'].sum()
JP_total = df['JP_Sales'].sum()
Other_total = df['Other_Sales'].sum()
Global_total = NA_total + EU_total + JP_total + Other_total

# Share of each region in percent of the global total.
NA_share = round(100*NA_total/Global_total, 2)
EU_share = round(100*EU_total/Global_total, 2)
JP_share = round(100*JP_total/Global_total, 2)
Other_share = round(100*Other_total/Global_total, 2)

print("Total sales NA    :", round(NA_total, 2), ' ~ ', NA_share, '%')
print("Total sales EU    :", round(EU_total, 2), ' ~ ', EU_share, '%')
print("Total sales JP    :", round(JP_total, 2), ' ~ ', JP_share, '%')
print("Total sales Other : ", round(Other_total, 2), ' ~  ', Other_share, '%')
print("Total sales Global:", round(Global_total, 2))

In [None]:
# Pie chart of the four regional totals with percentage labels
region_totals = [NA_total, EU_total, JP_total, Other_total]
region_labels = ['North America', 'Europe', 'Japan', 'Other']
plt.figure(figsize=(10, 6))
plt.pie(region_totals, labels=region_labels, autopct='%1.2f%%', shadow=True)
plt.title('Total Sales by region')
plt.show()

# Global sales development

In [None]:
# total global sales per release year
global_sales = df.groupby('Year')['Global_Sales'].sum()

In [None]:
# Bar chart: one bar per year, height = summed global sales
years = global_sales.index
yearly_totals = global_sales.values
plt.figure(figsize=(10, 6))
sns.barplot(x=years, y=yearly_totals)
plt.xticks(rotation=90)
plt.grid()
plt.title('Global Sales by Year')
plt.show()

### Split by region

In [None]:
# Sum each region's sales per year and give the columns readable region names.
region_names = {
    'NA_Sales': 'NorthAmerica',
    'EU_Sales': 'Europe',
    'JP_Sales': 'Japan',
    'Other_Sales': 'Other',
}
df_sales_split_by_year = (
    df.groupby('Year')[list(region_names)]
    .sum()
    .rename(columns=region_names)
    .reset_index()
)

In [None]:
# display the per-year sales table (one column per region)
df_sales_split_by_year

In [None]:
# Line plot of the sales development, one line per region.
# Reshape to long format first: one row per (Year, Region) with its Sales value.
df_sales_4plot = df_sales_split_by_year.melt(
    id_vars='Year', var_name='Region', value_name='Sales')

plt.figure(figsize=(10, 6))
sns.lineplot(data=df_sales_4plot, x='Year', y='Sales', hue='Region')
plt.grid()
plt.xticks(rotation=90)
plt.title('Sales development by region')
plt.show()

In [None]:
# Alternative visualization: stacked bars, one segment per region.
# `bot` tracks the running height so each region sits on top of the previous ones.
by_year = df_sales_split_by_year
years = by_year.Year

plt.figure(figsize=(10, 6))
p1 = plt.bar(years, by_year.NorthAmerica)
bot = by_year.NorthAmerica
p2 = plt.bar(years, by_year.Europe, bottom=bot)
bot = bot + by_year.Europe
p3 = plt.bar(years, by_year.Japan, bottom=bot)
bot = bot + by_year.Japan
p4 = plt.bar(years, by_year.Other, bottom=bot)

# One legend entry per region, using the first bar of each series as handle.
legend_handles = tuple(series[0] for series in (p1, p2, p3, p4))
legend_labels = ('North America', 'Europe', 'Japan', 'Other')
plt.legend(legend_handles, legend_labels)
plt.title('Sales development by region')
plt.grid()
plt.show()

# Top 10 Bestsellers by Genre

In [None]:
# distinct genres, most frequent first
genre_counts = df['Genre'].value_counts()
genre_list = genre_counts.index.tolist()
genre_list

In [None]:
# Show and plot the ten best-selling games of every genre.
for genre in genre_list:
    # Assumes df is ordered by descending Global_Sales, so the first ten rows
    # of a genre are its bestsellers. copy() detaches the selection from df so
    # the helper column below can be added without a SettingWithCopyWarning.
    top = df[df.Genre == genre].head(10).copy()
    display(top)

    top['Name_Unique'] = top.Name + ' | ' + top.Platform  # we need a unique name for plot!
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Name_Unique', y='Global_Sales', data=top)
    plt.title(genre)
    plt.grid()
    plt.xticks(rotation=90)
    plt.show()

# Top 10 Bestsellers by Region

### North America

In [None]:
# Rank all games by North American sales, best first.
df_NA = df.sort_values(by='NA_Sales', ascending=False).reset_index(drop=True)
df_NA.index += 1  # start index with 1 => index = (local) rank
df_NA.head(10)

In [None]:
# Bar chart of the North American top 10.
# Bars are labelled "Name | Platform": a title released on several platforms
# has one row per platform, and seaborn would otherwise merge rows with the
# same name into a single averaged bar.
top10_na = df_NA.head(10).assign(
    Name_Unique=lambda rows: rows.Name + ' | ' + rows.Platform)
plt.figure(figsize=(10, 6))
sns.barplot(x='Name_Unique', y='NA_Sales', data=top10_na)
plt.title('North America - Top 10')
plt.grid()
plt.xticks(rotation=90)
plt.show()

### Europe

In [None]:
# Rank all games by European sales, best first.
df_Europe = df.sort_values(by='EU_Sales', ascending=False).reset_index(drop=True)
df_Europe.index += 1  # start index with 1 => index = (local) rank
df_Europe.head(10)

In [None]:
# Bar chart of the European top 10.
# Bars are labelled "Name | Platform": a title released on several platforms
# has one row per platform, and seaborn would otherwise merge rows with the
# same name into a single averaged bar.
top10_europe = df_Europe.head(10).assign(
    Name_Unique=lambda rows: rows.Name + ' | ' + rows.Platform)
plt.figure(figsize=(10, 6))
sns.barplot(x='Name_Unique', y='EU_Sales', data=top10_europe)
plt.title('Europe - Top 10')
plt.grid()
plt.xticks(rotation=90)
plt.show()

### Japan

In [None]:
# Rank all games by Japanese sales, best first.
df_Japan = df.sort_values(by='JP_Sales', ascending=False).reset_index(drop=True)
df_Japan.index += 1  # start index with 1 => index = (local) rank
df_Japan.head(10)

In [None]:
# Bar chart of the Japanese top 10.
# Bars are labelled "Name | Platform": a title released on several platforms
# has one row per platform, and seaborn would otherwise merge rows with the
# same name into a single averaged bar.
top10_japan = df_Japan.head(10).assign(
    Name_Unique=lambda rows: rows.Name + ' | ' + rows.Platform)
plt.figure(figsize=(10, 6))
sns.barplot(x='Name_Unique', y='JP_Sales', data=top10_japan)
plt.title('Japan - Top 10')
plt.grid()
plt.xticks(rotation=90)
plt.show()

* Wow, "Monster Hunter Freedom 3" jumps from rank 215 worldwide into the Top 10 in Japan!
* Furthermore Role-Playing is extremely popular in Japan (6 out of Top 10). 
* Surprisingly, we do not see the global leader "Wii Sports" in the Japan Top 10!

In [None]:
# Let's check: look up the "Wii Sports" row(s) in the Japan ranking
is_wii_sports = df_Japan.Name == 'Wii Sports'
df_Japan[is_wii_sports]

#### "Wii Sports" is only in 26th place in Japan!

### Other

In [None]:
# Rank all games by sales in the "Other" region, best first.
df_Other = df.sort_values(by='Other_Sales', ascending=False).reset_index(drop=True)
# Same convention as the other regional rankings in this notebook:
# start index with 1 => index = (local) rank
df_Other.index = df_Other.index + 1
df_Other.head(10)

In [None]:
# Bar chart of the top 10 in the "Other" region.
# Bars are labelled "Name | Platform": a title released on several platforms
# has one row per platform, and seaborn would otherwise merge rows with the
# same name into a single averaged bar.
top10_other = df_Other.head(10).assign(
    Name_Unique=lambda rows: rows.Name + ' | ' + rows.Platform)
plt.figure(figsize=(10, 6))
sns.barplot(x='Name_Unique', y='Other_Sales', data=top10_other)
plt.title('Other - Top 10')
plt.grid()
plt.xticks(rotation=90)
plt.show()

#### Yet another surprise: "Pro Evolution Soccer 2008" jumps from rank 349 worldwide into the Top 10 in "Other"!

# Bestsellers by Year

In [None]:
# Show the bestseller of every year.
# The years are taken from the data itself rather than a fixed 1980-2016
# range, so the loop stays correct if the year filter changes and never
# displays an empty frame for a year without entries.
# Assumes df is ordered by descending Global_Sales, so the first row of a
# year is its bestseller.
for year in sorted(df.Year.unique()):
    top = df[df.Year == year]
    display(top.head(1))

# Bestsellers by Platform

In [None]:
# distinct platforms, most frequent first
platform_counts = df['Platform'].value_counts()
platform_list = platform_counts.index.tolist()

In [None]:
# Show the first row of every platform (its bestseller, given that the table
# is ordered by global sales).
for pf in platform_list:
    on_platform = df.Platform == pf
    top = df[on_platform]
    display(top.iloc[:1])