In [None]:
!pip install cpi

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import cpi
cpi.update()


In [None]:
# Apply the 'fivethirtyeight' matplotlib style to the charts drawn below.
plt.style.use('fivethirtyeight')
# Load the video game sales CSV into the notebook-wide DataFrame `df`.
df = pd.read_csv('../input/videogamesales/vgsales.csv')
# (rows, columns) of the raw data, before any cleaning.
df.shape

In [None]:
# The dataset is only supposed to contain years before 2016, so collect the
# index labels of every row with a later year ...
drop_row_invalid_year = df.index[(df['Year'] > 2015).to_numpy()]
# ... discard those rows, then discard any row that still has a missing value
# so incomplete records cannot distort the analysis.
df = df.drop(index=drop_row_invalid_year).dropna()

In [None]:
# (rows, columns) again, to compare against the shape shown right after loading.
df.shape

In [None]:
# Preview the first rows of the DataFrame.
df.head()

For my analysis I use the `cpi` Python module, which adjusts values for inflation based on the Consumer Price Index (CPI). The CPI can be defined as follows:
"The Consumer Price Index (CPI) is a measure that examines the weighted average of prices of a basket of consumer goods and services, such as transportation, food, and medical care. It is calculated by taking price changes for each item in the predetermined basket of goods and averaging them" (Investopedia).
Using the CPI will allow me to have a clearer picture of the financial successes of the games.

In [None]:
# Each row's Year is handed to cpi.inflate later on; the CPI module needs
# the years as int for that to work, so cast the column here.
df['Year'] = df['Year'].astype(int)

# Credit: Mike Erb, https://github.com/merb92/movie-industry-eda
def inflate_column(data, column):
    """
    Return the values of one column of a dataframe adjusted for inflation.

    Every row is processed on its own: the row's value in `column` and the
    row's `Year` are passed to `cpi.inflate`, and the results are collected
    into a series that shares the index of `data`.

    data   -- dataframe that has a `Year` column and the column to adjust
    column -- name of the column holding the values to adjust
    """
    def _inflate_row(row):
        # The row's own Year is the year the value is inflated from.
        return cpi.inflate(row[column], row.Year)

    return data.apply(_inflate_row, axis=1)


# CPI module won't update on Kaggle, so it can only convert data to 2018 values.
# Store the inflation-adjusted global sales rounded to two decimals.
df['Global_Sales_CPI'] = inflate_column(df, 'Global_Sales').round(2)

In [None]:
# Preview the first rows again, now including the Global_Sales_CPI column.
df.head()

In [None]:
# category argument corresponds to the qualitative columns of the data frame.
# sales argument corresponds to the quantitative columns except for rank and year.
# bar_color accepts matplotlib colors
# vals_displayed argument takes in a integer to determine how many games should
# be graphed.

def _ranked_barh(labels, values, bar_color, chart_title, label_inset_divisor):
    """
    Draw a 9x7 horizontal bar chart of ranked values with the first entry on top.

    labels              -- tick labels, best-ranked first
    values              -- bar lengths, in the same order as `labels`
    bar_color           -- matplotlib color for the bars (None uses the
                           style's default color)
    chart_title         -- title placed above the chart
    label_inset_divisor -- each bar's value is written at
                           (right x-axis limit / label_inset_divisor)
    """
    # barh draws its first entry at the bottom, so flip the ranking to get the
    # best-ranked bar on top. Labels and values are reversed together so each
    # label always stays paired with its own value.
    labels = list(labels)[::-1]
    values = list(values)[::-1]
    # Plot against numeric positions and name the ticks afterwards. Handing
    # the labels to barh directly makes matplotlib treat them as categories,
    # which draws rows that share a label (e.g. one game on two platforms)
    # on top of each other.
    positions = list(range(len(values)))

    fig, ax = plt.subplots(figsize=(9, 7))
    bars = ax.barh(positions, values, .5, color=bar_color)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)

    # Write every value at the same x position, a small fraction of the axis
    # in from the left, vertically centred on its bar.
    _, x_axis_right = ax.get_xlim()
    label_x = x_axis_right / label_inset_divisor
    for bar in bars:
        ax.text(label_x, bar.get_y() + bar.get_height() / 2, bar.get_width(),
                ha='left', va='center')
    ax.set_title(chart_title)


def total_sales_chart(category: str, sales: str, bar_color: str,
                      vals_displayed: int, chart_title: str):
    """
    Chart the categories with the highest summed sales.

    Groups the notebook-level DataFrame `df` by `category`, sums `sales` per
    group and plots the largest sums, rounded to two decimals.

    category       -- column of `df` to group by
    sales          -- numeric column of `df` to sum
    bar_color      -- matplotlib color, or None for the style default
    vals_displayed -- number of bars to draw; None draws every category
    chart_title    -- title of the chart
    """
    totals = df.groupby(by=[category])[sales].sum().sort_values(ascending=False)
    shown = totals.iloc[:vals_displayed].round(2)
    _ranked_barh(shown.index, shown, bar_color, chart_title, 70)


def mean_sales_chart(category: str, sales: str, bar_color: str,
                     values_displayed: int, chart_title: str):
    """
    Chart the categories with the highest mean sales per row.

    Groups the notebook-level DataFrame `df` by `category`, averages `sales`
    per group and plots the largest means, rounded to two decimals.

    category         -- column of `df` to group by
    sales            -- numeric column of `df` to average
    bar_color        -- matplotlib color, or None for the style default
    values_displayed -- number of bars to draw; None draws every category
    chart_title      -- title of the chart
    """
    means = df.groupby(by=[category])[sales].mean().sort_values(ascending=False)
    shown = means.iloc[:values_displayed].round(2)
    _ranked_barh(shown.index, shown, bar_color, chart_title, 55)


def games_per_category_chart(category: str, bar_color: str,
                             values_displayed: int, chart_title: str):
    """
    Chart how many rows of `df` fall into each value of a category column.

    category         -- column of the notebook-level DataFrame `df` to count
    bar_color        -- matplotlib color, or None for the style default
    values_displayed -- number of bars to draw; None draws every category
    chart_title      -- title of the chart
    """
    # value_counts already returns the counts in descending order.
    counts = df[category].value_counts()
    shown = counts.iloc[:values_displayed]
    _ranked_barh(shown.index, shown, bar_color, chart_title, 55)


def game_sales_single_platform_chart(sales: str, bar_color: str,
                                     values_displayed: int, chart_title: str):
    """
    Chart the individual rows of `df` with the highest sales.

    Each bar is one row of the notebook-level DataFrame `df`, labelled with
    that row's `Name`. Rows that share a name each get their own bar.

    sales            -- numeric column of `df` to rank by
    bar_color        -- matplotlib color, or None for the style default
    values_displayed -- number of rows to draw
    chart_title      -- title of the chart
    """
    top_rows = df.nlargest(values_displayed, sales)
    _ranked_barh(top_rows['Name'], top_rows[sales], bar_color, chart_title, 95)

In [None]:
# Top ten rows by raw global sales (title typo "Millons" corrected).
game_sales_single_platform_chart(
    sales='Global_Sales',
    bar_color='royalblue',
    values_displayed=10,
    chart_title='Top 10 Best Selling Video Games\n (Millions, Single Platform Sales)',
)

# Same ranking using the inflation-adjusted column; None keeps the style's default color.
game_sales_single_platform_chart(
    sales='Global_Sales_CPI',
    bar_color=None,
    values_displayed=10,
    chart_title='Top 10 Best Selling Video Games Adjusted for Inflation\n (Millions, Single Platform Sales)',
)

When adjusting for inflation, one can see that Duck Hunt, among other games, was actually more successful for its time than one would initially assume from the first graph alone.

In [None]:
# Sales summed per game name, i.e. across every platform the game was released on.
total_sales_chart(
    category='Name',
    sales='Global_Sales',
    bar_color='royalblue',
    vals_displayed=10,
    chart_title='Top 10 Best Selling Video Games Globally\n (Millions, Platform Sales Combined)',
)

# Same ranking using the inflation-adjusted column.
total_sales_chart(
    category='Name',
    sales='Global_Sales_CPI',
    bar_color=None,
    vals_displayed=10,
    chart_title='Top 10 Best Selling Video Games Globally Adjusted for Inflation\n (Millions, Platform Sales Combined)',
)

As can be seen above, when the sales of a game across platforms are combined, different games appear in the top ten. This gives a more accurate representation of the financial success of the games, since it accounts for the fact that the sales of some games are spread across different platforms.
However, on closer inspection, three of the games in the top ten owe part of their sales to rereleases on newer platforms, which one could consider unfair when comparing them against games that had only a single release.
The three games in question are shown below.

In [None]:
# Match on the Name column only, so a hit in some other column cannot pull in unrelated rows.
df[df['Name'] == "Grand Theft Auto V"]

In [None]:
# Match on the Name column only, so a hit in some other column cannot pull in unrelated rows.
df[df['Name'] == "Super Mario Bros."]

In [None]:
# Match on the Name column only, so a hit in some other column cannot pull in unrelated rows.
df[df['Name'] == "Tetris"]

In [None]:
# Ten publishers with the most rows (games) in the data.
games_per_category_chart(
    category='Publisher',
    bar_color='tomato',
    values_displayed=10,
    chart_title='Number of Games per Publisher (Top Ten)',
)

In [None]:
# Publishers ranked by summed inflation-adjusted sales ...
total_sales_chart(
    category='Publisher',
    sales='Global_Sales_CPI',
    bar_color=None,
    vals_displayed=10,
    chart_title='Total Global Sales of Publishers Adjusted for Inflation\n (Millions, Top Ten)',
)
# ... and by mean inflation-adjusted sales per game.
mean_sales_chart(
    category='Publisher',
    sales='Global_Sales_CPI',
    bar_color='c',
    values_displayed=10,
    chart_title='Mean Global Sales per Game of Publishers Adjusted for Inflation\n (Millions, Top Ten)',
)

As shown above, when one looks at the mean global sales per game, some lesser-known publishers appear. This is because the publishers in question released only a few, yet financially successful, games. A couple of examples are shown below.

In [None]:
# Match on the Publisher column only, so a hit in some other column cannot pull in unrelated rows.
df[df['Publisher'] == "Palcom"]

In [None]:
# Match on the Publisher column only, so a hit in some other column cannot pull in unrelated rows.
df[df['Publisher'] == "Red Orb"]

In [None]:
# values_displayed=None leaves the list unsliced, so every genre gets a bar.
games_per_category_chart(
    category='Genre',
    bar_color='tomato',
    values_displayed=None,
    chart_title='Number of Games per Genre',
)

In [None]:
# Every genre (no top-N cut) ranked by summed inflation-adjusted sales ...
total_sales_chart(
    category='Genre',
    sales='Global_Sales_CPI',
    bar_color=None,
    vals_displayed=None,
    chart_title='Total Global Sales of Genres Adjusted for Inflation (Millions)',
)
# ... and by mean inflation-adjusted sales per game.
mean_sales_chart(
    category='Genre',
    sales='Global_Sales_CPI',
    bar_color='c',
    values_displayed=None,
    chart_title='Mean Global Sales of Genres Adjusted for Inflation (Millions)',
)

From the charts above, we can see that although platform games were not the most numerous in terms of games made, nor the most successful in terms of total sales, they had the most financial success on a per-game basis.

In [None]:
# Ten platforms with the most rows (games) in the data.
games_per_category_chart(
    category='Platform',
    bar_color='tomato',
    values_displayed=10,
    chart_title='Number of Games per Platform (Top Ten)',
)

In [None]:
# Platforms ranked by summed inflation-adjusted sales ...
total_sales_chart(
    category='Platform',
    sales='Global_Sales_CPI',
    bar_color=None,
    vals_displayed=10,
    chart_title='Total Global Sales of Platforms Adjusted for Inflation (Millions, Top Ten)',
)
# ... and by mean inflation-adjusted sales per game.
mean_sales_chart(
    category='Platform',
    sales='Global_Sales_CPI',
    bar_color='c',
    values_displayed=10,
    chart_title='Mean Global Sales of Platforms Adjusted for Inflation (Millions, Top Ten)',
)

As with the platform genre, the Nintendo Entertainment System (NES) did not have the most total financial success, but on a per-game basis it excelled.

In [None]:
# Rank the platforms by summed inflation-adjusted sales and keep the ten largest.
sales_sum = df.groupby('Platform')['Global_Sales_CPI'].sum()
top_ten_platforms = sales_sum.sort_values(ascending=False).index[:10].tolist()

# Remove every row whose platform is outside that top ten.
outside_top_ten = ~df['Platform'].isin(top_ten_platforms)
drop_platform_row = df.loc[outside_top_ten].index
top_ten_plat_df = df.drop(index=drop_platform_row)

# Count games per (platform, genre), then divide each row by its own total so
# a platform's genres are expressed as fractions that sum to 1.
genre_counts = pd.crosstab(top_ten_plat_df['Platform'],
                           top_ten_plat_df['Genre'])
cross_tab_plat = genre_counts.div(genre_counts.sum(axis=1), axis='index')

# Distribution of Genres of the Top Ten Platforms as a Percentage:

In [None]:
# Shade the cells with the 'Blues' colormap and display each fraction as a
# percentage with one decimal place.
cross_tab_plat.style.background_gradient(cmap='Blues').format("{:.1%}")

In [None]:
# Regional sales columns, one per market.
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
market_sales = df[['Genre'] + region_columns]
# Total sales of every genre within each market.
market_sales_genre = market_sales.groupby('Genre').sum()
# Divide each market's column by that market's total, so every column
# holds the genres' shares of the market and sums to 100.
market_totals = market_sales_genre.sum()
market_sales_genre_percent = market_sales_genre.div(market_totals, axis='columns') * 100

# Distribution of Sales Across Genres Between Markets as a Percentage:

In [None]:
# Grouped bars: one group per genre, one bar per market.
ax1 = market_sales_genre_percent.plot.bar(figsize=(17, 8), fontsize=13, width=.8)
# Print each bar's percentage (no decimals) at the bar's horizontal centre,
# halfway up the bar.
for bar in ax1.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_y() + bar_height / 2
    ax1.text(label_x, label_y, "{:.0f}%".format(bar_height),
             fontsize=8, ha='center', va='bottom')
ax1.set_ylabel('Percent')

plt.show()

From the chart above, one can see that most genres had a similar share of sales in every market. The exception is Japan, which showed a high interest in role-playing, adventure, puzzle, and strategy games and a low interest in action, shooter, racing, and sports games compared to the other markets.