In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **1. Import packages and data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

In [None]:
# Read the video game sales CSV into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer='../input/videogamesales/vgsales.csv')
df.head(n=5)

# **2. Info of dataset**

In [None]:
# Print column names, dtypes and non-null counts to spot columns with missing values
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

We note that some rows have no year or no publisher. We will drop these rows for this analysis, as this workbook depends a lot on visualizing the data by **Year** and by **Publisher**.

In [None]:
# Drop only the rows whose Year or Publisher is missing (a NaN in any other column
# should not remove a row), then cast Year to int -- it is read as float because of
# the NaNs. Built as a single chain instead of mutating df in place.
df = (
    df.dropna(subset=['Year', 'Publisher'])
      .assign(Year=lambda frame: frame['Year'].astype(int))
)
df.shape

# **3. Years in which Games are released**

First, we get the count of number of games released for each year in the dataset.

We first get the data for the top 10 years. We observe that the highest number of games (according to this dataset) released is from 2007 to 2011.

In [None]:
# The ten years with the most titles, largest first
(
    df.groupby('Year')['Name']
      .count()
      .sort_values(ascending=False)
      .iloc[:10]
)

Bottom 10 years. We note that 2020 and 2017 are among the lowest, which is likely due to incomplete data, since this dataset was last updated in 2017. Hence, we will remove the data for 2017 and 2020, as it is unlikely to produce any valuable insights into the gaming industry for those years.

In [None]:
# The ten years with the fewest titles, smallest first
(
    df.groupby('Year')['Name']
      .count()
      .sort_values(ascending=True)
      .iloc[:10]
)

In [None]:
# Exclude release years 2017 and 2020: these games are too new to have any useful findings
df = df.loc[~df['Year'].isin([2017, 2020])]

In [None]:
# Count of titles per release year.
# The column is passed by keyword (x=..., data=...): recent seaborn releases treat the
# first positional argument as `data`, so the positional form no longer plots Year on x.
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='Year', data=df, color='#7FB3D5')
plt.title('No. of Games Released by Year')
plt.xticks(rotation=90)
plt.ylabel('No. of Games Released')
plt.show()

We observe that the number of games released started picking up rapidly from 1996 onwards and increased until its peak in 2009 to 2011, before it started declining.

Note that the reason for the decline may be incomplete information on the games released from 2012 onwards, and may not necessarily indicate that the size of the gaming industry is dropping.

Plotting Global Sales against Year also depicts a similar trend.

In [None]:
# Total Global_Sales per release year, drawn as one bar per year
global_sales_by_year = df.groupby('Year')['Global_Sales'].sum()
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=global_sales_by_year.index, y=global_sales_by_year.values, color='#EC7063')
# Title added so the figure stands alone, like the release-count chart
plt.title('Global Sales by Year')
plt.xticks(rotation=90)
# Same unit label as the other sales charts in this notebook ("(m)", not "($m)")
plt.ylabel('Global Sales (m)')
plt.show()

# **4. Based on Top 100**
First, we explore the top ranked 100 games by publisher. Nintendo has the majority with 52 out of 100 games in the top 100! Even the next highest company, Activision only has 14 games in the top 100.

In [None]:
# First 100 rows of df are used as the "top 100" (assumes df is still in Rank order -- TODO confirm)
top_100 = df.iloc[:100]
# How many of those titles each publisher has, most first
top_100.groupby('Publisher')['Name'].agg('count').sort_values(ascending=False)

Among the top 100 games, Nintendo already generates 920m! Activision, with only 14 games and coming in second place, only has 165.88m

In [None]:
# Regional and global sales of the top 100, summed per publisher and ranked by global sales
(
    top_100
    .groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
)

In [None]:
sns.set_palette('muted')
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

Sales = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']


def plot_sales(Sales_sorted, ax, title):
    """Draw a labelled bar chart of a pre-sorted sales Series on the given Axes.

    Parameters
    ----------
    Sales_sorted : pandas.Series
        Values to plot; the index supplies the x categories, in plotting order.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    title : str
        Title of the panel.
    """
    # NOTE(review): `ci=None` is deprecated in seaborn >= 0.12 in favour of
    # `errorbar=None`; kept so the cell still runs on older seaborn.
    sns.barplot(x=Sales_sorted.index, y=Sales_sorted.values, ax=ax, ci=None)
    ax.set(ylabel='Total Sales', title=title)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    # Write each bar's value above it. Iterating the values directly avoids
    # Series.iteritems(), which pandas 2.0 removed.
    for position, value in enumerate(Sales_sorted.values):
        ax.text(position, value, "{:.2f}".format(value), color='m', va='bottom', rotation=45)


# One panel per region. The loop variables have their own names so the `Sales`
# list is not rebound while it is being iterated.
for sales_col, panel in zip(Sales, axes.flat):
    publisher_totals = top_100.groupby('Publisher')[sales_col].sum().sort_values(ascending=False)
    plot_sales(publisher_totals, panel, 'Top 100 Games by Region - ' + sales_col)

plt.tight_layout()
plt.show()

Unsurprisingly, there are different characteristics even across the major regions. Nintendo is a Japanese company, so it is no surprise that it is the favorite in Japan. The other regions, such as the US, Europe and others, are more diversified, as other companies do have a substantial share, though Nintendo still dominates the top 100.

It would therefore seem that Nintendo is extremely successful in being the favorite among consumers. We will explore this claim more when we look at the dataset in full later.

In [None]:
# Global sales of the top 100 titles, totalled per publisher (largest first)
fig, ax = plt.subplots(nrows=1, ncols=1)
Global_Sales = (
    top_100.groupby('Publisher')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)
plot_sales(Global_Sales, ax, 'Top 100 Games Sales - Global')

**4b. Genre based on Top 100**

By looking at the genre split of the top 100 ranked games, Shooter and Platform were the top performers in the top 100.

In [None]:
# Global sales of the top 100 titles, totalled per genre (largest first)
fig, ax = plt.subplots(nrows=1, ncols=1)
Global_Sales = (
    top_100.groupby('Genre')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)
plot_sales(Global_Sales, ax, 'Global Top 100 Games split by Genre')

For NA and EU, Shooter, Platform, Sports, Role-Playing and Action are the top genres which gamers in the region like.

However, Japan does not follow this trend. Shooter genre is one of the least popular, and Role-Playing is the most popular genre. This actually corresponds to our real world observation that many games created by Japan are Role-Playing games

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

region = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# One panel per region (row-major order). The loop variables have their own names so
# the `region` list is not rebound while it is being iterated.
for region_col, panel in zip(region, axes.flat):
    genre_totals = top_100.groupby('Genre')[region_col].sum().sort_values(ascending=False)
    plot_sales(genre_totals, panel, 'Top 100 Games by Genre - ' + region_col)

plt.tight_layout()
plt.show()

# 5. Comparing the full dataset
We now turn our attention back to the full dataset, instead of narrowing our analysis on the top 100 games. Considering that there are 576 publishers in the dataset, to allow us to look at the top players, I have set an arbitrary threshold of showing publishers with Global_Sales > 50m.

We note that Nintendo and EA have the most global sales, with 1.7b and 1.1b in Global Sales respectively.

From the same table, we also observe that Nintendo has the highest sales in NA, Europe and Japan as well compared to all the other competitors

In [None]:
# Regional and global sales summed per publisher over the whole dataset
all_sales = (
    df.groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
      .sum()
)
# Keep only publishers whose global total exceeds 50, ranked by global sales
all_sales.loc[all_sales['Global_Sales'] > 50].sort_values(by='Global_Sales', ascending=False)

We now look at the number of games each Publisher published.

Similar to above, an arbitrary threshold of 100 games were set so we can focus on the more established Publishers.

Unsurprisingly, EA produced the most games, which also partly explains its second position in the global sales ranking (after all, more games released do generate more sales to some extent). On the other hand, the top performer, Nintendo, produced approximately half as many games as EA, at 696 games, signifying that Nintendo generated more sales per game.

Also unsurprising, is that the publishers with high global sales, also had released many games as well (Activision, Ubisoft, Sony etc.) as these companies appear in the top for both lists.

In [None]:
# Number of rows per publisher; note the 'Global_Sales' column holds a COUNT here, not a sum
all_count = df.groupby('Publisher')[['Global_Sales']].agg('count')
# Keep only publishers with more than 100 titles, most prolific first
all_count.loc[all_count['Global_Sales'] > 100].sort_values(by='Global_Sales', ascending=False)

Unlike the chart for the top 100, Nintendo isn't that much ahead compared to its competitors. In fact, other companies seem to be doing comparably well too.

In [None]:
sns.set_palette('muted')
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

Sales = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# One panel per region showing the 12 publishers with the highest sales there.
# The loop variables have their own names so the `Sales` list is not rebound while
# it is being iterated.
for sales_col, panel in zip(Sales, axes.flat):
    top_publishers = (
        df.groupby('Publisher')[sales_col]
          .sum()
          .sort_values(ascending=False)
          .head(12)
    )
    plot_sales(top_publishers, panel, 'All Games by Region - ' + sales_col)

plt.tight_layout()
plt.show()

We now plot the total global sales by Genre, and we notice one key difference. Action was not the top genre in the top 100 games list. In totality, however, it seems that the market is more keen on the Action Genre, though not many made it into the top 100. Sports and Shooter still retain their position in the top spots.

Similarly, for Japan, Role Playing still dominates the market, and has at least twice the market share of the next most popular genre.

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

region = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# One panel per region with genre totals over the whole dataset. The loop variables
# have their own names so the `region` list is not rebound while it is being iterated.
for region_col, panel in zip(region, axes.flat):
    genre_totals = df.groupby('Genre')[region_col].sum().sort_values(ascending=False)
    plot_sales(genre_totals, panel, 'All Games by Genre - ' + region_col)

plt.tight_layout()
plt.show()

**5b. Comparing by platforms**

First, by plotting a scatterplot of the global sales by platform against time, we are able to visualize the rise and fall of the platforms, and their popularity when they were on the market.

In [None]:
# Total sales per (Year, Platform) pair, with Year and Platform as ordinary columns.
# The sales columns are selected explicitly: pandas >= 2.0 no longer drops non-numeric
# columns silently in groupby().sum(), and this also keeps Rank out of the result
# without a separate drop.
by_platforms = (
    df.groupby(['Year', 'Platform'])[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
      .sum()
      .reset_index()
)

In [None]:
# Sequential colour map: the colour of each point encodes Global_Sales
cmap = sns.cubehelix_palette(start=2.8, rot=.1, as_cmap=True)

# One point per (Year, Platform) row of by_platforms
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
points = ax.scatter(x='Year', y='Platform', c='Global_Sales', cmap=cmap, data=by_platforms)
fig.colorbar(points)
ax.set(
    xlabel='Year',
    ylabel='Platform',
    title='Global Sales (millions) by Platforms over the Years',
)
plt.show()

As expected, many of the platforms (except PC) have a limited lifespan, with most phasing out within a decade or so. In particular, we notice that the Wii (by Nintendo) was very successful in its initial 5 years, and then slowly faded out. Its successor, the WiiU, was not very popular.

The top platforms with highest sales can be observed to be Wii, PS3, X360, PS2, PS, DS (based on intensity of the color, with darker colors indicating higher sales). Sorting the Global Sales based on the platform confirms this.

In [None]:
# Ten platforms with the highest total global sales. Only the sales columns are
# aggregated, so Year is never summed (it previously had to be dropped afterwards)
# and any non-numeric column in by_platforms is ignored.
(
    by_platforms
    .groupby('Platform')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
    .sum()
    .sort_values('Global_Sales', ascending=False)
    .head(10)
)

**5c. Comparing by Publishers**

Let us now do some analysis based on the Publishers.

In [None]:
# Names of the five publishers with the highest total global sales
by_publisher = df.groupby(['Publisher'])['Global_Sales'].sum()
top_5_publisher = by_publisher.sort_values(ascending=False).head(5).index.tolist()

# Sales per (Publisher, Genre). The sales columns are selected explicitly because
# pandas >= 2.0 no longer drops non-numeric columns silently in groupby().sum();
# this also makes dropping Rank and Year afterwards unnecessary.
by_publisher_genre = (
    df.groupby(['Publisher', 'Genre'])[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
      .sum()
      .reset_index()
)
# Keep only the rows belonging to the top five publishers
by_publisher_genre = by_publisher_genre[by_publisher_genre['Publisher'].isin(top_5_publisher)]
by_publisher_genre.head(5)

In [None]:
def sortedgroupedbar(ax, x, y, groupby, data=None, width=0.8, **kwargs):
    """Draw grouped bars where, inside each x category, bars are sorted by descending y.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x : str
        Column whose unique values become the tick positions on the x axis.
    y : str
        Column with the bar heights.
    groupby : str
        Column that defines the bar series; each value gets one legend entry.
    data : pandas.DataFrame
        Source rows. Required despite the ``None`` default; it is copied, not modified.
    width : float
        Total width shared by all bars of one x category.
    **kwargs
        Forwarded to ``ax.bar``. If ``color`` is given it is used for every series;
        otherwise each series gets its own colour from the 12-colour 'Set3' palette.

    Raises
    ------
    TypeError
        If ``data`` is not supplied.
    """
    if data is None:
        raise TypeError("sortedgroupedbar() requires a DataFrame passed as `data`")
    frame = data.copy()

    # For every x category, rank its rows by descending y (0 = tallest).
    # sorted() is stable, so ties keep their row order.
    order = np.zeros(len(frame))
    for category in np.unique(frame[x].values):
        in_category = (frame[x] == category).values
        heights = frame.loc[in_category, y].values
        tallest_first = sorted(range(len(heights)), key=lambda pos: heights[pos], reverse=True)
        ranks = np.empty(len(heights))
        ranks[tallest_first] = np.arange(len(heights))
        order[in_category] = ranks
    frame["order"] = order

    # Integer tick position of each row's x category
    categories, frame["ind"] = np.unique(frame[x].values, return_inverse=True)
    step = width / len(np.unique(frame[groupby].values))

    # Colours are passed to ax.bar explicitly. Changing the global seaborn palette here
    # would not affect `ax` (its colour cycle is fixed when the Axes is created) and
    # would leak into every later plot.
    palette = None if "color" in kwargs else sns.color_palette("Set3", n_colors=12)

    for position, (name, grp) in enumerate(frame.groupby(groupby)):
        bar_kwargs = dict(kwargs)
        if palette is not None:
            bar_kwargs["color"] = palette[position % len(palette)]
        # Left edge of the category, shifted by the bar's rank within that category
        ax.bar(grp["ind"] - width / 2. + grp["order"] * step + step / 2.,
               grp[y], width=step, label=name, **bar_kwargs)

    ax.legend(title=groupby)
    ax.set_xticks(np.arange(len(categories)))
    ax.set_xticklabels(categories)
    ax.set_xlabel(x)

# Grouped bars: one cluster per top-5 publisher, one bar per genre within it
fig, ax = plt.subplots(figsize=(12, 8))
sortedgroupedbar(ax, x="Publisher", y="Global_Sales", groupby="Genre", data=by_publisher_genre)
ax.set_title('Distribution of Global Sales by Genre for the top 5 Publishers', fontsize=14)
ax.set_ylabel('Global Sales (m)')
plt.show()

This chart is useful as it allows us to see which companies are known for which genre of games. As we can see, the specialties of the companies vary quite significantly. While **Activision** and **Ubisoft** both specialize in **Action, Misc, and Shooter**, **EA** has a strong niche in **Sports**, **Nintendo** focuses on **Platform, Role-Playing and Sports**, and **Sony** is more popular in **Racing, Platform, and Action**.

We also observe the relative importance each company places on its genres. For example, Activision, EA and Ubisoft seem to develop their niche genres more heavily, and place less emphasis on the remaining genres. Nintendo and Sony, on the other hand, seem to be more diversified, having garnered quite substantial sales even in genres which are not their top ones.

In [None]:
# Five best-selling titles of every genre, listed genre by genre
(
    df.sort_values(by='Global_Sales', ascending=False)
      .groupby('Genre')
      .head(5)
      .sort_values(by='Genre')
)

Looking at the top 5 games per Genre, based on Global Sales, Nintendo dominates the list! Meanwhile, Action is dominated by Take-Two Interactive with their GTA franchise, while Activision dominates the Shooter genre with their Call of Duty franchise.

# 6. Based on data from 2010 to 2016
To make our visualization more up-to-date with current years, we will look at the trends in the latest years of data available from 2010, and perform the same analysis as above.

Clearly, Nintendo has lost its huge lead, and other notable companies like Activision, EA and Ubisoft have made substantial progress into the top-ranked games.

In [None]:
# Rows with a release year of 2010 or later
after2010_df = df.loc[df['Year'].ge(2010)]
after2010_df.head()

In [None]:
# First 100 rows of the 2010+ subset (assumes it is still in Rank order -- TODO confirm)
after2010_top_100 = after2010_df.iloc[:100]
# How many of those titles each publisher has, most first
after2010_top_100.groupby('Publisher')['Name'].agg('count').sort_values(ascending=False)

Again, analyzing the Publishers' market share by region, we note that **Activision** now has the largest market share in **North America, Europe and Other regions**. We also note that the market share profiles for these 3 regions are pretty similar, with the top Publishers being nearly identical (though the rankings in North America differ slightly).

Meanwhile, **Nintendo** continues to dominate sales in **Japan**, with sales at 47.32m which is nearly 10 times as the next competitor Capcom! We also note that all time favorites for other companies, like Activision, EA etc. are not popular in Japan.

In [None]:
sns.set_palette('muted')
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

Sales = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# One panel per region, built from the top 100 titles of the 2010+ subset.
# The loop variables have their own names so the `Sales` list is not rebound while
# it is being iterated.
for sales_col, panel in zip(Sales, axes.flat):
    publisher_totals = (
        after2010_top_100.groupby('Publisher')[sales_col]
        .sum()
        .sort_values(ascending=False)
    )
    plot_sales(publisher_totals, panel, 'Games Released After 2010 by Region - ' + sales_col)

plt.tight_layout()
plt.show()

There aren't many changes in terms of the genre distribution after 2010, compared to the full dataset.

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

region = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# Genre totals per region for games released from 2010 onwards.
# Aggregates after2010_df: the cell previously grouped the full `df`, so the chart
# merely repeated the all-years plot under an "after 2010" title.
# NOTE(review): re-check the surrounding commentary against the corrected chart.
for region_col, panel in zip(region, axes.flat):
    genre_totals = after2010_df.groupby('Genre')[region_col].sum().sort_values(ascending=False)
    plot_sales(genre_totals, panel, 'Games Released after 2010 by Genre - ' + region_col)

plt.tight_layout()
plt.show()

In [None]:
# Names of the five publishers with the highest global sales in the 2010+ subset
after2010_by_publisher = after2010_df.groupby(['Publisher'])['Global_Sales'].sum()
after2010_top_5_publisher = (
    after2010_by_publisher.sort_values(ascending=False).head(5).index.tolist()
)

# Sales per (Publisher, Genre). The sales columns are selected explicitly because
# pandas >= 2.0 no longer drops non-numeric columns silently in groupby().sum();
# this also makes dropping Rank and Year afterwards unnecessary.
after2010_by_publisher_genre = (
    after2010_df
    .groupby(['Publisher', 'Genre'])[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
    .sum()
    .reset_index()
)
# Keep only the rows belonging to the top five publishers
after2010_by_publisher_genre = after2010_by_publisher_genre[
    after2010_by_publisher_genre['Publisher'].isin(after2010_top_5_publisher)
]
after2010_by_publisher_genre.head(5)

There are not many changes in terms of the genres that the Publishers specialize in. However, Activision seems to have focused more on the Shooter genre compared to our previous analysis.

Finally, one clear distinction is that the bars for Activision and EA are much greater than Nintendo, unlike when we compared the whole dataset. This is consistent with what we have observed previously.

In [None]:
# Grouped bars for the 2010+ subset: one cluster per top-5 publisher, one bar per genre
fig, ax = plt.subplots(figsize=(12, 8))
sortedgroupedbar(ax, x="Publisher", y="Global_Sales", groupby="Genre", data=after2010_by_publisher_genre)
ax.set_title('Distribution of Global Sales After 2010 by Genre for the top 5 Publishers', fontsize=14)
ax.set_ylabel('Global Sales (m)')
plt.show()

Finally, the list of the top Publisher and Games for each Genre by and large remains the same players. However, we note that the Sports domain is now dominated by EA, with their FIFA franchise generating the largest sales for the genre.

In [None]:
# Five best-selling 2010+ titles of every genre, listed genre by genre
(
    after2010_df.sort_values(by='Global_Sales', ascending=False)
                .groupby('Genre')
                .head(5)
                .sort_values(by='Genre')
)

# 7. Conclusion
This is my simple EDA on this dataset. Through this dataset, we are able to analyse the distribution of sales by Publisher and by Genre, and determine which were the big players from 1980 through to 2016, and from 2010 to 2016. It also gives us a glimpse of which Publishers were more popular in which Genre, which can provide some insights if we were to perform a competitor analysis for the gaming industry. We can compare the top players for a particular Genre to determine a Publisher's direct competition. As an example, Nintendo may not be a direct competitor of Activision, since the types of games they create likely attract different market segments. We would thus be more interested in companies that release Shooter games to make our analysis more relevant.

All in all, I believe there may be some other improvements which can be done to make this analysis more meaningful, and I would greatly appreciate any feedback, whether for the coding or for the analysis! :)

Finally, if you like my analysis, please leave an upvote :) Greatly appreciated for a beginner like me