In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Apply seaborn's default theme first, then override the figure size and resolution
# for every plot that does not create its own figure.
sns.set()

plt.rcParams.update({
    'figure.figsize': (15, 4),
    'figure.dpi': 200,
})

In [None]:
# Load the video game sales CSV; the file's 'Rank' column becomes the DataFrame index.
df = pd.read_csv('/kaggle/input/videogamesales/vgsales.csv', index_col='Rank')

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [None]:
# (rows, columns): the dataset contains 16,598 games in total with 10 different attributes for each game
df.shape

Default Datatypes

In [None]:
# Column dtypes as inferred by read_csv, before any conversion.
df.dtypes

In [None]:
# Converting the 'Year' column from float to a pandas nullable dtype.
# convert_dtypes() picks a dtype that supports pd.NA, so rows with a missing year stay missing
# (expected to be Int64 here, assuming every recorded year is a whole number).
df['Year'] = df['Year'].convert_dtypes()

In [None]:
# Spot-check two rows after the 'Year' conversion.
df.head(2)

Updated Datatypes

In [None]:
# 'Year' should now show a nullable integer dtype (Int64) instead of float64.
df.dtypes

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of distinct values in each column (missing values are not counted).
df.nunique()

In [None]:
# Year with the highest number of releases

In [None]:
# Summarise the span of release years covered by the dataset.
year_col = df['Year']
year_summary = {
    "# of Years in the dataset": year_col.nunique(),
    "Max year in the dataset": year_col.max(),
    "Min year in the dataset": year_col.min(),
}
for label, value in year_summary.items():
    print(f"{label}: {value}")

## Which year had the most videogame releases?

In [None]:
# Games without a release year cannot be placed on the year axis, so leave them out.
year_not_null = df.dropna(subset=['Year'])

# Releases per year, ordered by year; this is also the order in which the bars are drawn.
year_values = year_not_null.groupby('Year').size().to_numpy()
year_count = len(year_values)

release_ax = sns.countplot(data=year_not_null, x='Year')

# Write each bar's count just above it; bar i sits at x position i.
for position, releases in enumerate(year_values):
    release_ax.text(position, releases, str(releases), ha='center', va='bottom', fontsize='small')

plt.xticks(rotation=70)
release_ax.set_title('Videogame releases by year', fontsize=16)
release_ax.set_xlabel('Year', fontsize=14)
release_ax.set_ylabel('');

<center>The year 2009 had the highest number of videogame releases - 1431, closely followed by 2008 which had 1428.</center>

## How were the videogame releases distributed per genre?

In [None]:
# One box per genre, summarising the spread of release years for the games in that genre.
sns.boxplot(data=df, x='Year', y='Genre')
# Put a tick on every second year so the labels stay readable.
plt.xticks(np.arange(1980,2021,2))
plt.title('Distribution by Year of release per Genre', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Genre', fontsize=14)
# tight_layout() is a one-off adjustment based on the artists that exist when it is called,
# so it runs last, after the title and axis labels have been added.
plt.tight_layout();

<center> The above plot indicates when most of the games in the dataset were released for each genre. We can see that the majority of <i>Action</i> games were released between 2005 and 2012, while most of the <i>Strategy</i> games were released between 2002 and 2009. </center>

## What was the favourite genre in different parts of the world?

In [None]:
# Total sales per genre for each region; every frame has a 'Genre' column plus one sales column.
genreGroup = df.groupby('Genre')

na_sales_by_genre = genreGroup['NA_Sales'].sum().reset_index()
eu_sales_by_genre = genreGroup['EU_Sales'].sum().reset_index()
jp_sales_by_genre = genreGroup['JP_Sales'].sum().reset_index()
global_sales_by_genre = genreGroup['Global_Sales'].sum().reset_index()

fontsize = 14
labels = [['North America', 'Europe'], ['Japan', 'Global']]
# Same 2x2 arrangement as `labels`: the frame and sales column drawn in each panel.
panels = [
    [(na_sales_by_genre, 'NA_Sales'), (eu_sales_by_genre, 'EU_Sales')],
    [(jp_sales_by_genre, 'JP_Sales'), (global_sales_by_genre, 'Global_Sales')],
]

fig, axes = plt.subplots(2,2, figsize=(15,8), dpi=200, sharey=True)
fig.suptitle('Sales per Genre', fontsize=20)

for row in range(2):
    for col in range(2):
        sales, sales_column = panels[row][col]
        panel_ax = axes[row][col]
        sns.barplot(ax=panel_ax, data=sales, y='Genre', x=sales_column)
        panel_ax.set_title(labels[row][col], fontsize=fontsize)
        # The panel title already names the region, so drop the column-name x label.
        panel_ax.set_xlabel('')

# Run the layout pass last so it can make room for the panel titles set in the loop.
plt.tight_layout()



From the above plots it's clearly visible that <i>'Action'</i> is the largest-selling category in North America, Europe and globally, followed by <i>'Sports'</i>. However, when it comes to Japan, <i>'Role-Playing'</i> games top the charts with more than double the sales of <i>'Action'</i>, which comes in second place.

## Which console had the most games and how well did they perform in global sales?

In [None]:
# Fix one platform order (order of first appearance in df) and use it for both layers,
# so each bar and its line point are guaranteed to share an x position and a tick label
# instead of relying on each plotting call's own default category ordering.
platform_order = df['Platform'].dropna().unique().tolist()
# Total global sales per platform, arranged in the same order as the bars.
sales_by_platform = df.groupby('Platform')['Global_Sales'].sum().reindex(platform_order)

fig, ax = plt.subplots(figsize=(15,5), dpi=200)

# Bars: number of games per platform.
sns.countplot(ax=ax, data=df, x='Platform', order=platform_order, color='#90A955')
# Line: summed global sales per platform, drawn on the same axis and in the same order.
ax.plot(platform_order, sales_by_platform.to_numpy(), color='#31572C', marker='o')

plt.ylabel("Units Sold/Global Sales")
plt.ylim(0,2500)
plt.grid(True)
plt.title('Games per platform vs global sales', fontsize=20)
plt.tight_layout();

<center>The above plot draws a comparison between the number of games per platform and the aggregate global sales per platform. We can clearly see that PS2 and Nintendo DS had the highest number of games in the dataset. When it comes to global sales, PS2 leads the charts, followed by Xbox 360.</center>

## Which publisher was the most popular based on number of releases?

In [None]:
# Count games per publisher (most frequent first) and keep the ten largest.
publisher_counts = df['Publisher'].value_counts()
top10_publishers = publisher_counts.iloc[:10]
top10_publishers

In [None]:
# Horizontal bars: publisher names on the y axis, number of games on the x axis.
publisher_ax = sns.barplot(
    x=top10_publishers.values,
    y=top10_publishers.index,
    palette='viridis',
)
publisher_ax.set_title('The 10 Most popular publishers', fontsize=20)
publisher_ax.set_xlabel('Number of games');

<center> The publisher with the most games in the given dataset is EA, followed by Activision and then Namco Bandai Games. </center>