# <center>Video Game Sales Exploratory Data Analysis</center>

* All sales figures are in millions (M).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import numpy as np
import plotly.express as px

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

In [None]:
# Load the video game sales CSV and preview the first rows.
csv_path = "/kaggle/input/videogamesales/vgsales.csv"
data = pd.read_csv(csv_path)
data.head()

# Checking for Missing Values

In [None]:
# Count the missing entries in each column.
missing_per_column = data.isna().sum()
missing_per_column

**Year and Publisher features have missing data points. I will drop the particular rows that have missing values.**

In [None]:
# Discard rows lacking a Publisher or a Year, then renumber the index from zero.
rows_with_required_fields = data.dropna(subset=['Publisher', 'Year'])
data = rows_with_required_fields.reset_index(drop=True)
# Re-check the per-column missing counts after the drop.
data.isna().sum()

**Looking good now!**

In [None]:
# Print the column dtypes and non-null counts of the cleaned frame.
data.info()

**The Year feature is stored as a float, which is the wrong data type for a year. Therefore, I converted it to an integer data type for further analysis.**

In [None]:
# Cast the float Year column to int, leaving every other column untouched.
data = data.astype({'Year': int})
# Confirm the new dtype.
data['Year'].dtype

# Data Visualization

# Number of Games Published Annually

In [None]:
# Number of games released each year: a Series named "Year" whose index is
# also named "Year".
AnnualNumberOfGames = data['Year'].groupby(data['Year']).count()

# Give the counts their own column name and move the year into a regular
# column. The axis titles then come straight from the column names instead of
# depending on the placeholder names ("index", "y") that Plotly Express appears
# to assign when it is handed a bare Index/Series.
GamesPerYear = AnnualNumberOfGames.rename("Number of Games Published").reset_index()

fig = px.line(GamesPerYear, x="Year", y="Number of Games Published")
fig.update_layout(title_text='Number of Games Published Annually',
                  title_x=0.5, title_font=dict(size=24))

fig.show()

**According to the graph above, the number of games published annually increases significantly after 2000 and peaks in 2009.**

# Global Video Game Sales Annually

In [None]:
# Total global sales per release year, as a two-column frame (Year, Global_Sales).
AnnualSales = data.groupby('Year')['Global_Sales'].sum().reset_index()

# Refer to the columns by name so the `labels` mapping is keyed on real column
# names. The x axis is titled from the "Year" column itself; the former
# "index" label key matched no column and has been removed.
fig = px.line(AnnualSales, x='Year', y='Global_Sales',
              labels={"Global_Sales": "Global Sales (M)"})
fig.update_layout(title_text='Global Video Game Sales Annually',
                  title_x=0.5, title_font=dict(size=24))
fig.show()

**According to the graph above, the Global Video Game Sales Annually curve closely resembles the Number of Games Published Annually curve. The highest number of games was published in 2009, whereas the highest global sales were recorded in 2008.**

# Video Game Sales for each Market Annually


In [None]:
# Yearly sales totals for each regional market.
market_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
AnnualSalesMarket = data.groupby('Year')[market_columns].sum().reset_index()

# Legend label for each market column, in drawing order.
market_labels = {
    'NA_Sales': "North America Sales",
    'EU_Sales': "Europe Sales",
    'JP_Sales': "Japan Sales",
    'Other_Sales': "Other Sales",
}

fig = go.Figure()
for sales_column, legend_label in market_labels.items():
    yearly_sales = AnnualSalesMarket[sales_column]
    fig.add_trace(go.Scatter(x=AnnualSalesMarket['Year'], y=yearly_sales,
                             name=legend_label,
                             hovertext=yearly_sales))

# Centered main title plus axis titles, applied in a single layout update.
fig.update_layout(title_text='Video Game Sales for each Market Annually',
                  title_x=0.5, title_font=dict(size=22),
                  xaxis_title="Years",
                  yaxis_title="Sales (M)")

fig.show()

# Top 15 Publishers that have the highest Global Sales


In [None]:
# Total global sales per publisher, largest first.
PublisherTotalGames = (
    data.groupby('Publisher')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
# Keep the 15 publishers with the highest totals for the chart.
PublisherTotalGames_top = PublisherTotalGames.nlargest(15, 'Global_Sales')

fig = px.bar(
    data_frame=PublisherTotalGames_top,
    x=PublisherTotalGames_top.index,
    y='Global_Sales',
    color=PublisherTotalGames_top.index,
)
fig.update_layout(
    title_text='Top 15 Publishers that have the highest Global Sales',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

**Nintendo has the highest global sales. Electronic Arts and Activision follow.**

# Top 15 most Sold Games - Global


In [None]:
# Total global sales per game title, largest first.
GameGlobalSales = (
    data.groupby('Name')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
# Keep the 15 best-selling titles for the chart.
GameGlobalSales_top = GameGlobalSales.nlargest(15, 'Global_Sales')

fig = px.bar(
    data_frame=GameGlobalSales_top,
    x=GameGlobalSales_top.index,
    y='Global_Sales',
    color=GameGlobalSales_top.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Games - Global All Time',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()


**In the Global Market, Wii Sports and GTA5 have the highest sales. The games that follow have almost the same sales as one another.**

# Top 15 most Sold Games - North America


In [None]:
# Total North America sales per game title, largest first.
# NOTE: the variable keeps the name GameGlobalSales although it holds NA_Sales here.
GameGlobalSales = (
    data.groupby('Name')['NA_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
# Keep the 15 best-selling titles for the chart.
GameGlobalSales_top = GameGlobalSales.nlargest(15, 'NA_Sales')

fig = px.bar(
    data_frame=GameGlobalSales_top,
    x=GameGlobalSales_top.index,
    y='NA_Sales',
    color=GameGlobalSales_top.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Games - North America',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()


**In the North America Market, Wii Sports is still the leader. Super Mario Bros. and Duck Hunt follow it.**

# Top 15 most Sold Games - Europe


In [None]:
# Total Europe sales per game title, largest first.
# NOTE: the variable keeps the name GameGlobalSales although it holds EU_Sales here.
GameGlobalSales = (
    data.groupby('Name')['EU_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
# Keep the 15 best-selling titles for the chart.
GameGlobalSales_top = GameGlobalSales.nlargest(15, 'EU_Sales')

fig = px.bar(
    data_frame=GameGlobalSales_top,
    x=GameGlobalSales_top.index,
    y='EU_Sales',
    color=GameGlobalSales_top.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Games - Europe',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

**In the Europe Market, Wii Sports still has the highest sales. In contrast to the other markets, FIFA games showed up in the list.**

# Top 15 most Sold Games - Japan


In [None]:
# Total Japan sales per game title, largest first.
# NOTE: the variable keeps the name GameGlobalSales although it holds JP_Sales here.
GameGlobalSales = (
    data.groupby('Name')['JP_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
# Keep the 15 best-selling titles for the chart.
GameGlobalSales_top = GameGlobalSales.nlargest(15, 'JP_Sales')

fig = px.bar(
    data_frame=GameGlobalSales_top,
    x=GameGlobalSales_top.index,
    y='JP_Sales',
    color=GameGlobalSales_top.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Games - Japan',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

# Top 15 most Sold Games - Other


In [None]:
# Total sales in the "Other" market per game title, largest first.
# NOTE: the variable keeps the name GameGlobalSales although it holds Other_Sales here.
GameGlobalSales = (
    data.groupby('Name')['Other_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
# Keep the 15 best-selling titles for the chart.
GameGlobalSales_top = GameGlobalSales.nlargest(15, 'Other_Sales')

fig = px.bar(
    data_frame=GameGlobalSales_top,
    x=GameGlobalSales_top.index,
    y='Other_Sales',
    color=GameGlobalSales_top.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Games - Other',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()


**In the Other Market, GTA: SA is the top seller (it was my favorite game when I was a kid :))**

# Top 15 most Sold Platforms - Global


In [None]:
# Global sales summed per platform, reduced to the 15 largest totals.
PlatformGlobalSales = (
    data.groupby('Platform')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .nlargest(15, 'Global_Sales')
)

fig = px.bar(
    data_frame=PlatformGlobalSales,
    x=PlatformGlobalSales.index,
    y='Global_Sales',
    color=PlatformGlobalSales.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Platforms - Global',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

**PS2 is the top seller for the Global Market. Respect for that!**

# Top 15 most Sold Platforms - North America


In [None]:
# North America sales summed per platform, reduced to the 15 largest totals.
# NOTE: the variable keeps the name PlatformGlobalSales although it holds NA_Sales here.
PlatformGlobalSales = (
    data.groupby('Platform')['NA_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .nlargest(15, 'NA_Sales')
)

fig = px.bar(
    data_frame=PlatformGlobalSales,
    x=PlatformGlobalSales.index,
    y='NA_Sales',
    color=PlatformGlobalSales.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Platforms - North America',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

**According to the North America Market sales, the X360 and PS2 have pretty close sales.**

# Top 15 most Sold Platforms - Europe


In [None]:
# Europe sales summed per platform, reduced to the 15 largest totals.
# NOTE: the variable keeps the name PlatformGlobalSales although it holds EU_Sales here.
PlatformGlobalSales = (
    data.groupby('Platform')['EU_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .nlargest(15, 'EU_Sales')
)

fig = px.bar(
    data_frame=PlatformGlobalSales,
    x=PlatformGlobalSales.index,
    y='EU_Sales',
    color=PlatformGlobalSales.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Platforms - Europe',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

**According to the Europe Market platform sales, PS products have pretty good sales. PC sales rank higher here than in the North America and Global markets.**

# Top 15 most Sold Platforms - Japan


In [None]:
# Japan sales summed per platform, reduced to the 15 largest totals.
# NOTE: the variable keeps the name PlatformGlobalSales although it holds JP_Sales here.
PlatformGlobalSales = (
    data.groupby('Platform')['JP_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .nlargest(15, 'JP_Sales')
)

fig = px.bar(
    data_frame=PlatformGlobalSales,
    x=PlatformGlobalSales.index,
    y='JP_Sales',
    color=PlatformGlobalSales.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Platforms - Japan',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

**According to the graph above, DS sales dominate the Japan market. PS and PS2 follow.**

# Top 15 most Sold Platforms - Other


In [None]:
# "Other" market sales summed per platform, reduced to the 15 largest totals.
# NOTE: the variable keeps the name PlatformGlobalSales although it holds Other_Sales here.
PlatformGlobalSales = (
    data.groupby('Platform')['Other_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .nlargest(15, 'Other_Sales')
)

fig = px.bar(
    data_frame=PlatformGlobalSales,
    x=PlatformGlobalSales.index,
    y='Other_Sales',
    color=PlatformGlobalSales.index,
)
fig.update_layout(
    title_text='Top 15 most Sold Platforms - Other',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

# Distribution of the Video Game Sales by Genre


In [None]:
# Global sales summed per genre, largest first.
GenreTotalGames = (
    data.groupby('Genre')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)

# One pie slice per genre, sized by its share of global sales.
genre_pie = go.Pie(
    labels=GenreTotalGames.index,
    values=GenreTotalGames['Global_Sales'],
    opacity=0.9,
)
fig = go.Figure(data=[genre_pie])
# Show both the percentage and the genre name on each slice, with a black outline.
fig.update_traces(
    textinfo='percent+label',
    marker=dict(line=dict(color='#000000', width=2)),
)
fig.update_layout(
    title_text='Distribution of the Video Game Sales by Genre',
    title_x=0.5,
    title_font=dict(size=22),
)
fig.show()

# Platforms and their Game Genre Distribution

In [None]:
# Two-level sunburst: platforms on the inner ring, their genres on the outer ring.
# No `values` column is given, so the call relies on Plotly Express's default sizing.
sunburst_options = dict(
    path=["Platform", "Genre"],
    color="Platform",
    maxdepth=-1,
    branchvalues='total',
    hover_name='Platform',
    hover_data={'Platform': False},
    title='Platforms and their Game Genre Distribution',
    template='ggplot2',
)
fig = px.sunburst(data_frame=data, **sunburst_options)

# Label each sector with its name and its share of the parent sector.
fig.update_traces(textinfo='label+percent parent')
fig.update_layout(font=dict(size=18))
fig.show()

# Number of Games Published by Publishers


In [None]:
# Number of rows (games) per publisher, as a two-column frame (Publisher, count).
publisher_grouper = pd.Grouper(key='Publisher')
games_per_publisher = data.groupby(publisher_grouper).size()
PublisherCount = games_per_publisher.reset_index(name='count')

# One treemap tile per publisher, sized by its game count.
fig = px.treemap(PublisherCount, path=['Publisher'], values='count')
fig.update_layout(
    title_text='Number of Games Published by Publishers',
    title_x=0.5,
    title_font=dict(size=22),
)
# Show the publisher name and the count on every tile.
fig.update_traces(textinfo="label+value")
fig.show()


# Distribution of Top Seller 50 Games by Publishers


In [None]:
# The 50 rows with the highest global sales, restricted to the columns of interest.
top_50_rows = data.nlargest(50, 'Global_Sales')[['Global_Sales', 'Name', 'Publisher']]
# Count how many of those rows belong to each publisher.
Top50byPublisher = (
    top_50_rows
    .groupby(pd.Grouper(key='Publisher'))
    .size()
    .reset_index(name='Number of Games')
)
Top50byPublisher

In [None]:
# Bar chart of how many of the top-50 rows each publisher contributes.
# The columns are referenced by name rather than passed as Series objects, so
# the axis and legend titles are taken from the column names directly.
fig = px.bar(
    data_frame=Top50byPublisher,
    x='Publisher',
    y='Number of Games',
    color='Publisher',
)
fig.update_layout(
    title_text='Distribution of Top Seller 50 Games by Publishers',
    title_x=0.5,
    title_font=dict(size=20),
    # Order the bars from the largest count to the smallest.
    xaxis={'categoryorder': 'total descending'},
)
# Black outline around every bar.
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.show()

**If we create a list of the top 50 best-selling games to see which publisher did well, we obtain this distribution. According to the graph above, Nintendo has 32 games in the top 50. Activision follows with 8 games, and so on.**

# Playstation Console Global Sales (PS + PSP)


In [None]:
# Yearly global sales for each PlayStation platform (two columns: Year, Global_Sales).
PS = data[data['Platform'] == 'PS'].groupby('Year')['Global_Sales'].sum().reset_index()
PS2 = data[data['Platform'] == 'PS2'].groupby('Year')['Global_Sales'].sum().reset_index()
PS3 = data[data['Platform'] == 'PS3'].groupby('Year')['Global_Sales'].sum().reset_index()
PS4 = data[data['Platform'] == 'PS4'].groupby('Year')['Global_Sales'].sum().reset_index()
PSP = data[data['Platform'] == 'PSP'].groupby('Year')['Global_Sales'].sum().reset_index()
PSV = data[data['Platform'] == 'PSV'].groupby('Year')['Global_Sales'].sum().reset_index()

# Legend label and yearly-sales frame for each line, in drawing order.
# Each platform appears exactly once; previously the PS2 trace was added twice,
# which put a duplicate "PS2 Sales" entry in the legend.
playstation_traces = [
    ("PS Sales", PS),
    ("PS2 Sales", PS2),
    ("PS3 Sales", PS3),
    ("PS4 Sales", PS4),
    ("PSP Sales", PSP),
    ("PSV Sales", PSV),
]

fig = go.Figure()
for trace_label, yearly_sales in playstation_traces:
    fig.add_trace(go.Scatter(x=yearly_sales['Year'], y=yearly_sales['Global_Sales'],
                             name=trace_label,
                             hovertext=yearly_sales['Global_Sales']))

# Centered main title plus axis titles.
fig.update_layout(title_text='Playstation Console Global Sales (PS + PSP)',
                  title_x=0.5, title_font=dict(size=22),
                  xaxis_title="Year",
                  yaxis_title="Global Sales (M)")

fig.show()


# Playstation vs PC Global Sales Comparison


In [None]:
# Yearly global sales for each PlayStation home console and for PC
# (two columns per frame: Year, Global_Sales).
PS = data[data['Platform'] == 'PS'].groupby('Year')['Global_Sales'].sum().reset_index()
PS2 = data[data['Platform'] == 'PS2'].groupby('Year')['Global_Sales'].sum().reset_index()
PS3 = data[data['Platform'] == 'PS3'].groupby('Year')['Global_Sales'].sum().reset_index()
PS4 = data[data['Platform'] == 'PS4'].groupby('Year')['Global_Sales'].sum().reset_index()
PC = data[data['Platform'] == 'PC'].groupby('Year')['Global_Sales'].sum().reset_index()

# Legend label and yearly-sales frame for each line, in drawing order.
# Each platform appears exactly once; previously the PS2 trace was added twice,
# which put a duplicate "PS2 Sales" entry in the legend.
comparison_traces = [
    ("PS Sales", PS),
    ("PS2 Sales", PS2),
    ("PS3 Sales", PS3),
    ("PS4 Sales", PS4),
    ("PC Sales", PC),
]

fig = go.Figure()
for trace_label, yearly_sales in comparison_traces:
    fig.add_trace(go.Scatter(x=yearly_sales['Year'], y=yearly_sales['Global_Sales'],
                             name=trace_label,
                             hovertext=yearly_sales['Global_Sales']))

# Centered main title plus axis titles.
fig.update_layout(title_text='Playstation vs PC Global Sales Comparison',
                  title_x=0.5, title_font=dict(size=22),
                  xaxis_title="Year",
                  yaxis_title="Global Sales (M)")

fig.show()