# Video Game Sales Analysis
### The data stretches from the 1980s to 2016 (with a few exceptions)

In [None]:
import os
import numpy as np 
import pandas as pd 
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
# Raise pandas' display limits so wide/long frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, 1000)

# Print the path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
# Regional sales columns used throughout the notebook.
sales = ["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]

# Load the data set and discard every row that has a missing value in any column.
vidgme_df = pd.read_csv("/kaggle/input/videogamesales/vgsales.csv").dropna()

In [None]:
vidgme_df

# Yearly sales analysis

### Correlation between year & number of titles

In [None]:
# Correlation between release year and the number of titles released that year.
# The value is selected by label: positional `[1]` on a label-indexed Series is
# deprecated in recent pandas, and labels make the intent explicit.
titles_per_year = vidgme_df.groupby("Year").size().to_frame("Count").reset_index()
titles_per_year.corr().loc["Year", "Count"]

In [None]:
# Per-year summary statistics of global sales, without the "min" statistic,
# rendered with a colour gradient.
desc = (
    vidgme_df.groupby(["Year"])[["Global_Sales"]]
    .describe()
    .drop(columns="min", level=1)
)
desc.style.background_gradient(cmap ='Spectral')

In [None]:
from itertools import islice

# Correlation between NA / JP / EU sales for a sample of years: skip the first
# 10 year-groups, then take every 5th one.
region_cols = ["NA_Sales", "JP_Sales", "EU_Sales"]
corr_list = []
# Group by the scalar "Year" (not ["Year"]) so each key is a plain year value;
# a list key makes recent pandas yield 1-tuples, which would end up in the index.
for yr, group in islice(vidgme_df.groupby("Year"), 10, None, 5):
    yearly_corr = group[region_cols].corr()
    # Wrap the matrix in an outer index level holding the year.
    corr_list.append(pd.concat([yearly_corr], keys=[yr]))

# `levels` is only meaningful together with `keys`, so it is not passed here.
corr_df = pd.concat(corr_list)
corr_df.style.background_gradient(cmap ='coolwarm')

# Company sales analysis

In [None]:
# Regional sales of the 50 publishers with the highest total global sales.
# Only the numeric sales columns are selected before summing, so text columns
# (Name, Platform, Genre, ...) are never aggregated.
publisher_totals = vidgme_df.groupby("Publisher")[sales + ["Global_Sales"]].sum()
top_50_pubs = publisher_totals.nlargest(50, columns="Global_Sales")[sales]
top_50_pubs

In [None]:
def create_sales_pie(publisher, num_games=50):
    """Return a donut chart of a publisher's highest-selling games.

    Args:
        publisher: Value compared for equality against the "Publisher" column
            of ``vidgme_df``.
        num_games: Maximum number of rows (ranked by "Global_Sales") to plot.

    Returns:
        The plotly figure, with one slice per game name sized by global sales.
    """
    publisher_games = vidgme_df[vidgme_df["Publisher"] == publisher]
    # Pick the best sellers explicitly instead of relying on the row order of
    # the input file.
    top_games = publisher_games.nlargest(num_games, "Global_Sales")
    fig = px.pie(
        top_games,
        names="Name",
        values="Global_Sales",
        hole=.6,
        title=f"{publisher}'s top {num_games} highest selling games",
    )
    # Show only the label, placed outside the ring.
    fig.update_traces(textposition='outside', textinfo='label')
    return fig

def create_genre_pie(publisher):
    """Return a donut chart of one publisher's global sales split by genre.

    Rows of ``vidgme_df`` whose "Publisher" equals ``publisher`` are passed to
    ``px.pie`` with "Genre" as the slice name and colour and "Global_Sales" as
    the slice value.
    """
    publisher_games = vidgme_df[vidgme_df["Publisher"] == publisher]
    genre_fig = px.pie(
        publisher_games,
        names="Genre",
        values="Global_Sales",
        color="Genre",
        hole=.6,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        title=f"{publisher}'s favorite genres",
    )
    # Show only the label, placed outside the ring.
    genre_fig.update_traces(textposition='outside', textinfo='label')
    return genre_fig

In [None]:
create_genre_pie("Nintendo")

In [None]:
create_genre_pie("Activision")

In [None]:
create_genre_pie("Atlus")

In [None]:
create_genre_pie("Take-Two Interactive")

In [None]:
create_sales_pie("Nintendo", num_games=50)

In [None]:
create_sales_pie("Activision", num_games=50)

In [None]:
create_sales_pie("Atlus", num_games=50)

In [None]:
create_sales_pie("Take-Two Interactive")

In [None]:
def create_sales_figure(publisher):
    """Return a two-row figure of one publisher's sales per year.

    Row 1 is a line of NA_Sales / (NA_Sales + JP_Sales) for each year; row 2
    is a stacked bar chart with one bar series per column listed in ``sales``.
    """
    publisher_rows = vidgme_df[vidgme_df["Publisher"] == publisher]
    yearly = publisher_rows.groupby(["Year"])[sales].sum()

    # Share of NA sales within the combined NA + JP sales of each year.
    na_share = yearly["NA_Sales"] / (yearly["NA_Sales"] + yearly["JP_Sales"])

    fig = make_subplots(rows=2, row_heights=[0.3, 0.7])
    fig.update_layout(
        height=1000,
        yaxis_range=[0, 1],
        barmode='stack',
        title=f"{publisher} Sales report by year",
    )

    share_trace = go.Scatter(
        x=na_share.index,
        y=na_share.values,
        text=na_share.index,
        name="NA sales to JP proportion",
    )
    fig.add_trace(share_trace, row=1, col=1)

    for region in sales:
        region_bar = go.Bar(x=yearly.index, y=yearly[region], text=yearly.index, name=region)
        fig.add_trace(region_bar, row=2, col=1)

    return fig

In [None]:
create_sales_figure("Take-Two Interactive")

In [None]:
create_sales_figure("Nintendo")

In [None]:
create_sales_figure("Sony Computer Entertainment")

In [None]:
create_sales_figure("Ubisoft")

In [None]:
create_sales_figure("Namco Bandai Games")

# **Platform analysis**

In [None]:
vidgme_df["Platform"].unique()


In [None]:
def get_genre_breakdown(platform):
    """Return a styled table of mean regional sales per (Year, Genre) for a platform.

    Args:
        platform: Value compared for equality against the "Platform" column
            of ``vidgme_df``.

    Returns:
        A pandas Styler over the pivot table, shaded with the "PuBu" colormap.
    """
    platform_games = vidgme_df[vidgme_df["Platform"] == platform]
    # Restrict the aggregation to the sales columns: without `values`,
    # pivot_table tries to average every remaining column, text ones included.
    breakdown = platform_games.pivot_table(index=["Year", "Genre"], values=sales)[sales]
    return breakdown.style.background_gradient(cmap="PuBu")

In [None]:
get_genre_breakdown("DS")

## We can see that from 2004 until 2012, the most popular genre among DS games changed frequently. In 2004, at the peak of DS sales, the Platformer genre was in the lead; then in 2005 Racing and Simulation took the top spot. <br> From 2005 until 2012 every genre was fairly profitable, but none stood out except RPGs, which gradually gained steam (which makes sense, since RPGs take a long time to develop)

In [None]:
# DS titles released in 2004, shown without the Rank column.
vidgme_df.loc[(vidgme_df["Platform"] == "DS") & (vidgme_df["Year"] == 2004)].drop(columns=["Rank"])

## From this we can clearly see that the catalyst of 2004's Platformer craze was Super Mario 64

In [None]:
# DS titles released in 2005, shown without the Rank column.
vidgme_df.loc[(vidgme_df["Platform"] == "DS") & (vidgme_df["Year"] == 2005)].drop(columns=["Rank"])

In [None]:
# DS titles released in 2011, shown without the Rank column.
vidgme_df.loc[(vidgme_df["Platform"] == "DS") & (vidgme_df["Year"] == 2011)].drop(columns=["Rank"])

## Comparing 2011's sales report to 2004/2005, we can clearly see Nintendo's strategy: strong first-party games get the system rolling, and third parties take charge in the later years (compare the Publisher column for 2011 and 2005)

In [None]:
get_genre_breakdown("Wii")

In [None]:
get_genre_breakdown("3DS")

# Dangerous area

In [None]:
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, OrdinalEncoder
from sklearn.cluster import KMeans

In [None]:
# Working copy of the data without the free-text "Name" column.
encoded_vidgme_df = vidgme_df.drop(columns=["Name"])
# names = vidgme_df["Name"].values.reshape(len(vidgme_df), 1)
# categories = ["Platform", "Year","Publisher", "Genre"]

# ode = OrdinalEncoder()
# encoded_vidgme_df[categories] = ode.fit_transform(vidgme_df[categories])

In [None]:
# ests = []
# for sale in sales + ["Global_Sales", "Rank"]:
#     reshaped = vidgme_df[sale].values.reshape(len(vidgme_df), 1)
    
#     est = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
#     encoded_vidgme_df[sale] = est.fit_transform(reshaped)
    
#     ests.append(est)

In [None]:
# One-hot encode the three categorical columns and convert the encoder's
# output into a dense array.
ohe = OneHotEncoder()
sparse_onehot = ohe.fit_transform(encoded_vidgme_df[["Platform","Publisher", "Genre"]])
data = sparse_onehot.toarray()

In [None]:
# Cluster the one-hot rows into 200 groups.
# A fixed random_state makes the cluster ids reproducible between runs, which
# matters because later cells look up specific cluster numbers.
# NOTE(review): the ids used in those cells came from an unseeded run and may
# need re-picking once the seed is fixed.
# n_init is given explicitly because its default differs across scikit-learn
# versions; 10 is the long-standing default.
kmeans = KMeans(n_clusters=200, n_init=10, random_state=0)
clusters = kmeans.fit_predict(data)

In [None]:
# Copy first: a plain assignment would alias vidgme_df, so adding the
# "Cluster" column would also modify the frame used by the earlier analyses.
cluster_vidgme_df = vidgme_df.copy()
cluster_vidgme_df["Cluster"] = clusters

In [None]:
cluster_vidgme_df[cluster_vidgme_df["Cluster"] == 24]

In [None]:
cluster_vidgme_df[cluster_vidgme_df["Name"] == "Kirby Super Star Ultra"]

In [None]:
cluster_vidgme_df[cluster_vidgme_df["Cluster"] == 83]