In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import cufflinks as cf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)
cf.go_offline()

In [None]:
# Load the video game sales CSV from the Kaggle input directory into a DataFrame.
vsale = pd.read_csv('/kaggle/input/videogamesales/vgsales.csv')

In [None]:
# Print column names, dtypes and non-null counts for a first overview of the data.
vsale.info()

In [None]:
# Preview the first rows of the loaded data.
vsale.head()

#### Checking for missing values is the first thing I always do before any EDA.

In [None]:
# Number of missing values in each column.
vsale.isna().sum()

#### Now let's see whether any of the games with missing values have a significant impact on sales.

In [None]:
# Select the rows that have at least one missing value and show their five largest
# Global_Sales figures, to judge how much would be lost by dropping those rows.
vsale[vsale.isna().any(axis = 1)]['Global_Sales'].sort_values(ascending = False).head()

# It doesn't seem so.

#### It is safe to remove the rows with missing data

In [None]:
# Drop every row that contains a missing value. This mutates `vsale` in place,
# so all later cells operate on the cleaned frame.
vsale.dropna(inplace = True)

# Re-check the non-null counts to confirm that no missing values remain.
vsale.info()

## Visualization

### What are the top 10 best-selling games?

In [None]:
# The ten rows with the highest global sales.
vsale_n = vsale.sort_values('Global_Sales', ascending = False).head(10)

plt.figure(figsize = (10,5))
sns.barplot(data = vsale_n, x = 'Name', y = 'Global_Sales')

plt.title('Global Sales by Games')
plt.xlabel('Game')
# Right-align the rotated labels so the end of each name sits under its own bar.
plt.xticks(rotation = 45, ha = 'right')

# tight_layout must run last: it sizes the margins from the artists that exist
# when it is called, so calling it before the title and the rotated tick labels
# are set leaves too little room and the long game names get clipped.
plt.tight_layout()

In [None]:
# Total sales per platform. numeric_only=True keeps the text columns out of the
# aggregation; without it, pandas >= 2.0 also "sums" them by concatenating
# strings, which is slow and yields meaningless columns.
vsale_p = vsale.groupby('Platform').sum(numeric_only = True).reset_index()

# Relabel every platform whose total Global_Sales is below 100 as 'Other'.
# px.pie groups rows that share the same `names` value, so these rows are
# drawn as a single combined slice.
vsale_p.loc[vsale_p.Global_Sales < 100, 'Platform'] = 'Other'

# Donut chart of global sales share per platform.
fig = px.pie(vsale_p,
             values = 'Global_Sales',
             names = 'Platform',
             hole = 0.3,
             opacity = 0.8)

# Caption placed in the centre of the donut hole.
fig.add_annotation(text = 'Platform', x = 0.5, y = 0.5, font_size = 15,
                   showarrow = False, font_family = 'monospace')

# update_layout returns the figure, so as the last expression it is displayed.
fig.update_layout(
    font_family = 'monospace',
    hoverlabel = dict(bgcolor = 'white'))
                

In [None]:
# Total sales per genre. numeric_only=True restricts the sum to numeric columns;
# without it, pandas >= 2.0 also concatenates the text columns.
vsale_g = vsale.groupby('Genre').sum(numeric_only = True).reset_index()

# Bars of summed Global_Sales per genre, ordered from the largest total down.
fig = px.histogram(vsale_g, x = 'Genre', y = 'Global_Sales')
fig.update_xaxes(categoryorder = 'total descending')
fig.show()

In [None]:
# Sales per (genre, year). numeric_only=True keeps text columns out of the sum.
# The rows are then sorted by Year because plotly express creates animation
# frames in order of first appearance in the data; sorting guarantees that the
# slider plays chronologically even when some genre has no rows for a year.
vsale_gy = (
    vsale.groupby(['Genre', 'Year'])
         .sum(numeric_only = True)
         .reset_index()
         .sort_values('Year')
)

# Pin the y axis for the whole animation. Axes are not re-scaled between frames,
# so without a fixed range the bars of later years overflow the range that was
# auto-fitted to the first frame. 5% headroom keeps the tallest bar inside.
px.histogram(vsale_gy,
             x = 'Genre',
             y = 'Global_Sales',
             animation_frame = 'Year',
             range_y = [0, vsale_gy['Global_Sales'].max() * 1.05]
            ).update_xaxes(categoryorder = 'total descending')

### Top 10 Publishers

In [None]:
# Ten publishers with the highest total global sales.
# numeric_only=True keeps text columns out of the sum (pandas >= 2.0 would
# otherwise concatenate them). The column slice then keeps only the sales
# figures, since summing the other numeric columns per publisher is meaningless.
# NOTE(review): assumes the sales columns run contiguously from NA_Sales to
# Global_Sales - confirm against the vsale.info() output.
(
    vsale.groupby('Publisher')
         .sum(numeric_only = True)
         .loc[:, 'NA_Sales':'Global_Sales']
         .reset_index()
         .sort_values('Global_Sales', ascending = False)
         .head(10)
)

### Video game sales trend over the years

In [None]:
# Total sales per release year. numeric_only=True restricts the sum to numeric
# columns; without it, pandas >= 2.0 also concatenates the text columns.
vsale_y = vsale.groupby('Year').sum(numeric_only = True).reset_index()

fig = px.bar(vsale_y, x = 'Year', y = 'Global_Sales', text = 'Global_Sales',
             title = 'Video Games Sales since 1980')

# Summed floats carry rounding noise (long trailing digits), so format the bar
# labels to one decimal place. update_traces returns the figure, which is
# displayed as the last expression of the cell.
fig.update_traces(texttemplate = '%{text:.1f}', textposition = 'outside')

In [None]:
# Ten (platform, genre) combinations with the highest total NA_Sales.
# numeric_only=True keeps text columns out of the sum (pandas >= 2.0 would
# otherwise concatenate them); the column slice keeps only the sales figures,
# as sums of the other numeric columns carry no meaning.
# NOTE(review): assumes the sales columns run contiguously from NA_Sales to
# Global_Sales - confirm against the vsale.info() output.
(
    vsale.groupby(['Platform', 'Genre'])
         .sum(numeric_only = True)
         .loc[:, 'NA_Sales':'Global_Sales']
         .sort_values('NA_Sales', ascending = False)
         .head(10)
         .reset_index()
)

#### Top Genre

In [None]:
# Number of rows per genre, with the bars ordered from the most to the least
# frequent genre (value_counts sorts by count, descending).
plt.figure(figsize=(10, 5))
sns.countplot(
    x='Genre',
    data=vsale,
    order=vsale.Genre.value_counts().index,
)
plt.tight_layout()

In [None]:
# Total sales per genre (this reuses the name `vsale_p`, rebinding it to
# per-genre totals). numeric_only=True restricts the sum to numeric columns;
# without it, pandas >= 2.0 also concatenates the text columns.
vsale_p = vsale.groupby('Genre').sum(numeric_only = True).reset_index()

plt.figure(figsize = (10,5))
# Bars ordered from the genre with the highest Global_Sales total down.
sns.barplot(data = vsale_p, x = 'Genre', y = 'Global_Sales',
            order = vsale_p.sort_values('Global_Sales', ascending = False).Genre)
plt.xticks(rotation = 45)
plt.title('Sales by Genre')

# tight_layout runs last so the margins account for the title and the rotated
# tick labels; calling it before they are set can leave them clipped.
plt.tight_layout()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Feature matrix for clustering: the sales columns from NA_Sales to Global_Sales.
# The slice is closed on the right on purpose. An open-ended 'NA_Sales': slice
# would also pick up the string 'Clusters' column once it has been added to
# vsale, so re-running this cell after clustering would corrupt X.
# NOTE(review): assumes Global_Sales is the last sales column of the raw data -
# confirm against the vsale.info() output.
X = vsale.loc[:, 'NA_Sales':'Global_Sales'].values

### Clustering

In [None]:
# Partition the rows of X into five clusters.
# random_state pins the centroid initialisation, so the clusters - and the
# numbering of their labels, which later cells refer to - are identical on every
# Restart & Run All. n_init is set explicitly because its default value differs
# between scikit-learn releases.
kmeans = KMeans(n_clusters = 5, n_init = 10, random_state = 42)
kmeans.fit(X)

In [None]:
# Store the cluster label that the fitted KMeans model assigned to each row
# in a new 'Clusters' column.
vsale['Clusters'] = kmeans.labels_

In [None]:
# Convert the integer labels to strings; presumably so that plotly colours the
# clusters as discrete categories instead of on a continuous scale.
vsale['Clusters'] = vsale['Clusters'].astype(str)

# astype returns a new object and has no `inplace` option, hence the reassignment.

In [None]:
# Scatter of NA_Sales against Global_Sales, one point per row, coloured by
# cluster label; the hover box additionally shows the listed descriptive columns.
px.scatter(
    vsale,
    x='NA_Sales',
    y='Global_Sales',
    color='Clusters',
    hover_data=['Name', 'Genre', 'Publisher', 'Platform', 'Year'],
)

In [None]:
# Preview the frame again, now including the 'Clusters' column.
vsale.head()

In [None]:
# copy() returns an independent frame: edits made to vsale1 do not propagate
# back to vsale.
vsale1 = vsale.copy()

In [None]:
# Rename cluster '1' to 'class' in the copy only.
# The labels were converted to strings earlier, so the mask must compare against
# '1'; comparing against the integer 1 never matches. The previous chained
# assignment (`vsale['Clusters'] = vsale1.loc[...] = 'class'`) also overwrote
# the entire Clusters column of the original frame with 'class'; vsale is now
# left untouched.
vsale1.loc[vsale1.Clusters == '1', 'Clusters'] = 'class'

In [None]:
# Preview the first rows of the copy.
vsale1.head()