# First Kaggle: Video Game Sales Data Visualisation and Predicting Global Sales

1. Plot stacked bar charts, line plots and bar plots to examine how different factors relate to Global_Sales.
2. Use Bokeh to create an interactive plot of cumulative sales for the top 30 publishers.
3. Train a Random Forest regressor to predict global sales.

In [None]:
import numpy as np
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
import pandas as pd
import seaborn as sns
from math import ceil

from bokeh.io import curdoc, show
from bokeh.layouts import column, layout
from bokeh.models import ColumnDataSource, CustomJSFilter, Slider, CustomJS, DateRangeSlider
from bokeh.plotting import figure, output_file, output_notebook, show
from datetime import date

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

In [None]:
# Path of the video game sales CSV, relative to the notebook's working directory.
VGSALES_CSV = '../input/videogamesales/vgsales.csv'
data = pd.read_csv(VGSALES_CSV)

**1.1 Check the data structure**

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
data.info()

**1.2. Check Null Value**

In [None]:
# Number of missing values in each column.
data.isna().sum()

**1.3. Exclude the data containing the NULL values**

In [None]:
# Keep only the rows in which both Year and Publisher are present.
data = data.dropna(subset=['Year', 'Publisher'])

In [None]:
# Re-count missing values per column after the rows above were dropped.
data.isna().sum()

**1.4 Check the sales of each Platform in each region by percentage**


In [None]:
# Columns 6..10 of the frame: four regional sales columns followed by the
# total used as the denominator. The legend labels below assume the order
# NA, EU, Japan, Other, Global -- confirm against data.columns.
names = list(data.columns)[6:11]

# Total sales per platform for each of those columns; after the transpose the
# rows are the sales columns and the columns are the platforms.
grouped = data.groupby('Platform')[names].sum()
grouped = grouped.T

# Share of each platform's total (row 4) contributed by each region (rows 0-3).
shares = grouped.iloc[:4] / grouped.iloc[4]
greenBars = list(shares.iloc[0])
orangeBars = list(shares.iloc[1])
blueBars = list(shares.iloc[2])
greyBars = list(shares.iloc[3])

r = list(range(len(grouped.columns)))
barWidth = 0.85
platform_names = list(grouped.columns)

# Stack the four regions. `bottom` is the running total of everything already
# drawn, so each layer sits on top of ALL previous layers (the last layer
# previously ignored the Japan share and overlapped it).
bottom = np.zeros(len(r))
layers = [
    (greenBars, '#b5ffb9', 'NA'),
    (orangeBars, '#f9bc86', 'EU'),
    (blueBars, '#a3acff', 'Japan'),
    (greyBars, '#888a99', 'Other'),
]
for layer_shares, colour, region_label in layers:
    plt.bar(r, layer_shares, bottom=bottom, color=colour, edgecolor='white',
            width=barWidth, label=region_label)
    # Build a new array instead of mutating the one already handed to matplotlib.
    bottom = bottom + np.asarray(layer_shares, dtype=float)

# Custom x axis
plt.xticks(r, platform_names, rotation=90)
plt.xlabel("Platform")
plt.ylabel("Share of global sales")

# Legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)

plt.show()

The regional shares may not sum to exactly 1 for every bar because the sales figures in the dataset are rounded.

In [None]:
# Distinct release years still present in the data, sorted so gaps are easy
# to spot. Both the count and the values are shown (previously the array of
# values was computed and silently discarded).
years_present = np.sort(data['Year'].unique())
print(len(years_present), 'distinct years')
years_present

**1.5. Find out the Sales in each Year for each Region by %**

In [None]:
# Columns 6..10 of the frame: four regional sales columns followed by the
# total used as the denominator. The legend labels below assume the order
# NA, EU, Japan, Other, Global -- confirm against data.columns.
names = list(data.columns)[6:11]

# Total sales per year for each of those columns; after the transpose the
# rows are the sales columns and the columns are the years.
grouped = data.groupby('Year')[names].sum()
grouped = grouped.T

# Share of each year's total (row 4) contributed by each region (rows 0-3).
shares = grouped.iloc[:4] / grouped.iloc[4]
greenBars = list(shares.iloc[0])
orangeBars = list(shares.iloc[1])
blueBars = list(shares.iloc[2])
greyBars = list(shares.iloc[3])

r = list(range(len(grouped.columns)))
barWidth = 0.85
# Tick labels: years as whole numbers rather than floats such as 1980.0.
year_labels = [int(year) for year in grouped.columns]

# Stack the four regions. `bottom` is the running total of everything already
# drawn, so each layer sits on top of ALL previous layers (the last layer
# previously ignored the Japan share and overlapped it).
bottom = np.zeros(len(r))
layers = [
    (greenBars, '#b5ffb9', 'NA'),
    (orangeBars, '#f9bc86', 'EU'),
    (blueBars, '#a3acff', 'Japan'),
    (greyBars, '#888a99', 'Other'),
]
for layer_shares, colour, region_label in layers:
    plt.bar(r, layer_shares, bottom=bottom, color=colour, edgecolor='white',
            width=barWidth, label=region_label)
    # Build a new array instead of mutating the one already handed to matplotlib.
    bottom = bottom + np.asarray(layer_shares, dtype=float)

# Custom x axis
plt.xticks(r, year_labels, rotation=90)
plt.xlabel("Year")
plt.ylabel("Share of global sales")

# Legend outside the axes so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)

plt.show()

The data for 2017 and 2020 seem to be incomplete, so let's remove the rows for those two years.

In [None]:
# Discard every row whose Year is one of the two excluded years.
excluded_years = [2017, 2020]
data = data[~data['Year'].isin(excluded_years)]

**1.6. Look at the Global Sales by Year**

In [None]:
# Total Global_Sales per Year as a two-column frame (Year, Global_Sales).
yearly_totals = data.groupby('Year')['Global_Sales'].sum()
grouped = yearly_totals.reset_index()

sns.lineplot(data=grouped, x='Year', y='Global_Sales')

**1.7. Global Sales by Platform**

In [None]:
# Total Global_Sales per Platform, largest first.
grouped = (
    data.groupby('Platform')['Global_Sales']
    .sum()
    .reset_index()
    .sort_values(by='Global_Sales', ascending=False)
)

plt.bar(grouped['Platform'], grouped['Global_Sales'])
plt.xticks(rotation=90)
plt.show()

**1.8 Global Sales by Genre**

In [None]:
# Total Global_Sales per Genre, largest first.
genre_totals = data.groupby('Genre')['Global_Sales'].sum().reset_index()
grouped = genre_totals.sort_values(by='Global_Sales', ascending=False)

plt.bar(grouped['Genre'], grouped['Global_Sales'])
plt.xticks(rotation=90)
plt.show()

**1.9 Global Sales by Genre by Year**

In [None]:
# Total Global_Sales for every (Year, Genre) pair.
grouped = data.groupby(['Year', 'Genre'])['Global_Sales'].sum().reset_index()

# Split the genres evenly over a 2x2 grid of panels. The genre list is sorted
# once up front so the same genres land in the same panel on every run
# (iterating a set of strings gives a different order per kernel).
num_graph = 4
genres = sorted(grouped['Genre'].unique())
id_per_graph = ceil(len(genres) / num_graph)

fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True, figsize=(16, 20))
for count, ax in enumerate(axes.flat):
    genre = genres[count * id_per_graph:(count + 1) * id_per_graph]
    if not genre:
        # Fewer genres than panels: hide the unused axes instead of plotting nothing.
        ax.set_visible(False)
        continue
    sns.lineplot(x='Year', y='Global_Sales', hue='Genre',
                 data=grouped[grouped['Genre'].isin(genre)], ax=ax)

**1.10 Global Sales by Publisher (top 30)**

In [None]:
# Every distinct publisher name remaining in the data.
data['Publisher'].unique()

In [None]:
# Total Global_Sales per publisher, largest first.
grouped = data.groupby('Publisher')['Global_Sales'].sum().reset_index()
grouped = grouped.sort_values(by='Global_Sales', ascending=False)

# The 30 best-selling publishers; their names are reused by later cells.
top_30 = grouped.head(30)
top_30_publisher_names = list(top_30['Publisher'])

# Spread of total sales among the top 30 publishers.
plt.boxplot(top_30['Global_Sales'])
plt.ylabel("Global_Sales")
plt.show()

# Bar chart of total sales, one bar per publisher in descending order.
r = list(range(len(top_30)))
plt.bar(r, top_30['Global_Sales'])

# Custom x axis
plt.xticks(r, top_30_publisher_names, rotation=90)
plt.xlabel("Publisher")
plt.ylabel("Global_Sales")

# No legend: the chart has a single unlabelled series, so plt.legend() would
# only emit a "no artists with labels" warning.
plt.show()

# Transposed copy (row 0 = publisher names, row 1 = sales), kept under its
# original name for any code that still expects that layout.
grouped_1 = top_30.T

**2.1 Global Sales by Publishers in 2008**

In [None]:
output_file('vbar.html')

# Games released in 2008 by one of the top 30 publishers.
dataset = data[data['Publisher'].isin(top_30_publisher_names)]
dataset = dataset[dataset['Year'] == 2008]

# Total 2008 sales per publisher, ascending so the largest bar is drawn on top.
grouped = dataset.groupby('Publisher')['Global_Sales'].sum().reset_index()
grouped = grouped.sort_values(by='Global_Sales')

publishers = list(grouped['Publisher'])
dataset1 = {'publishers': publishers,
            'global_sales': list(grouped['Global_Sales'])}
source = ColumnDataSource(data=dataset1)

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3 removed.
p = figure(y_range=publishers, width=500, height=400,
           title="Global Sales by Publishers in 2008")
p.hbar(y='publishers', height=0.5,
       right='global_sales', source=source, color="firebrick")

output_notebook()
show(p)

Get the cumulative sales for top 30 publishers

In [None]:
# Years covered by the cumulative charts.
Year = list(range(1980, 2017))

dataset = data[data['Publisher'].isin(top_30_publisher_names)].reset_index()

# Yearly sales as a Year x Publisher table. Combinations with no releases are
# filled with 0 in one step instead of appending rows one at a time in a loop.
yearly_sales = (
    dataset.groupby(['Year', 'Publisher'])['Global_Sales']
    .sum()
    .unstack('Publisher', fill_value=0)
)
# Make sure every year in `Year` has a row (matching the dtype of the data's
# Year values) while keeping any year that is already present in the data.
all_years = yearly_sales.index.union(pd.Index(Year, dtype=yearly_sales.index.dtype))
yearly_sales = (
    yearly_sales
    .reindex(index=all_years, columns=top_30_publisher_names, fill_value=0)
    .rename_axis(index='Year', columns='Publisher')
)

# Back to long form: one row per (Year, Publisher), ordered by year and then sales.
grouped = (
    yearly_sales.reset_index()
    .melt(id_vars='Year', var_name='Publisher', value_name='Global_Sales')
    .sort_values(by=['Year', 'Global_Sales'])
    .reset_index(drop=True)
)

# Rows are in year order and each publisher has exactly one row per year, so
# a per-publisher running sum is the total of all sales up to and including
# that year.
grouped['Cumulative_sales'] = grouped.groupby('Publisher')['Global_Sales'].cumsum()
cumulative_sales = list(grouped['Cumulative_sales'])

**2.2 Top 30 Publishers cumulative sales in 2016**

In [None]:
output_file('vbar.html')

# Cumulative sales reached by each publisher in 2016, ascending so the
# largest bar is drawn on top.
grouped1 = grouped[grouped['Year'] == 2016]
grouped1 = grouped1.groupby('Publisher')['Cumulative_sales'].sum().reset_index()
grouped1 = grouped1.sort_values(by='Cumulative_sales')

# This publisher order is also used as the y-axis of the slider chart.
publishers = list(grouped1['Publisher'])
dataset1 = {'publishers': publishers,
            'global_sales': list(grouped1['Cumulative_sales'])}
source = ColumnDataSource(data=dataset1)

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3 removed.
p = figure(y_range=publishers, width=1000, height=400,
           title="Cumulative Sales by Publishers in 2016")
p.hbar(y='publishers', height=0.5,
       right='global_sales', source=source, color="firebrick")

output_notebook()

show(p)

**2.3. Top 30 Publishers cumulative sales from 1980 to 2016**

In [None]:
output_file('vbar.html')

# Year shown when the chart first renders; must match the slider's start value.
initial_year = 2016
initial_rows = grouped['Year'] == initial_year

# Full table (all years) that the JavaScript callback filters from.
true_source = ColumnDataSource(data={'year': grouped.loc[:, 'Year'],
                                     'publishers': grouped.loc[:, 'Publisher'],
                                     'global_sales': grouped.loc[:, 'Cumulative_sales']})
# Rows currently drawn by the chart: only those of the selected year.
source = ColumnDataSource(data={'year': grouped.loc[initial_rows, 'Year'],
                                'publishers': grouped.loc[initial_rows, 'Publisher'],
                                'global_sales': grouped.loc[initial_rows, 'Cumulative_sales']})

# Runs in the browser on every slider move: copies the rows of the selected
# year from the full table into the plotted source and triggers a redraw.
callback = CustomJS(args=dict(source=source, ts=true_source), code='''
                    var f=cb_obj.value;

                    var data = ts.data;

                    var data1 = source.data;
                    var year = [];
                    var publishers = [];
                    var global_sales = [];

                    // iterate through rows of data source and see if each satisfies some constraint
                    for (var i = 0; i < ts.get_length(); i++){
                        if (data['year'][i] == f){
                            year.push(data['year'][i]);
                            publishers.push(data['publishers'][i]);
                            global_sales.push(data['global_sales'][i]);
                        }
                    }

                    data1['year'] = year
                    data1['publishers'] = publishers
                    data1['global_sales'] = global_sales

                    
                    source.change.emit();
            ''')

# The y-axis order comes from `publishers`, defined by the 2016 chart cell.
# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3 removed.
p = figure(y_range=publishers, width=1000, height=400,
           title="Cumulative Sales by Publishers from 1980 to 2016")
p.hbar(y='publishers', height=0.5,
       right='global_sales', source=source, color="firebrick")

date_slider = Slider(title='Year', value=initial_year,
                     start=Year[0], end=Year[-1], max_width=400)
date_slider.js_on_change('value', callback)

output_notebook()

# Named `slider_layout` so the `layout` function imported from bokeh.layouts
# is not shadowed.
slider_layout = column(date_slider, p)

show(slider_layout)

**3.0 Random Forest Regression to predict the global sales of games from the top 30 publishers**

In [None]:
from sklearn.model_selection import train_test_split

# Modelling set: games from the top 30 publishers with Global_Sales of at least 1.
dataset = data[data['Publisher'].isin(top_30_publisher_names)]
dataset = dataset[dataset['Global_Sales'] >= 1]
dataset = dataset[['Genre', 'Publisher', 'Year', 'Global_Sales']].copy()

feature_columns = ['Genre', 'Publisher', 'Year']
y = dataset['Global_Sales'].values

# One-hot encode both categorical columns in a single transformer, selected
# by name. This replaces LabelEncoder plus two chained transformers, where the
# second one addressed Publisher through a hard-coded position (12) that is
# only right when exactly 12 genres are present.
# Output column order: Publisher dummies, Genre dummies, then Year.
ct = ColumnTransformer(
    [("Publisher", OneHotEncoder(), ['Publisher']),
     ("Genre", OneHotEncoder(), ['Genre'])],
    remainder='passthrough',
)
X = ct.fit_transform(dataset[feature_columns])

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale the target to [0, 1] using the training targets only. y_train becomes
# a column vector of shape (n, 1); y_test is left unscaled here.
scaler = MinMaxScaler()
y_train = scaler.fit_transform(y_train.reshape(-1, 1))

In [None]:
import math

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Fit the forest on the scaled training target. The target is flattened to
# 1-D because scikit-learn regressors expect y of shape (n,), not (n, 1).
y_train_flat = np.ravel(y_train)
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X_train, y_train_flat)

# Predictions are on the same scaled target as y_train.
predict_train = regressor.predict(X_train)
predict_test = regressor.predict(X_test)

# Scale the test target with the scaler fitted on the TRAINING target.
# Calling fit_transform here would refit on the test set and put the two
# scores on different scales. The result goes into a new name so that
# re-running this cell does not scale y_test twice.
y_test_scaled = np.ravel(scaler.transform(np.reshape(y_test, (-1, 1))))

# RMSE in scaled target units (argument order: y_true, y_pred).
trainScore = math.sqrt(mean_squared_error(y_train_flat, predict_train))
print('Train Score: %.2f RMSE' % (trainScore))
valScore = math.sqrt(mean_squared_error(y_test_scaled, predict_test))
print('Test Score: %.2f RMSE' % (valScore))