Hello! In this notebook, I am going to analyse the World Happiness Report dataset and look at how the other factors correlate with the Happiness Score. I will also analyse whether video game sales correlate with Happiness Scores.

This notebook is divided into several sections:
1. Preprocessing Happiness Report
2. Data Analysis (and modelling) on Happiness Report
3. Preprocessing Video Game Sales and Happiness Report for analysis with the Video Game Sales
4. Data Analysis on Video Game Sales and Happiness Report
5. Conclusions
6. Appendix

First, import necessary libraries and also load the files

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting
import re

# read input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# 1. Preprocessing Happiness Report

I am preprocessing the Happiness data by cleaning them from empty values, and combining them into a single table for easier use later on

In [None]:
# load happiness dataset and preprocess it
# One CSV per yearly report (2015-2019), each kept in its own DataFrame so the
# differing column layouts can be normalised separately in the next cell.
data2015 = pd.read_csv("../input/world-happiness/2015.csv")
data2016 = pd.read_csv("../input/world-happiness/2016.csv")
data2017 = pd.read_csv("../input/world-happiness/2017.csv")
data2018 = pd.read_csv("../input/world-happiness/2018.csv")
data2019 = pd.read_csv("../input/world-happiness/2019.csv")

In [None]:
# simplify column names
# NOTE(review): the names are assigned purely by position, so every list below
# must match the column order (and column count) of its CSV exactly -- confirm
# against the raw files. A mismatch in length raises; a mismatch in order
# silently mislabels columns.

# 2015: has Region and a standard error (SE) column
data2015.columns = ['Country', 'Region', 'Rank', 'Score',
                    'SE', 'GDP', 'Family', 'LifeExpectancy', 'Freedom', 'Corruption',
                    'Generosity', 'DystopiaResidual']

# 2016: has Region and lower/upper confidence bounds instead of SE
data2016.columns = ['Country', 'Region', 'Rank', 'Score',
                    'CILower', 'CIUpper', 'GDP', 'Family', 
                    'LifeExpectancy', 'Freedom', 'Corruption', 'Generosity', 
                    'DystopiaResidual']

# 2017: no Region; whisker high/low columns; Generosity comes before Corruption
data2017.columns = ['Country', 'Rank', 'Score','WhiskerHigh',
                    'WhiskerLow','GDP','Family', 'LifeExpectancy',
                    'Freedom','Generosity','Corruption','DystopiaResidual']

# 2018 and 2019: Rank comes first, 'Social' is used where earlier years have
# 'Family', and there is no DystopiaResidual or confidence-bound column
data2018.columns = ['Rank', 'Country', 'Score','GDP', 
                    'Social','LifeExpectancy','Freedom', 
                    'Generosity', 'Corruption']

data2019.columns = ['Rank', 'Country', 'Score', 'GDP',
                    'Social', 'LifeExpectancy', 'Freedom', 
                    'Generosity', 'Corruption']

In [None]:
# Average the Score of every country over the five yearly reports.
d2015, d2016, d2017, d2018, d2019 = (
    report[['Country', 'Score']]
    for report in (data2015, data2016, data2017, data2018, data2019)
)
# Join the years one after another on Country (default inner join), so only
# countries that appear in every report are kept.
score_df = d2015
for later_report in (d2016, d2017, d2018, d2019):
    score_df = score_df.merge(later_report, on='Country')
# Column 0 is Country; columns 1-5 are the yearly values.
score_avg = score_df.iloc[:, 1:6].apply(np.mean, axis=1)

In [None]:
# Average the GDP value of every country over the five yearly reports.
d2015, d2016, d2017, d2018, d2019 = (
    report[['Country', 'GDP']]
    for report in (data2015, data2016, data2017, data2018, data2019)
)
# Join the years one after another on Country (default inner join).
gdp_df = d2015
for later_report in (d2016, d2017, d2018, d2019):
    gdp_df = gdp_df.merge(later_report, on='Country')
# Column 0 is Country; columns 1-5 are the yearly values.
gdp_avg = gdp_df.iloc[:, 1:6].apply(np.mean, axis=1)

In [None]:
# Average the Family value of every country over the reports that have it.
# Only 2015-2017 carry a 'Family' column (2018/2019 were renamed with 'Social').
# The previous version selected 'GDP' here, which made fam_avg a copy of the
# GDP average instead of the Family average.
d2015 = data2015[['Country', 'Family']]
d2016 = data2016[['Country', 'Family']]
d2017 = data2017[['Country', 'Family']]
fam_df = d2015.merge(d2016, left_on='Country', right_on='Country').merge(
    d2017, left_on='Country', right_on='Country')
# Column 0 is Country; columns 1-3 are the yearly values.
fam_avg = fam_df.iloc[:, 1:4].apply(np.mean, axis=1)

In [None]:
# Average the LifeExpectancy value of every country over the five yearly reports.
d2015, d2016, d2017, d2018, d2019 = (
    report[['Country', 'LifeExpectancy']]
    for report in (data2015, data2016, data2017, data2018, data2019)
)
# Join the years one after another on Country (default inner join).
lfe_df = d2015
for later_report in (d2016, d2017, d2018, d2019):
    lfe_df = lfe_df.merge(later_report, on='Country')
# Column 0 is Country; columns 1-5 are the yearly values.
lfe_avg = lfe_df.iloc[:, 1:6].apply(np.mean, axis=1)

In [None]:
# Average the Corruption value of every country over the five yearly reports.
d2015, d2016, d2017, d2018, d2019 = (
    report[['Country', 'Corruption']]
    for report in (data2015, data2016, data2017, data2018, data2019)
)
# Join the years one after another on Country (default inner join).
crp_df = d2015
for later_report in (d2016, d2017, d2018, d2019):
    crp_df = crp_df.merge(later_report, on='Country')
# Column 0 is Country; columns 1-5 are the yearly values.
crp_avg = crp_df.iloc[:, 1:6].apply(np.mean, axis=1)

In [None]:
# Average the Generosity value of every country over the five yearly reports.
d2015, d2016, d2017, d2018, d2019 = (
    report[['Country', 'Generosity']]
    for report in (data2015, data2016, data2017, data2018, data2019)
)
# Join the years one after another on Country (default inner join).
gen_df = d2015
for later_report in (d2016, d2017, d2018, d2019):
    gen_df = gen_df.merge(later_report, on='Country')
# Column 0 is Country; columns 1-5 are the yearly values.
gen_avg = gen_df.iloc[:, 1:6].apply(np.mean, axis=1)

In [None]:
# Average the DystopiaResidual value of every country over 2015-2017, the only
# reports given a 'DystopiaResidual' column in the rename cell.
d2015, d2016, d2017 = (
    report[['Country', 'DystopiaResidual']]
    for report in (data2015, data2016, data2017)
)
# Join the years one after another on Country (default inner join).
dysr_df = d2015
for later_report in (d2016, d2017):
    dysr_df = dysr_df.merge(later_report, on='Country')
# Column 0 is Country; columns 1-3 are the yearly values.
dysr_avg = dysr_df.iloc[:, 1:4].apply(np.mean, axis=1)

In [None]:
# Average the lower bound of the score interval over the two reports that
# provide one: 'CILower' in 2016 and 'WhiskerLow' in 2017.
d2016 = data2016[['Country', 'CILower']]
d2017 = data2017[['Country', 'WhiskerLow']]
cilow_df = d2016.merge(d2017, left_on='Country', right_on='Country')
# Column 0 is Country; columns 1-2 are the two yearly values. The slice is
# 1:3 (not 1:5) so it states the real extent and matches the CIUpper cell.
cilow_avg = cilow_df.iloc[:, 1:3].apply(np.mean, axis=1)

In [None]:
# Average the upper bound of the score interval over the two reports that
# provide one: 'CIUpper' in 2016 and 'WhiskerHigh' in 2017.
d2016 = data2016[['Country', 'CIUpper']]
d2017 = data2017[['Country', 'WhiskerHigh']]
cihigh_df = d2016.merge(d2017, on='Country')
# Column 0 is Country; columns 1-2 are the two yearly values.
upper_bounds = cihigh_df.iloc[:, 1:3]
cihigh_avg = upper_bounds.apply(np.mean, axis=1)

In [None]:
# make new dataframe, and use the mean values of each column for the new dataframe
#
# Each *_avg Series carries the positional (0..n-1) index of the merge it was
# computed from, and those merges span different sets of years -- hence
# different sets of countries. Row i of fam_avg is therefore not necessarily
# the same country as row i of score_avg, so the averages must NOT be combined
# by index. Instead, pair every average with the Country column of its own
# merged frame and join on Country.
data_all = score_df[['Country']].assign(Score=score_avg)
for column_name, merged_df, averages in [
        ('GDP', gdp_df, gdp_avg),
        ('Family', fam_df, fam_avg),
        ('LifeExpectancy', lfe_df, lfe_avg),
        ('Corruption', crp_df, crp_avg),
        ('Generosity', gen_df, gen_avg),
        ('DystopiaResidual', dysr_df, dysr_avg),
        ('CILower', cilow_df, cilow_avg),
        ('CIUpper', cihigh_df, cihigh_avg)]:
    # merged_df['Country'] and averages share the same index, so this pairing is safe
    feature_by_country = pd.DataFrame({'Country': merged_df['Country'],
                                       column_name: averages})
    # left join keeps exactly the countries (and row order) of score_df
    data_all = data_all.merge(feature_by_country, on='Country', how='left')

In [None]:
# drop rows with empty values, and save dataframe to a csv file
data_all = data_all.dropna()
# NOTE: this bare expression is not the last line of the cell, so the shape
# is evaluated but not displayed.
data_all.shape
# relative path: the file is written into the current working directory
data_all.to_csv("all.csv", index=False)

In [None]:
# list (not load) every file under the Kaggle working directory --
# presumably to confirm that all.csv was written by the previous cell
for dirname, _, filenames in os.walk('/kaggle/working'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

I am also going to use the 2016 Happiness Report data for comparison with the Video Game Sales data. I select 2016 because it is the latest data available for the Video Game Sales dataset.

In [None]:
# load the cleaned Happiness report dataset.
# Read it from the same relative path it was written to ("all.csv" in the
# current working directory); "../working/all.csv" only resolves when the
# working directory happens to be named "working".
cleaned_df = pd.read_csv("all.csv")
# use the 2016 report for the comparison against video game sales, because
# 2016 is the latest year covered by the sales dataset
happiness = data2016
# look at the head of the 2016 data
happiness.head()

In [None]:
# Summary statistics of the 2016 report, one row per column
happiness_summary = happiness.describe().T
happiness_summary.head()

In [None]:
# look at the head of the cleaned data
cleaned_df.head()

In [None]:
# Summary statistics of the cleaned (averaged) data, one row per column
cleaned_summary = cleaned_df.describe().T
cleaned_summary.head()

# 2. Data Analysis (and modelling) on Happiness Report

Doing data analysis on the cleaned World Happiness Report dataset by comparing the different features with the Happiness scores

In [None]:
# divide countries into Happiness Score bands for the density plots.
# The bands are half-open [low, high) so a score that sits exactly on a
# boundary (4, 5, 6 or 7) belongs to one band only; Series.between() is
# inclusive on both ends and would put such a country into two bands.
# The last band also includes its upper limit of 10.
_score = cleaned_df['Score']
happycat1 = cleaned_df[(_score >= 2) & (_score < 4)]
happycat2 = cleaned_df[(_score >= 4) & (_score < 5)]
happycat3 = cleaned_df[(_score >= 5) & (_score < 6)]
happycat4 = cleaned_df[(_score >= 6) & (_score < 7)]
happycat5 = cleaned_df[(_score >= 7) & (_score <= 10)]

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of GDP vs Score.
# Sort whole rows by GDP so every x value stays paired with the Score of the
# same country. matplotlib pairs x and y by position, so sorting x on its own
# would draw each GDP value against another country's Score.
sorted_df = cleaned_df.sort_values('GDP', ascending=False)
x = sorted_df['GDP']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs GDP")
ax1.set_xlabel("GDP")
ax1.set_ylabel("Happiness Score")

# plot density graph of GDP, one curve per Happiness Score band
sns.kdeplot(data=happycat1['GDP'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['GDP'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['GDP'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['GDP'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['GDP'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs GDP Density")
ax2.set_xlabel("GDP")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of Family vs Score.
# Sort whole rows by Family so every x value stays paired with the Score of
# the same country. matplotlib pairs x and y by position, so sorting x on its
# own would draw each Family value against another country's Score.
sorted_df = cleaned_df.sort_values('Family', ascending=False)
x = sorted_df['Family']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs Family")
ax1.set_xlabel("Family")
ax1.set_ylabel("Happiness Score")

# plot density graph of Family, one curve per Happiness Score band
sns.kdeplot(data=happycat1['Family'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['Family'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['Family'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['Family'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['Family'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs Family Density")
ax2.set_xlabel("Family")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of LifeExpectancy vs Score.
# Sort whole rows by LifeExpectancy so every x value stays paired with the
# Score of the same country. matplotlib pairs x and y by position, so sorting
# x on its own would draw each value against another country's Score.
sorted_df = cleaned_df.sort_values('LifeExpectancy', ascending=False)
x = sorted_df['LifeExpectancy']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs Life Expectancy")
ax1.set_xlabel("Life Expectancy")
ax1.set_ylabel("Happiness Score")

# plot density graph of LifeExpectancy, one curve per Happiness Score band
sns.kdeplot(data=happycat1['LifeExpectancy'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['LifeExpectancy'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['LifeExpectancy'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['LifeExpectancy'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['LifeExpectancy'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs Life Expectancy Density")
ax2.set_xlabel("Life Expectancy")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of Corruption vs Score.
# Sort whole rows by Corruption so every x value stays paired with the Score
# of the same country. matplotlib pairs x and y by position, so sorting x on
# its own would draw each value against another country's Score.
sorted_df = cleaned_df.sort_values('Corruption', ascending=False)
x = sorted_df['Corruption']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs Corruption")
ax1.set_xlabel("Corruption")
ax1.set_ylabel("Happiness Score")

# plot density graph of Corruption, one curve per Happiness Score band
sns.kdeplot(data=happycat1['Corruption'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['Corruption'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['Corruption'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['Corruption'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['Corruption'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs Corruption Density")
ax2.set_xlabel("Corruption")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of Generosity vs Score.
# Sort whole rows by Generosity so every x value stays paired with the Score
# of the same country. matplotlib pairs x and y by position, so sorting x on
# its own would draw each value against another country's Score.
sorted_df = cleaned_df.sort_values('Generosity', ascending=False)
x = sorted_df['Generosity']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs Generosity")
ax1.set_xlabel("Generosity")
ax1.set_ylabel("Happiness Score")

# plot density graph of Generosity, one curve per Happiness Score band
sns.kdeplot(data=happycat1['Generosity'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['Generosity'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['Generosity'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['Generosity'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['Generosity'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs Generosity Density")
ax2.set_xlabel("Generosity")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of DystopiaResidual vs Score.
# Sort whole rows by DystopiaResidual so every x value stays paired with the
# Score of the same country. matplotlib pairs x and y by position, so sorting
# x on its own would draw each value against another country's Score.
sorted_df = cleaned_df.sort_values('DystopiaResidual', ascending=False)
x = sorted_df['DystopiaResidual']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs Dystopia Residual")
ax1.set_xlabel("Dystopia Residual")
ax1.set_ylabel("Happiness Score")

# plot density graph of DystopiaResidual, one curve per Happiness Score band
sns.kdeplot(data=happycat1['DystopiaResidual'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['DystopiaResidual'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['DystopiaResidual'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['DystopiaResidual'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['DystopiaResidual'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs Dystopia Residual Density")
ax2.set_xlabel("Dystopia Residual")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of CILower vs Score.
# Sort whole rows by CILower so every x value stays paired with the Score of
# the same country. matplotlib pairs x and y by position, so sorting x on its
# own would draw each value against another country's Score.
sorted_df = cleaned_df.sort_values('CILower', ascending=False)
x = sorted_df['CILower']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs CILower")
ax1.set_xlabel("CILower")
ax1.set_ylabel("Happiness Score")

# plot density graph of CILower, one curve per Happiness Score band
sns.kdeplot(data=happycat1['CILower'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['CILower'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['CILower'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['CILower'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['CILower'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs CILower Density")
ax2.set_xlabel("CILower")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

In [None]:
# plot line and density together
plt.style.use('ggplot')

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# plot line graph of CIUpper vs Score.
# Sort whole rows by CIUpper so every x value stays paired with the Score of
# the same country. matplotlib pairs x and y by position, so sorting x on its
# own would draw each value against another country's Score.
sorted_df = cleaned_df.sort_values('CIUpper', ascending=False)
x = sorted_df['CIUpper']
y = sorted_df['Score']

ax1.plot(x.values, y.values, color='black', linewidth=1)
ax1.title.set_text("Happiness Score vs CIUpper")
ax1.set_xlabel("CIUpper")
ax1.set_ylabel("Happiness Score")

# plot density graph of CIUpper, one curve per Happiness Score band
sns.kdeplot(data=happycat1['CIUpper'], label="2-4", color='salmon', ax=ax2)
sns.kdeplot(data=happycat2['CIUpper'], label="4-5", color='y', ax=ax2)
sns.kdeplot(data=happycat3['CIUpper'], label="5-6", color='mediumseagreen', ax=ax2)
sns.kdeplot(data=happycat4['CIUpper'], label="6-7", color='deepskyblue', ax=ax2)
sns.kdeplot(data=happycat5['CIUpper'], label="7-10", color='violet', ax=ax2)

ax2.title.set_text("Happiness Score vs CIUpper Density")
ax2.set_xlabel("CIUpper")
# the vertical axis of a KDE plot is the estimated density, not the score
ax2.set_ylabel("Density")
# draw the legend explicitly so the band labels are always shown
ax2.legend(title="Happiness Score")
plt.show()

Generally most variables/ features have a linear relationship with the Happiness score, meaning that when the value of the variable increases, the happiness score increases, and when the value of the variable decreases, the happiness score also decreases. This was also confirmed in the Density Graphs, where the country groups generally have maximum density on different values.

Although the relationship is generally linear, the line graphs are not completely linear, meaning that some countries could have a high value of a variable, but still have a slightly lower Happiness score than countries with similar values of the variable, which results in a spike like graph. 

For example:

In [None]:
# Pick out the rows of two countries whose GDP values are close to each other
gdp_illust = {'Netherlands', 'Saudi Arabia'}
is_illustrated = cleaned_df['Country'].isin(gdp_illust)
cleaned_df.loc[is_illustrated]

For example, two countries with similar GDP of around 1.4 such as the Netherlands and Saudi Arabia have a difference of Happiness scores of around 1.0.

Data Analysis on Happiness Report done, now I'm going to do Modelling on the data.

In [None]:
# import libaries for modelling
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# Define Label and Data
# Label = Score, the variable we're trying to predict
y = cleaned_df.Score
# Features/Factors
# NOTE(review): CILower / CIUpper look like the lower and upper bounds of an
# interval around Score itself, and DystopiaResidual is presumably a component
# of Score -- if so, they leak the label into the features and make the MAE
# look better than it is. Confirm against the dataset description.
happiness_features = ['GDP', 'Family', 'LifeExpectancy', 'Corruption',
'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper']
X = cleaned_df[happiness_features]

In [None]:
# Build decision tree model (random_state fixed so the fit is reproducible)
decision_tree = DecisionTreeRegressor(random_state = 1)
# and Fit model on ALL rows -- no data is held out at this point
decision_tree.fit(X,y)

In [None]:
# Validate model
# In-sample error: the tree fitted in the previous cell is scored on the very
# rows it was trained on, so this number is optimistic by construction.
dtScores = decision_tree.predict(X)
MAE_before = mean_absolute_error(y, dtScores)
# Split data into training and validation
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define and Fit model on the training rows only.
# random_state is fixed (as for the first tree) so the result is reproducible.
decision_tree = DecisionTreeRegressor(random_state = 1)
decision_tree.fit(train_X, train_y)
# Get predicted results for rows the model has never seen
validation_predictions = decision_tree.predict(val_X)
MAE_after = mean_absolute_error(val_y, validation_predictions)
# Compare MAE. Both models are trained; what differs is the data they are
# scored on, so the labels say "in-sample" / "validation" rather than
# "before training" / "after training".
print("Decision Trees MAE in-sample (fit and scored on all data): ", MAE_before)
print("Decision Trees MAE on held-out validation data: ", MAE_after)

In [None]:
# Re-create the label and the features so this cell does not rely on the
# variables left behind by the decision-tree cells.
happiness_features = ['GDP', 'Family', 'LifeExpectancy', 'Corruption',
                      'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper']
X = cleaned_df[happiness_features]
y = cleaned_df['Score']
# Hold out part of the rows for validation
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Fit a random forest on the training rows and predict the held-out rows
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
forest_preds = forest_model.predict(val_X)
# Output MAE
forest_mae = mean_absolute_error(val_y, forest_preds)
print("Random Forest MAE: ", forest_mae)

That concludes the analysis of the Happiness Report. We can conclude that GDP, Family, Life Expectancy, Dystopia Residual, CILower, and CIUpper are all related to the happiness scores of the countries, which suggests that these factors have an effect on an individual's happiness. Improving them could therefore improve happiness.

Not only that, Decision Trees and Random Forests models performed quite well, with MAE being quite low on both, although usually Random Forests perform better than Decision Trees.

Now that that's done, I am going to correlate Video Game Sales to happiness. 

# 3. Preprocessing on Video Game Sales and Happiness Reports

First, I am going to preprocess the game sales data by cleaning the data from empty values.

In [None]:
# load video game sales dataset and look at the head
game_df = pd.read_csv("../input/videogamesales/vgsales.csv")
game_df.head()

In [None]:
# Count the rows that have at least one missing value
is_NaN = game_df.isna()                 # True wherever a cell is missing
row_has_NaN = is_NaN.any(axis=1)        # True for rows with any missing cell
rows_with_NaN = game_df.loc[row_has_NaN]
print("No. of Rows with Missing Values:", len(rows_with_NaN))

In [None]:
# look where the missing values are 
game_df.isnull().sum()

So from the looks of it, null values are on Year and Publisher, with most of the null values existing in the Year Column, and some have both null. For Publisher, since I am going to only look at the sales, the Publisher won't be important here, so I simply impute the Publisher with unknown, and use year from game title, or just use median of the gaming platform as the year.

In [None]:
# impute null values on publisher with Unknown
# (kept for inspection: the rows whose Publisher was missing before imputation)
null_pub = game_df[game_df.Publisher.isnull()]
# fillna handles every missing Publisher in one vectorised step instead of
# writing the rows one at a time inside an iterrows() loop
game_df['Publisher'] = game_df['Publisher'].fillna('Unknown')

In [None]:
# Fill a missing Year from a four-digit number found in the game title.
# Because of the greedy '.*', the LAST such number in the title is the one used.
null_years = game_df[game_df['Year'].isnull()]

for index, row in null_years.iterrows():
    match = re.match(r'.*([1-3][0-9]{3})', row['Name'])
    if match is None:
        continue  # no four-digit number in this title
    game_df.loc[index, 'Year'] = float(match.group(1))

In [None]:
# Diagnostic only: list the titles (still without a Year) that end in a
# two-digit number which could be read as a short year (e.g. 07 = 2007),
# so they can be checked by hand. Nothing is written to game_df here.
null_years = game_df[game_df['Year'].isnull()]

for index, row in null_years.iterrows():
    match = re.match(r'.*([0-9]{2})', row['Name'])
    if match is None:
        continue  # no two-digit number in this title
    currentYear = float(match.group(1))
    # values from 17 to 80 are not treated as short years
    if not (17 <= currentYear <= 80):
        print ("Index:", index, ", Title:", row['Name'], ",Year:", currentYear)

Some of the numbers here are not exactly years, so look at the games and manually analyse those. After analysis, turns out that:
* Index: 12015, Title: Drake of the 99 Dragons, actual year: 2003
* Index: 6283, Title: Indy 500, actual Year: 1977

In [None]:
# and impute these games Year values with its actual year
# NOTE(review): 12015 and 6283 are index labels of this exact vgsales.csv
# (the titles found by the manual check above); they would point at other
# rows if the file changes -- confirm when the dataset is updated.
game_df.loc[12015,'Year'] = float(2003)
game_df.loc[6283,'Year'] = float(1977)

In [None]:
# Fill a missing Year from a short pre-millennium year in the title
# (e.g. 99 = 1999): only two-digit numbers above 80 are used.
null_years = game_df[game_df['Year'].isnull()]

for index, row in null_years.iterrows():
    match = re.match(r'.*([0-9]{2})', row['Name'])
    if match is None:
        continue  # no two-digit number in this title
    currentYear = float(match.group(1))
    if currentYear <= 80:
        continue  # not a 19xx short year
    currentYear = 1900 + currentYear
    print ("Index:", index, ", Title:", row['Name'], ",Year:", currentYear)
    game_df.loc[index, 'Year'] = float(currentYear)

In [None]:
# Fill a missing Year from a short post-millennium year in the title
# (e.g. 07 = 2007): only two-digit numbers below 17 are used.
null_years = game_df[game_df['Year'].isnull()]

for index, row in null_years.iterrows():
    match = re.match(r'.*([0-9]{2})', row['Name'])
    if match is None:
        continue  # no two-digit number in this title
    currentYear = float(match.group(1))
    if currentYear >= 17:
        continue  # not a 20xx short year
    currentYear = 2000 + currentYear
    print ("Index:", index, ", Title:", row['Name'], ",Year:", currentYear)
    game_df.loc[index, 'Year'] = float(currentYear)

In [None]:
# look thru remaining empty Years

null_years = game_df[game_df.Year.isnull()]

null_years

In [None]:
# get unique Platforms in the dataset
game_df.Platform.unique()

In [None]:
# impute using median years of the console its released on
# Median calculated manually using the Console Lifespans from Wikipedia
# Maps a Platform code (as used in game_df.Platform) to the year that is
# assigned to games of that platform whose Year is still missing.
# NOTE(review): the values are hand-entered; some (e.g. 'PC': 2016) look more
# like a latest year than a median -- confirm against the source.

avgYears = {'Wii': 2009, 'NES':1990, 'GB':1996, 'DS':2008, 'X360':2011, 'PS3':2012, 'PS2':2006, 'SNES':1996, 'GBA':2006,
            '3DS':2013, 'PS4':2014, 'N64':1999, 'PS':2000, 'XB':2005, 'PC': 2016, '2600':1985, 'PSP':2009, 'XOne':2016, 
            'GC':2004,'WiiU':2014, 'GEN':1993, 'DC':1999, 'PSV':2015, 'SAT':1997, 'SCD':1994, 'WS':2001, 'NG':1997, 
            'TG16':1990,'3DO':1995, 'GG':1993, 'PCFX':1996}

In [None]:
# Fill every Year that is still missing with the year recorded in avgYears
# for the game's platform.
missing_year = game_df['Year'].isnull()
null_years = game_df[missing_year]

# Fail loudly (as a plain dict lookup would) if a platform has no entry,
# instead of silently leaving its games without a Year.
unknown_platforms = set(null_years['Platform']) - set(avgYears)
if unknown_platforms:
    raise KeyError("No year defined in avgYears for platform(s): "
                   + ", ".join(sorted(str(p) for p in unknown_platforms)))

# Label-based and vectorised. The previous loop looked rows up with
# game_df.iloc[index], i.e. it used an index *label* as a *position*, which
# is only correct while game_df has its default 0..n-1 index.
game_df.loc[missing_year, 'Year'] = (
    game_df.loc[missing_year, 'Platform'].map(avgYears).astype(float))

In [None]:
# check if there is empty left
game_df.isnull().sum()

All empty values are handled! Now let us look at the data

In [None]:
# Summary statistics of the game sales data, one row per numeric column
game_sum = game_df.describe().T
game_sum.head()

As you can see from the summary, this dataset is last updated in 2016, but there are some games that have a release year beyond that. Let's analyse it even further

In [None]:
# look at the game list sorted based on years descending
game_df.sort_values(by=['Year'],ascending=False)

As you can see, only one game is from 2020, which is a Nintendo DS game called "Imagine: Make up artist". This is weird because the NDS was discontinued in 2013, yet the game is supposedly from 2020? That's definitely invalid. Not only that, the video game sales dataset was last updated in October 2016, which means this entry is definitely invalid, and possibly so are other games that have a release year beyond 2016.

For these games, I need to do further analysis. A Google search shows that "Imagine: Make up artist" was actually released in 2009, so I will set the year value to 2009 for this row. "Brothers Conflict: Precious Baby" was released in 2016, and "Phantasy Star Online 2 Episode 4: Deluxe Package" was also released in 2016, so I am going to set the years of these games to their actual release years.

In [None]:
# Fix invalid data by its index.
# NOTE(review): these are hard-coded index labels of this exact vgsales.csv;
# they would point at other rows if the file changes -- confirm the titles
# when the dataset is updated.
game_df.loc[5957,'Year'] = 2009
game_df.loc[16438,'Year'] = 2016
game_df.loc[14390,'Year'] = 2016
game_df.loc[16241,'Year'] = 2016

# Alternative (not used): drop the rows instead of correcting them. Note that
# DataFrame.drop returns a new frame, so the result would have to be assigned
# back, e.g. game_df = game_df.drop([5957, 16438, 14390, 16241])

Now let's look at the recent games, which are games that are released on consoles that are still active at the time of the sales dataset last update, 2016. Active consoles are retrieved manually from Wikipedia. If console lifespan is until > 2016, count that console as active.

In [None]:
# Platform codes treated as still active in 2016 (hand-picked, see the text above)
active_consoles = ['3DS', 'PS4', 'PC', 'XOne', 'WiiU', 'PSV']
# keep only the games released on one of those platforms
recent_games = game_df[game_df.Platform.isin(active_consoles)]
recent_games.head()

In [None]:
# function for getting the sum of the sales of each region
def getSumSales(df):
    """Return the column totals of the five sales columns of `df`.

    The result is a tuple in the fixed order
    (NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales).
    """
    sales_columns = ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales')
    return tuple(df[column].sum() for column in sales_columns)

In [None]:
# function to make new data frame of sum sales and append the sums into the df
def makeSumDF(df):
    """Build a five-row table of total sales per region.

    Parameters
    ----------
    df : DataFrame holding the sales columns read by getSumSales.

    Returns
    -------
    DataFrame with columns 'Country' (region label) and 'Sales' (total),
    one row each for North America, European Union, Japan,
    Rest of the World and Global, in that order.
    """
    region_labels = ['North America', 'European Union', 'Japan',
                     'Rest of the World', 'Global']
    # getSumSales returns its totals in the same region order as region_labels
    region_totals = getSumSales(df)
    # Built in one step: DataFrame.append was removed in pandas 2.0, and growing
    # a frame row by row is unnecessary here.
    return pd.DataFrame({'Country': region_labels, 'Sales': list(region_totals)})

In [None]:
# make recent sales dataframe: total sales per region for games on active consoles only
recent_sumSales = makeSumDF(recent_games)
recent_sumSales

In [None]:
# make all time sales dataframe: total sales per region over every game in the dataset
sumSales = makeSumDF(game_df)
sumSales

Preprocess the Happiness dataframes for analysis with the Video Game Sales and make new dataframe for the combined values

In [None]:
# make new empty dataframe just for the regions in the video game sales dataset
# NOTE: this runs before 'Region' and 'Rank' are dropped from `happiness`,
# so the new frame still carries those two columns.
happiness_regions = pd.DataFrame(columns = happiness.columns)

In [None]:
# drop the features that are not used in the regional comparison
# errors='ignore' keeps the cell safe to re-run: once the columns are gone,
# a second drop would otherwise raise a KeyError.
happiness = happiness.drop(columns=['Region', 'Rank'], errors='ignore')

In [None]:
# Combine the Happiness report countries to follow the game sales regions
# Japan is a region of its own in the sales data, so its happiness row is taken as-is
jpn_whr = happiness[happiness.Country == 'Japan']
jpn_whr

In [None]:
# get the countries in the regions and get its subset for insertion in new dataset later on

# countries counted as the North America sales region
na_countries = ['Canada', 'United States', 'Mexico']
# isin() keeps only the rows whose Country is in the list
na_whr = happiness[happiness.Country.isin(na_countries)]
na_whr

In [None]:
# inspect the column names of the happiness table
happiness.columns

In [None]:
# append the regional values into the new dataframe
# Each regional value is the plain (unweighted) mean over the region's countries.
region_cols = ['Score', 'CILower', 'CIUpper', 'GDP', 'Family', 'LifeExpectancy',
               'Freedom', 'Corruption', 'Generosity', 'DystopiaResidual']
na_row = {'Country': 'North America',
          **{col: na_whr[col].mean() for col in region_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
happiness_regions = pd.concat([happiness_regions, pd.DataFrame([na_row])],
                              ignore_index = True)

For EU region, for simplicity, I am going to use European Union member countries for it. I am going to count in the European Union Members in 2016, before Brexit (which officially happened in January 2020). From the [EU Official Page](https://europa.eu/european-union/about-eu/countries_en), EU members were: 
Austria, Belgium, Bulgaria, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Ireland, Italy, Latvia, Lithuania, Luxembourg, Malta, Netherlands, Poland, Portugal, Romania, Slovakia, Slovenia, Spain, Sweden, United Kingdom.

In [None]:
# the 28 countries counted as the European Union sales region (United Kingdom included)
EU_countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 
                'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 
                'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 
                'Spain', 'Sweden', 'United Kingdom']

# isin() silently skips names that do not occur in the happiness table
# NOTE(review): confirm every member state is spelled the same way in the dataset
EU_whr = happiness[happiness.Country.isin(EU_countries)]
EU_whr

In [None]:
# add the European Union row: unweighted mean of each feature over the EU countries
region_cols = ['Score', 'CILower', 'CIUpper', 'GDP', 'Family', 'LifeExpectancy',
               'Freedom', 'Corruption', 'Generosity', 'DystopiaResidual']
eu_row = {'Country': 'European Union',
          **{col: EU_whr[col].mean() for col in region_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
happiness_regions = pd.concat([happiness_regions, pd.DataFrame([eu_row])],
                              ignore_index = True)

In [None]:
# add Japan's own row unchanged
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
happiness_regions = pd.concat([happiness_regions, jpn_whr], ignore_index = True)

In [None]:
# Rest of the world: every country that belongs to neither North America, the EU nor Japan

# all countries already assigned to a named sales region
game_regions = na_countries + EU_countries + ['Japan']
in_named_region = happiness['Country'].isin(game_regions)
RestofWorld = happiness[~in_named_region]
RestofWorld

In [None]:
# add the Rest of the World row: unweighted mean of each feature over the remaining countries
region_cols = ['Score', 'CILower', 'CIUpper', 'GDP', 'Family', 'LifeExpectancy',
               'Freedom', 'Corruption', 'Generosity', 'DystopiaResidual']
rest_row = {'Country': 'Rest of the World',
            **{col: RestofWorld[col].mean() for col in region_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
happiness_regions = pd.concat([happiness_regions, pd.DataFrame([rest_row])],
                              ignore_index = True)

In [None]:
# add the Global row: unweighted mean of each feature over every country in the table
region_cols = ['Score', 'CILower', 'CIUpper', 'GDP', 'Family', 'LifeExpectancy',
               'Freedom', 'Corruption', 'Generosity', 'DystopiaResidual']
global_row = {'Country': 'Global',
              **{col: happiness[col].mean() for col in region_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
happiness_regions = pd.concat([happiness_regions, pd.DataFrame([global_row])],
                              ignore_index = True)

In [None]:
# show the assembled regional happiness table
happiness_regions

In [None]:
# make combined region dataframe from cleaned dataframe
# starts empty, with the same columns as cleaned_df; one row per region is added below
cleaned_regions = pd.DataFrame(columns = cleaned_df.columns)

In [None]:
# split the cleaned happiness table into the same regions as the sales data
na_cleaned = cleaned_df[cleaned_df['Country'].isin(na_countries)]
eu_cleaned = cleaned_df[cleaned_df['Country'].isin(EU_countries)]
jpn_cleaned = cleaned_df[cleaned_df['Country'] == 'Japan']
# everything that is not in one of the named regions
other_cleaned = cleaned_df[~cleaned_df['Country'].isin(game_regions)]

In [None]:
# inspect the column names of the cleaned happiness table
cleaned_df.columns

In [None]:
# and append them into the new dataframe
# Each regional value is the plain (unweighted) mean over the region's countries.
cleaned_cols = ['Score', 'GDP', 'Family', 'LifeExpectancy', 'Corruption',
                'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper']
na_cleaned_row = {'Country': 'North America',
                  **{col: na_cleaned[col].mean() for col in cleaned_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
cleaned_regions = pd.concat([cleaned_regions, pd.DataFrame([na_cleaned_row])],
                            ignore_index = True)

In [None]:
# add the European Union row: unweighted mean of each feature over the EU countries
cleaned_cols = ['Score', 'GDP', 'Family', 'LifeExpectancy', 'Corruption',
                'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper']
eu_cleaned_row = {'Country': 'European Union',
                  **{col: eu_cleaned[col].mean() for col in cleaned_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
cleaned_regions = pd.concat([cleaned_regions, pd.DataFrame([eu_cleaned_row])],
                            ignore_index = True)

In [None]:
# add the Japan row: mean of each feature over the rows selected for Japan
cleaned_cols = ['Score', 'GDP', 'Family', 'LifeExpectancy', 'Corruption',
                'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper']
jpn_cleaned_row = {'Country': 'Japan',
                   **{col: jpn_cleaned[col].mean() for col in cleaned_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
cleaned_regions = pd.concat([cleaned_regions, pd.DataFrame([jpn_cleaned_row])],
                            ignore_index = True)

In [None]:
# add the Rest of the World row: unweighted mean of each feature over the remaining countries
cleaned_cols = ['Score', 'GDP', 'Family', 'LifeExpectancy', 'Corruption',
                'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper']
other_cleaned_row = {'Country': 'Rest of the World',
                     **{col: other_cleaned[col].mean() for col in cleaned_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
cleaned_regions = pd.concat([cleaned_regions, pd.DataFrame([other_cleaned_row])],
                            ignore_index = True)

In [None]:
# add the Global row: unweighted mean of each feature over every country in cleaned_df
cleaned_cols = ['Score', 'GDP', 'Family', 'LifeExpectancy', 'Corruption',
                'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper']
global_cleaned_row = {'Country': 'Global',
                      **{col: cleaned_df[col].mean() for col in cleaned_cols}}
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
cleaned_regions = pd.concat([cleaned_regions, pd.DataFrame([global_cleaned_row])],
                            ignore_index = True)

In [None]:
# show the assembled regional table built from the cleaned data
cleaned_regions

# 4. Data Analysis on Video Game Sales and Happiness Report

In [None]:
# colour used for each sales region in the plots below
region_colors = {'North America': 'red',
                 'European Union': 'blue',
                 'Japan': 'salmon',
                 'Rest of the World': 'goldenrod',
                 'Global': 'green'}

# The plotting cells build their lookup key with Series.to_string(index=False).
# Depending on the pandas version that string may or may not start with a space,
# so every region is registered under both spellings to avoid a KeyError.
regionCmap = {}
for region, color in region_colors.items():
    regionCmap[region] = color
    regionCmap[' ' + region] = color

In [None]:
# recent sales per region next to the 2016 happiness score of the same region
# .assign returns a new frame; pd.DataFrame(recent_sumSales) can share data with the
# source frame, in which case adding 'Score' would also alter recent_sumSales.
recent_happiness = recent_sumSales.assign(Score=happiness_regions['Score'])
recent_happiness

In [None]:
# x: 2016 happiness score per region, y: recent game sales per region (same row order)
x = happiness_regions['Score']
y = recent_sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

ax.set_title("Recent Game Sales vs 2016 Happiness Score")
ax.set_xlabel("Happiness Score")
ax.set_ylabel("Game Sales (in Millions)")

# one scatter point per region, coloured and annotated with the region name
for score, sales in zip(x, y):
    # recover the region name from its sales total
    region_name = recent_sumSales['Country'][recent_sumSales['Sales'] == sales]
    label = region_name.to_string(index=False)
    ax.scatter(score, sales, c=regionCmap[label], label=label)

    # region name just above the point
    ax.annotate(label,
                (score, sales),
                textcoords="offset points",
                xytext=(0, 5),
                ha='center')
ax.legend()
plt.show()

In [None]:
# all-time sales per region next to the 2016 happiness score of the same region
# .assign returns a new frame; pd.DataFrame(sumSales) can share data with the
# source frame, in which case adding 'Score' would also alter sumSales.
all_happiness = sumSales.assign(Score=happiness_regions['Score'])
all_happiness

In [None]:
# x: 2016 happiness score per region, y: all-time game sales per region (same row order)
x = happiness_regions['Score']
y = sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7,5)

# this figure shows the all-time totals (sumSales), so the title must say so
plt.title("All Time Game Sales vs 2016 Happiness Score")
plt.xlabel("Happiness Score")
plt.ylabel("Game Sales (in Millions)")

# plot every points as scatter graph
for pointx, pointy in zip(x,y):
    # recover the region name from its sales total
    label = sumSales.Country[sumSales['Sales'] == pointy].to_string(index=False)
    plt.scatter(pointx, pointy, c = regionCmap[label], label = label)
    
    # annotate labels on the datapoints
    plt.annotate(label,
                 (pointx, pointy),
                 textcoords="offset points",
                 xytext=(0,5),
                 ha='center')
ax.legend()
plt.show()

In [None]:
# recent sales per region next to the cleaned happiness score of the same region
# .assign returns a new frame; pd.DataFrame(recent_sumSales) can share data with the
# source frame, in which case adding 'Score' would also alter recent_sumSales.
recent_cleaned = recent_sumSales.assign(Score=cleaned_regions['Score'])
recent_cleaned

In [None]:
# x: cleaned happiness score per region, y: recent game sales per region (same row order)
x = cleaned_regions['Score']
y = recent_sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

ax.set_title("Recent Game Sales vs Cleaned Happiness Score")
ax.set_xlabel("Happiness Score")
ax.set_ylabel("Game Sales (in Millions)")

# one scatter point per region, coloured and annotated with the region name
for score, sales in zip(x, y):
    # recover the region name from its sales total
    region_name = recent_sumSales['Country'][recent_sumSales['Sales'] == sales]
    label = region_name.to_string(index=False)
    ax.scatter(score, sales, c=regionCmap[label], label=label)

    # region name just above the point
    ax.annotate(label,
                (score, sales),
                textcoords="offset points",
                xytext=(0, 10),
                ha='center')

ax.legend()
plt.show()

In [None]:
# all-time sales per region next to the cleaned happiness score of the same region
# .assign returns a new frame; pd.DataFrame(sumSales) can share data with the
# source frame, in which case adding 'Score' would also alter sumSales.
all_cleaned = sumSales.assign(Score=cleaned_regions['Score'])
all_cleaned

In [None]:
# x: cleaned happiness score per region, y: all-time game sales per region (same row order)
x = cleaned_regions['Score']
y = sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

ax.set_title("All Time Game Sales vs Cleaned Happiness Score")
ax.set_xlabel("Happiness Score")
ax.set_ylabel("Game Sales (in Millions)")

# one scatter point per region, coloured and annotated with the region name
for score, sales in zip(x, y):
    # recover the region name from its sales total
    region_name = sumSales['Country'][sumSales['Sales'] == sales]
    label = region_name.to_string(index=False)
    ax.scatter(score, sales, c=regionCmap[label], label=label)

    # region name just above the point
    ax.annotate(label,
                (score, sales),
                textcoords="offset points",
                xytext=(0, 10),
                ha='center')

ax.legend()
plt.show()

Plot combined bar chart

In [None]:
# x: 2016 happiness score per region; y / altY: all-time and recent sales per region
x = happiness_regions['Score']
y = sumSales['Sales']
altY = recent_sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

ax.set_title("Game Sales vs 2016 Happiness Scores")
ax.set_xlabel("Happiness Score")
ax.set_ylabel("Game Sales (in Millions)")

# all-time sales: one translucent bar per region, positioned at the region's happiness score
for score, sales in zip(x, y):
    label = sumSales['Country'][sumSales['Sales'] == sales].to_string(index=False)
    ax.bar(score, sales, 0.1, color=regionCmap[label], label=label, alpha=0.5)
    # write the rounded happiness score above the bar
    ax.annotate(round(score, 2),
                (score, sales),
                textcoords="offset points",
                xytext=(0, 10),
                ha='center')

# recent sales: drawn over the all-time bars in the same colour, without a legend entry
for score, sales in zip(x, altY):
    label = recent_sumSales['Country'][recent_sumSales['Sales'] == sales].to_string(index=False)
    ax.bar(score, sales, 0.1, color=regionCmap[label], alpha=0.5)

ax.legend()
plt.show()

In [None]:
# x: cleaned happiness score per region; y / altY: all-time and recent sales per region
x = cleaned_regions['Score']
y = sumSales['Sales']
altY = recent_sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

ax.set_title("Game Sales vs Cleaned Happiness Scores")
ax.set_xlabel("Happiness Score")
ax.set_ylabel("Game Sales (in Millions)")

# all-time sales: one translucent bar per region, positioned at the region's happiness score
for score, sales in zip(x, y):
    label = sumSales['Country'][sumSales['Sales'] == sales].to_string(index=False)
    ax.bar(score, sales, 0.1, color=regionCmap[label], label=label, alpha=0.5)
    # write the rounded happiness score above the bar
    ax.annotate(round(score, 2),
                (score, sales),
                textcoords="offset points",
                xytext=(0, 10),
                ha='center')

# recent sales: drawn over the all-time bars in the same colour, without a legend entry
for score, sales in zip(x, altY):
    label = recent_sumSales['Country'][recent_sumSales['Sales'] == sales].to_string(index=False)
    ax.bar(score, sales, 0.1, color=regionCmap[label], alpha=0.5)

ax.legend()
plt.show()

# 5. Conclusions

In conclusion, several factors could possibly affect the happiness of a country or a region. Factors such as GDP, Family, Life Expectancy, CILower, and CIUpper have a linear relationship with the Happiness Scores, meaning that the higher these values are, the higher the happiness scores will be. Corruption, Generosity and Dystopia Residual, on the other hand, do not really have a big influence on the Happiness Scores, as most countries are densely clustered around similar values. Video game sales do have an effect on the happiness scores of a region: the higher the game sales of a region are, the higher its happiness scores are. Globally, however, the Happiness Scores are still quite low even though the video game sales were high, which means video games definitely affect Happiness but are not the main factor.

From the results of the analysis in this project, the conclusion can be drawn that the higher the GDP of a country, the stronger the family aspect, the higher the life expectancy, and the higher the video game sales, the happier the population of the country/region is. Therefore, in order to increase the happiness of the population of a country/region, one could try to increase GDP, Family, Life Expectancy, and video game sales.


# 6. Appendix

Appendix contains extra graphs, and modelling attempt on the Video Game sales and Happiness Report

In [None]:
# GDP vs Happiness vs Game Sales
# x: cleaned happiness score, y: all-time game sales, z: GDP (one value per region)
x = cleaned_regions['Score']
y = sumSales['Sales']
z = cleaned_regions['GDP']

plt.style.use('ggplot')
# A single figure with a single 3D axes. Calling plt.figure(), plt.subplots() and
# plt.axes() in turn leaves an empty extra figure and an unused 2D axes behind.
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(projection='3d')

ax.set_title("All Time Game Sales vs Cleaned Happiness Score and GDP")

ax.set_xlabel('Happiness Score')
ax.set_ylabel('Game Sales (in Millions)')
ax.set_zlabel('GDP')

# one 3D point per region, coloured by region
for pointx, pointy, pointz in zip(x,y,z):
    # recover the region name from its sales total
    label = sumSales.Country[sumSales['Sales'] == pointy].to_string(index=False)
    ax.scatter3D(pointx, pointy, pointz, c = regionCmap[label], label = label)
# the points carry labels, so show them
ax.legend()
plt.show()

In [None]:
# GDP vs Happiness vs Game Sales
# x: cleaned happiness score, y: recent game sales, z: GDP (one value per region)
x = cleaned_regions['Score']
y = recent_sumSales['Sales']
z = cleaned_regions['GDP']

plt.style.use('ggplot')
# A single figure with a single 3D axes. Calling plt.figure(), plt.subplots() and
# plt.axes() in turn leaves an empty extra figure and an unused 2D axes behind.
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(projection='3d')

ax.set_title("Recent Game Sales vs Cleaned Happiness Score and GDP")
ax.set_xlabel('Happiness Score')
ax.set_ylabel('Game Sales (in Millions)')
ax.set_zlabel('GDP')

# one 3D point per region, coloured by region
for pointx, pointy, pointz in zip(x,y,z):
    # recover the region name from its sales total
    label = recent_sumSales.Country[recent_sumSales['Sales'] == pointy].to_string(index=False)
    ax.scatter3D(pointx, pointy, pointz, c = regionCmap[label], label = label)
# the points carry labels, so show them
ax.legend()
plt.show()

Modelling on VGSales + Happiness Report Data

In [None]:
# try modelling

# Build the two tables used for prediction. Each is an independent copy of
# cleaned_regions with one extra 'VGSales' column.

# regional features + all-time game sales
cleaned_sumSales = cleaned_regions.assign(VGSales=sumSales['Sales'])

# regional features + recent game sales
cleaned_RsumSales = cleaned_regions.assign(VGSales=recent_sumSales['Sales'])

In [None]:
# target: happiness score of each region
y = cleaned_RsumSales['Score']

# predictors: the happiness factors plus the game sales column
happiness_features = ['GDP', 'Family', 'LifeExpectancy', 'Corruption',
                      'Generosity', 'DystopiaResidual', 'CILower', 'CIUpper', 'VGSales']
# X carries recent sales in 'VGSales'; test_X carries all-time sales
X = cleaned_RsumSales.loc[:, happiness_features]
test_X = cleaned_sumSales.loc[:, happiness_features]
X

In [None]:
# show the feature table built with all-time sales
test_X

In [None]:
# NOTE(review): DecisionTreeRegressor is not imported in the visible part of the
# notebook — confirm an earlier cell imports it.
# random_state is fixed so repeated runs build the same tree
happiness_model = DecisionTreeRegressor(random_state = 1)
# Fit model
happiness_model.fit(X,y)

In [None]:
# Calculate mean absolute error:
# "Before": error of the already fitted model on the very rows it was trained on
in_sample_predictions = happiness_model.predict(X)
MAE_before = mean_absolute_error(y, in_sample_predictions)

# Split data into training and validation
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Define/Fit model
# random_state matches the first model so the result is reproducible between runs
happiness_model = DecisionTreeRegressor(random_state = 1)
happiness_model.fit(train_X, train_y)

# Get predicted results
# "After": error on the held-out validation rows
validation_predictions = happiness_model.predict(val_X)
MAE_after = mean_absolute_error(val_y, validation_predictions)

# Compare MAE
print("MAE Before training: ", MAE_before)
print("MAE After training: ", MAE_after)

So, because of the small size of the data, predicting is not possible: the model will either overfit or underfit.

Attempted combined plot using scatter plot

In [None]:
# x: 2016 happiness score per region, as the title states (the cleaned scores are
# plotted in the next cell); y / altY: recent and all-time sales per region
x = happiness_regions['Score']
y = recent_sumSales['Sales']
altY = sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7,5)

plt.title("Recent Game Sales vs All Time on 2016 Happiness Score")
plt.xlabel("Happiness Score")
plt.ylabel("Game Sales (in Millions)")
# recent sales, one point per region
for pointx, pointy in zip(x,y):
    label = recent_sumSales.Country[recent_sumSales['Sales'] == pointy].to_string(index=False)
    plt.scatter(pointx, pointy, c = regionCmap[label], label = label)

# all-time sales, one point per region, at the same happiness scores
for pointx, pointy in zip(x,altY):
    label = sumSales.Country[sumSales['Sales'] == pointy].to_string(index=False)
    plt.scatter(pointx, pointy, c = regionCmap[label], label = label)
plt.show()

In [None]:
# x: cleaned happiness score per region; y / altY: recent and all-time sales per region
x = cleaned_regions['Score']
y = recent_sumSales['Sales']
altX = happiness_regions['Score']  # assigned but not used in this plot
altY = sumSales['Sales']

plt.style.use('ggplot')
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

ax.set_title("Recent Game Sales vs All Time on Cleaned Happiness Score")
ax.set_xlabel("Happiness Score")
ax.set_ylabel("Game Sales (in Millions)")

# recent sales, one point per region
for score, sales in zip(x, y):
    label = recent_sumSales['Country'][recent_sumSales['Sales'] == sales].to_string(index=False)
    ax.scatter(score, sales, c=regionCmap[label], label=label)

# all-time sales, one point per region, at the same happiness scores
for score, sales in zip(x, altY):
    label = sumSales['Country'][sumSales['Sales'] == sales].to_string(index=False)
    ax.scatter(score, sales, c=regionCmap[label], label=label)
plt.show()

Thank you for reading my notebook! Many thanks to you, and also to my groupmates for helping with the project.

P.S. This is my first public Kaggle notebook, so apologies for any mistakes :) Also, note that the work on the Happiness Report was done by the other group members (although I contributed to it as well); the version used in this notebook is a simplified version of it. The video game sales analysis, on the other hand, was done entirely by myself.