## We start by importing our basic modules and data.

In [1]:
#We start by importing our basic modules and data
import pandas as pd
import numpy as np
import random
import os
from scipy import stats
import matplotlib.pyplot as plt

data_path = os.path.join('..','data','cleanData','almost_complete.csv')
data = pd.read_csv(data_path)
data.columns

Index(['Country_x', 'Country Code', 'Region', 'Poverty Level', 'Capital City',
       'Lat/Long', 'Poverty Value (1-4)', 'Country name', 'Year',
       'Life Ladder', 'Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government',
       'Democratic Quality', 'Delivery Quality',
       'Standard deviation of ladder by country-year',
       'Standard deviation/Mean of ladder by country-year',
       'GINI index (World Bank estimate)',
       'GINI index (World Bank estimate), average 2000-16',
       'gini of household income reported in Gallup, by wp5-year',
       'Most people can be trusted, Gallup',
       'Most people can be trusted, WVS round 1981-1984',
       'Most people can be trusted, WVS round 1989-1993',
       'Most people can be trusted, WVS round 1994-1998',
       'Most people can be truste

## Before we do any analysis, we want to check if our data makes sense. While our data has been cleaned and should be compelete, we want to check if there are any outliers in our data. We will do this for all columns we will be analyzing.

In [2]:
#we check for any happiness outliers
happiness = data[['Country_x', 'Year', 'Life Ladder']]
happiness = happiness.dropna()
#calculate our upper and lower bounds as per usual. Standard stuff
q1 = np.percentile(happiness['Life Ladder'], 25)
q3 = np.percentile(happiness['Life Ladder'], 75)
IQR = q3-q1
lower_bound = q1 - 1.5*IQR
upper_bound = q3 + 1.5*IQR
#now we create an upper and lower data frame and then combine them, noting whether they are upper or lower outliers
test_upper_h = happiness.loc[(happiness['Life Ladder'] > upper_bound)]
test_upper_h['Outlier type'] = "Upper"
test_lower_h = happiness.loc[(happiness['Life Ladder'] < lower_bound)]
test_lower_h['Outlier type'] = "Lower"
outliers = pd.concat([test_lower_h, test_upper_h])
outliers

Unnamed: 0,Country_x,Year,Life Ladder,Outlier type


In [3]:
#we check for any freedom outliers
freedom = data[['Country_x', 'Year', 'Freedom to make life choices']]
freedom = freedom.dropna()
#calculate our upper and lower bounds as per usual. Standard stuff
q1 = np.percentile(freedom['Freedom to make life choices'], 25)
q3 = np.percentile(freedom['Freedom to make life choices'], 75)
IQR = q3-q1
lower_bound = q1 - 1.5*IQR
upper_bound = q3 + 1.5*IQR
#now we create an upper and lower data frame and then combine them, noting whether they are upper or lower outliers
test_upper_f = freedom.loc[(freedom['Freedom to make life choices'] > upper_bound)]
test_upper_f['Outlier type'] = "Upper"
test_lower_f = freedom.loc[(freedom['Freedom to make life choices'] < lower_bound)]
test_lower_f['Outlier type'] = "Lower"
outliers = pd.concat([test_lower_f, test_upper_f])
outliers
#we ignore the warning here

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Country_x,Year,Freedom to make life choices,Outlier type
99,Burundi,2008.0,0.260069,Lower
171,Bosnia and Herzegovina,2009.0,0.257534,Lower
356,Cuba,2006.0,0.281458,Lower
640,Haiti,2016.0,0.30354,Lower
707,Iraq,2012.0,0.314565,Lower
1442,Chad,2006.0,0.306132,Lower
1443,Chad,2007.0,0.294612,Lower
1456,Togo,2008.0,0.286814,Lower


In [4]:
#we check for any Log GDP per capita outliers
LGDPperC = data[['Country_x', 'Year', 'Log GDP per capita']]
LGDPperC = LGDPperC.dropna()
#calculate our upper and lower bounds as per usual. Standard stuff
q1 = np.percentile(LGDPperC['Log GDP per capita'], 25)
q3 = np.percentile(LGDPperC['Log GDP per capita'], 75)
IQR = q3-q1
lower_bound = q1 - 1.5*IQR
upper_bound = q3 + 1.5*IQR
#now we create an upper and lower data frame and then combine them, noting whether they are upper or lower outliers
test_upper_g = LGDPperC.loc[(LGDPperC['Log GDP per capita'] > upper_bound)]
test_upper_g['Outlier type'] = "Upper"
test_lower_g = LGDPperC.loc[(LGDPperC['Log GDP per capita'] < lower_bound)]
test_lower_g['Outlier type'] = "Lower"
outliers = pd.concat([test_lower_g, test_upper_g])
outliers

Unnamed: 0,Country_x,Year,Log GDP per capita,Outlier type


In [5]:
#we check for any Electronic Sales Revenue per capita outliers
ER = data[['Country_x', 'Year', 'Revenue ($M)', 'Population']]
ER = ER.dropna()
ERperC = ER[['Country_x', 'Year']]
ERperC['Revenue per capita'] = ER['Revenue ($M)']*1000000/ER['Population']

#calculate our upper and lower bounds as per usual. Standard stuff
q1 = np.percentile(ERperC['Revenue per capita'], 25)
q3 = np.percentile(ERperC['Revenue per capita'], 75)
IQR = q3-q1
lower_bound = q1 - 1.5*IQR
upper_bound = q3 + 1.5*IQR
#now we create an upper and lower data frame and then combine them, noting whether they are upper or lower outliers
test_upper_er = ERperC.loc[(ERperC['Revenue per capita'] > upper_bound)]
test_upper_er['Outlier type'] = "Upper"
test_lower_er = ERperC.loc[(ERperC['Revenue per capita'] < lower_bound)]
test_lower_er['Outlier type'] = "Lower"
outliers = pd.concat([test_lower_er, test_upper_er])
outliers
#ignore warnings again

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Country_x,Year,Revenue per capita,Outlier type
84,Austria,2017.0,132.780013,Upper
85,Austria,2018.0,140.345732,Upper
252,Canada,2017.0,139.642887,Upper
253,Canada,2018.0,144.408846,Upper
260,Switzerland,2017.0,203.403655,Upper
261,Switzerland,2018.0,210.851743,Upper
396,Germany,2017.0,140.028527,Upper
397,Germany,2018.0,161.963366,Upper
413,Denmark,2017.0,158.544974,Upper
414,Denmark,2018.0,167.340514,Upper


# We've found a number of outliers for freedom and Electronic Revenue Data, however we won't get rid of these from our analysis as there's nothing really wrong with these values and they could have genuine ramifications that we might be interested in.

# Now, before we do any statistical analysis such as t-tests, does our data match the required conditions for running those? We'll be running Welch's t-tests so we don't need to worry about variances, and while we don't know if our data is normalized, we actually don't need to worry about it in this case. Because our sample size is large relative to our population, we can consider the Central Limit Theorem and normality of our sample sums follows trivially, despite a possible lack of normality in our original values. Thus, we can continue and carry out our analysis

## We want to see whether GDP or freedom correlates better with happiness.
So we do linear regression and find p-values for happiness vs Log GDP per capita and happiness vs freedom

In [6]:
#happiness vs Log GDP per capita
hapvsGDP = data[['Life Ladder', 'Log GDP per capita']]
hapvsGDP = hapvsGDP.dropna(how = 'any')
happiness = hapvsGDP['Life Ladder']
LGDPperC = hapvsGDP['Log GDP per capita']
#Do some linear regression to get p values (as t-test is included in it)
(slope, intercept, r, p, stderr) = stats.linregress(happiness, LGDPperC)
print("Happiness vs Log GDP per capita:")
print("slope = " + str(slope))
print("intercept = " + str(intercept))
print("r = " + str(r))
print("p = " + str(p))
print("standard error = " + str(stderr))

Happiness vs Log GDP per capita:
slope = 0.822772548902285
intercept = 4.742404978293741
r = 0.7792201240864797
p = 0.0
standard error = 0.016228115581241473


In [7]:
#happiness vs freedom
hapvsFree = data[['Life Ladder', 'Freedom to make life choices']]
hapvsFree = hapvsFree.dropna(how = 'any')
happiness = hapvsFree['Life Ladder']
freedom = hapvsFree['Freedom to make life choices']
#do linear regression
(slope, intercept, r, p, stderr) = stats.linregress(happiness, freedom)
print("Happiness vs Freedom:")
print("slope = " + str(slope))
print("intercept = " + str(intercept))
print("r = " + str(r))
print("p = " + str(p))
print("standard error = " + str(stderr))

Happiness vs Freedom:
slope = 0.0672601670844012
intercept = 0.36842487510627075
r = 0.5243317769244311
p = 6.758162274825783e-118
standard error = 0.002682571592781182


# From this we can conclude that while both freedom and Log GDP per capita correlate incredibly closely with happiness, Log GDP per capita correlates a little more closely than freedom does!

In [8]:
#for funsies, I'll go ahead and see how well GDP and freedom correlate
#happiness vs freedom
FreevsGDP = data[['Freedom to make life choices', 'Log GDP per capita']]
FreevsGDP = FreevsGDP.dropna(how = 'any')
freedom = FreevsGDP['Freedom to make life choices']
LGDPperC = FreevsGDP['Log GDP per capita']
#do linear regression
(slope, intercept, r, p, stderr) = stats.linregress(freedom, LGDPperC)
print("Freedom vs Log GDP per capita:")
print("slope = " + str(slope))
print("intercept = " + str(intercept))
print("r = " + str(r))
print("p = " + str(p))
print("standard error = " + str(stderr))
#we see that they correlate very closely, but not as well as the above combinations do

Freedom vs Log GDP per capita:
slope = 2.9456891299620356
intercept = 7.0538711175660485
r = 0.35885506329301525
p = 6.671643451725446e-51
standard error = 0.189542447678058


## Now we want to see whether Electronic Revenue per capita correlates well with happiness

In [9]:
#happiness vs Electronic Revenue per capita
hapvsERperC = data[['Life Ladder', 'Revenue ($M)', 'Population']]
hapvsERperC = hapvsERperC.dropna(how = 'any')
happiness = hapvsERperC['Life Ladder']
#we calculate electronic revenue per capita and change from in $M to $ so our numbers make more sense
ERperC = hapvsERperC['Revenue ($M)']*1000000/hapvsERperC['Population']
#do our linear regression for the t-test
(slope, intercept, r, p, stderr) = stats.linregress(happiness, ERperC)
print(slope)
print(intercept)
print(r)
print(p)
print(stderr)

35.65289979766012
-160.5056560453105
0.704240919583214
1.6325007046398953e-39
2.25968488188548


# We see that there is a strong correlation between Happiness and Electronic Revenue per capita, although not as strong a correlation as there was for Freedom or Log GDP per capita