# Table of Contents

## 1. Importing Libraries and Data
## 2. Exploring and Cleaning Data
## 3. Consistency Checks

# 1. Importing Libaries and Data

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import os

In [2]:
# importing data

path = r'C:\Users\Charles Yi\A6\02 Data\Original'

df = pd.read_csv(os.path.join(path, 'World Happiness Report.csv'))

In [3]:
# getting rid of column and row restrictions

pd.options.display.max_rows = None
pd.options.display.max_columns = None

# 2. Exploring and Cleaning Data

## Isolating Last Ten Years of Data

In [4]:
# will consider last ten years of data for relevancy

df = df[df['Year'] >= 2012]

df.head()

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
4,Afghanistan,South Asia,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919,0.43544
5,Afghanistan,South Asia,2013,3.5721,7.680333,0.483552,52.0,0.577955,0.062666,0.823204,0.547417,0.273328,0.482847
6,Afghanistan,South Asia,2014,3.130896,7.670638,0.525568,52.299999,0.508514,0.105755,0.871242,0.491641,0.374861,0.409048
7,Afghanistan,South Asia,2015,3.982855,7.653833,0.528597,52.599998,0.388928,0.081652,0.880638,0.49141,0.339276,0.260557
8,Afghanistan,South Asia,2016,4.220169,7.65037,0.559072,52.924999,0.522566,0.043916,0.793246,0.501409,0.348332,0.32499


## Data Profiling

In [5]:
df.head()

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
4,Afghanistan,South Asia,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919,0.43544
5,Afghanistan,South Asia,2013,3.5721,7.680333,0.483552,52.0,0.577955,0.062666,0.823204,0.547417,0.273328,0.482847
6,Afghanistan,South Asia,2014,3.130896,7.670638,0.525568,52.299999,0.508514,0.105755,0.871242,0.491641,0.374861,0.409048
7,Afghanistan,South Asia,2015,3.982855,7.653833,0.528597,52.599998,0.388928,0.081652,0.880638,0.49141,0.339276,0.260557
8,Afghanistan,South Asia,2016,4.220169,7.65037,0.559072,52.924999,0.522566,0.043916,0.793246,0.501409,0.348332,0.32499


In [6]:
df.shape

(1487, 13)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1487 entries, 4 to 2198
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Country Name                       1487 non-null   object 
 1   Regional Indicator                 1422 non-null   object 
 2   Year                               1487 non-null   int64  
 3   Life Ladder                        1487 non-null   float64
 4   Log GDP Per Capita                 1472 non-null   float64
 5   Social Support                     1482 non-null   float64
 6   Healthy Life Expectancy At Birth   1448 non-null   float64
 7   Freedom To Make Life Choices       1468 non-null   float64
 8   Generosity                         1454 non-null   float64
 9   Perceptions Of Corruption          1402 non-null   float64
 10  Positive Affect                    1475 non-null   float64
 11  Negative Affect                    1478 non-null   float

In [8]:
df.describe()

Unnamed: 0,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
count,1487.0,1487.0,1472.0,1482.0,1448.0,1468.0,1454.0,1402.0,1475.0,1478.0,1216.0
mean,2016.816409,5.492837,9.44091,0.808157,63.924216,0.765428,0.000488,0.736933,0.653814,0.282949,0.480583
std,3.092874,1.135839,1.147415,0.121116,6.316175,0.131354,0.16011,0.18522,0.105384,0.089911,0.192934
min,2012.0,1.281271,5.526723,0.228217,38.639999,0.30354,-0.337527,0.047311,0.178886,0.082737,0.07971
25%,2014.0,4.64675,8.543158,0.740641,59.56875,0.686217,-0.115692,0.680484,0.575918,0.214267,0.326856
50%,2017.0,5.501249,9.545262,0.832067,65.460003,0.783211,-0.022774,0.793098,0.664399,0.273405,0.459543
75%,2019.0,6.32112,10.409705,0.904792,68.931252,0.870619,0.094733,0.861789,0.737092,0.337243,0.617203
max,2022.0,7.88935,11.663788,0.987343,74.474998,0.985178,0.702708,0.97634,0.883586,0.70459,0.993604


Min, max, and mean don't show any notable aberrations

## Mixed Columns

In [9]:
# checking for mixed columns

for col in df.columns.tolist():
    weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (df[weird]) > 0:
        print (col)

Regional Indicator


In [10]:
# examining regional indicator col

df['Regional Indicator'].value_counts(dropna = False)

Sub-Saharan Africa                    311
Western Europe                        214
Latin America and Caribbean           207
Central and Eastern Europe            171
Middle East and North Africa          144
Commonwealth of Independent States    121
Southeast Asia                         86
NaN                                    65
South Asia                             63
East Asia                              61
North America and ANZ                  44
Name: Regional Indicator, dtype: int64

In [11]:
# examining countries with NaN regional indicator

df_regional_null = df[df['Regional Indicator'].isnull() == True]

df_regional_null

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
40,Angola,,2012,4.36025,8.988708,0.752593,51.84,0.456029,-0.13863,0.9063,0.590808,0.30489,0.237091
41,Angola,,2013,3.937107,8.999667,0.721591,52.459999,0.409555,-0.105992,0.816375,0.650047,0.370875,0.547732
42,Angola,,2014,3.794838,9.009897,0.754615,53.080002,0.374542,-0.170031,0.834076,0.595249,0.367864,0.572346
180,Belize,,2014,5.955647,9.13522,0.756932,65.0,0.873569,0.0011,0.782105,0.735027,0.281604,0.384267
195,Bhutan,,2013,5.569092,9.097065,0.818949,62.240002,0.810201,0.351611,0.802428,0.663676,0.21735,0.979501
196,Bhutan,,2014,4.938578,9.143029,0.880342,62.419998,0.834222,0.266661,0.650338,0.774639,0.324098,0.958492
197,Bhutan,,2015,5.082129,9.197989,0.847574,62.599998,0.830102,0.275951,0.633956,0.723233,0.311589,0.946393
347,Central African Republic,,2016,2.693061,6.707346,0.290184,44.75,0.624057,0.037318,0.859073,0.550785,0.494268,0.748755
348,Central African Republic,,2017,3.475862,6.732925,0.319589,45.299999,0.645252,0.077883,0.889566,0.602205,0.599335,0.650285
436,Congo (Kinshasa),,2012,4.639227,6.803482,0.769546,50.900002,0.557286,-0.033436,0.807407,0.62613,0.229651,0.43757


In [12]:
# examing regional indicators of countries with NaN

df[df['Country Name'].isin(['Angola', 'Belize', 'Bhutan', 'Central African Republic', 'Congo (Kinshasa)', 'Czechia', 'Eswatini', 'Qatar', 'Somalia', 'Somaliland region', 'South Sudan', 'State of Palestine', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkiye'])]

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
40,Angola,,2012,4.36025,8.988708,0.752593,51.84,0.456029,-0.13863,0.9063,0.590808,0.30489,0.237091
41,Angola,,2013,3.937107,8.999667,0.721591,52.459999,0.409555,-0.105992,0.816375,0.650047,0.370875,0.547732
42,Angola,,2014,3.794838,9.009897,0.754615,53.080002,0.374542,-0.170031,0.834076,0.595249,0.367864,0.572346
180,Belize,,2014,5.955647,9.13522,0.756932,65.0,0.873569,0.0011,0.782105,0.735027,0.281604,0.384267
195,Bhutan,,2013,5.569092,9.097065,0.818949,62.240002,0.810201,0.351611,0.802428,0.663676,0.21735,0.979501
196,Bhutan,,2014,4.938578,9.143029,0.880342,62.419998,0.834222,0.266661,0.650338,0.774639,0.324098,0.958492
197,Bhutan,,2015,5.082129,9.197989,0.847574,62.599998,0.830102,0.275951,0.633956,0.723233,0.311589,0.946393
347,Central African Republic,,2016,2.693061,6.707346,0.290184,44.75,0.624057,0.037318,0.859073,0.550785,0.494268,0.748755
348,Central African Republic,,2017,3.475862,6.732925,0.319589,45.299999,0.645252,0.077883,0.889566,0.602205,0.599335,0.650285
436,Congo (Kinshasa),,2012,4.639227,6.803482,0.769546,50.900002,0.557286,-0.033436,0.807407,0.62613,0.229651,0.43757


In [13]:
# replacing NaN with regional indicator ; testing on separate df

# Angola: Sub-Saharan Africa
# Central African Republic: Sub-Saharan Africa
# Congo (Kinshasa): Sub-Saharan Africa
# Eswatini: Sub-Saharan Africa
# Somalia: Sub-Saharan Africa
# South Sudan: Sub-Saharan Africa
# Sudan: Sub-Saharan Africa
# Belize: Latin America and Caribbean
# Suriname: Latin America and Caribbean
# Trinidad and Tobago: Latin America and Caribbean
# Bhutan: South Asia
# State of Palestine: Middle East and North Africa
# Syria: Middle East and North Africa
# Qatar: Middle East and North Africa
# Czechia: Central and Eastern Europe
# Somaliland region: Sub-Saharan Africa
# Turkiye: Southeast Asia (assuming you meant Turkey)
    
def region(country):
        if country in ['Angola', 'Central African Republic', 'Congo (Kinshasa)', 'Eswatini', 'Somalia', 'South Sudan', 'Sudan', 'Somaliland', 'Somaliland region']:
            return 'Sub-Saharan Africa'
        elif country in ['Belize', 'Suriname', 'Trinidad and Tobago']:
            return 'Latin America and Caribbean'
        elif country in ['Bhutan']:
            return 'South Asia'
        elif country in ['State of Palestine', 'Syria', 'Qatar']:
            return 'Middle East and North Africa'
        elif country in ['Czechia']:
            return 'Central and Eastern Europe'
        elif country in ['Turkiye']:
            return 'Southeast Asia'
        else:
            return df[df['Country Name'] == country]['Regional Indicator'].values[0]

        
df_regional_null['Regional Indicator'] = df_regional_null['Country Name'].apply(region)
                                                
df_regional_null

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_regional_null['Regional Indicator'] = df_regional_null['Country Name'].apply(region)


Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
40,Angola,Sub-Saharan Africa,2012,4.36025,8.988708,0.752593,51.84,0.456029,-0.13863,0.9063,0.590808,0.30489,0.237091
41,Angola,Sub-Saharan Africa,2013,3.937107,8.999667,0.721591,52.459999,0.409555,-0.105992,0.816375,0.650047,0.370875,0.547732
42,Angola,Sub-Saharan Africa,2014,3.794838,9.009897,0.754615,53.080002,0.374542,-0.170031,0.834076,0.595249,0.367864,0.572346
180,Belize,Latin America and Caribbean,2014,5.955647,9.13522,0.756932,65.0,0.873569,0.0011,0.782105,0.735027,0.281604,0.384267
195,Bhutan,South Asia,2013,5.569092,9.097065,0.818949,62.240002,0.810201,0.351611,0.802428,0.663676,0.21735,0.979501
196,Bhutan,South Asia,2014,4.938578,9.143029,0.880342,62.419998,0.834222,0.266661,0.650338,0.774639,0.324098,0.958492
197,Bhutan,South Asia,2015,5.082129,9.197989,0.847574,62.599998,0.830102,0.275951,0.633956,0.723233,0.311589,0.946393
347,Central African Republic,Sub-Saharan Africa,2016,2.693061,6.707346,0.290184,44.75,0.624057,0.037318,0.859073,0.550785,0.494268,0.748755
348,Central African Republic,Sub-Saharan Africa,2017,3.475862,6.732925,0.319589,45.299999,0.645252,0.077883,0.889566,0.602205,0.599335,0.650285
436,Congo (Kinshasa),Sub-Saharan Africa,2012,4.639227,6.803482,0.769546,50.900002,0.557286,-0.033436,0.807407,0.62613,0.229651,0.43757


In [40]:
# applying to original df

df['Regional Indicator'] = df['Country Name'].apply(region)

df.head(100)

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
4,Afghanistan,South Asia,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919,0.43544
5,Afghanistan,South Asia,2013,3.5721,7.680333,0.483552,52.0,0.577955,0.062666,0.823204,0.547417,0.273328,0.482847
6,Afghanistan,South Asia,2014,3.130896,7.670638,0.525568,52.299999,0.508514,0.105755,0.871242,0.491641,0.374861,0.409048
7,Afghanistan,South Asia,2015,3.982855,7.653833,0.528597,52.599998,0.388928,0.081652,0.880638,0.49141,0.339276,0.260557
8,Afghanistan,South Asia,2016,4.220169,7.65037,0.559072,52.924999,0.522566,0.043916,0.793246,0.501409,0.348332,0.32499
9,Afghanistan,South Asia,2017,2.661718,7.64783,0.49088,53.25,0.427011,-0.11941,0.954393,0.43527,0.371326,0.261179
10,Afghanistan,South Asia,2018,2.694303,7.630801,0.507516,53.575001,0.373536,-0.091106,0.927606,0.384561,0.404904,0.364666
11,Afghanistan,South Asia,2019,2.375092,7.640086,0.419973,53.900002,0.393656,-0.106016,0.923849,0.324108,0.502474,0.341482
12,Afghanistan,South Asia,2021,2.436034,7.324032,0.454175,54.549999,0.394306,-0.081011,0.946299,0.178886,0.606713,0.25562
13,Afghanistan,South Asia,2022,1.281271,,0.228217,54.875,0.368377,,0.733198,0.205868,0.575512,


In [15]:
# checking work by searching for Angola

df[df['Country Name'] == 'Angola']['Regional Indicator'].iloc[0]

'Sub-Saharan Africa'

## Null Values

In [16]:
# checking for null values

df.isnull().sum()

Country Name                           0
Regional Indicator                     0
Year                                   0
Life Ladder                            0
Log GDP Per Capita                    15
Social Support                         5
Healthy Life Expectancy At Birth      39
Freedom To Make Life Choices          19
Generosity                            33
Perceptions Of Corruption             85
Positive Affect                       12
Negative Affect                        9
Confidence In National Government    271
dtype: int64

In [44]:
# calculating percentage of null value per column

(df.isnull().sum()/len(df) * 100).round(1)

Country Name                          0.0
Regional Indicator                    0.0
Year                                  0.0
Life Ladder                           0.0
Log GDP Per Capita                    1.0
Social Support                        0.3
Healthy Life Expectancy At Birth      2.6
Freedom To Make Life Choices          1.3
Generosity                            2.2
Perceptions Of Corruption             5.7
Positive Affect                       0.8
Negative Affect                       0.6
Confidence In National Government    18.2
dtype: float64

'Confidence In National Government' has 18% of its values missing. May drop column before conducting analysis

In [18]:
# exploring null values

df_null = df[df.isnull().any(axis=1)]

df_null

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
13,Afghanistan,South Asia,2022,1.281271,,0.228217,54.875,0.368377,,0.733198,0.205868,0.575512,
28,Albania,Central and Eastern Europe,2022,5.212213,9.626483,0.72409,69.175003,0.80225,-0.065987,0.845502,0.547126,0.254826,
31,Algeria,Middle East and North Africa,2012,5.604596,9.329962,0.839397,65.699997,0.586663,-0.176571,0.690116,0.540059,0.229716,
32,Algeria,Middle East and North Africa,2014,6.354898,9.355415,0.818189,65.900002,,,,0.558359,0.176866,
33,Algeria,Middle East and North Africa,2016,5.340854,9.383312,0.748588,66.099998,,,,0.565026,0.377112,
34,Algeria,Middle East and North Africa,2017,5.248912,9.376658,0.806754,66.199997,0.43667,-0.171471,0.699774,0.554529,0.28871,
35,Algeria,Middle East and North Africa,2018,5.043086,9.369554,0.798651,66.300003,0.583381,-0.150559,0.758704,0.533725,0.292946,
36,Algeria,Middle East and North Africa,2019,4.744627,9.36111,0.803259,66.400002,0.385083,0.000268,0.740609,0.544428,0.215198,
37,Algeria,Middle East and North Africa,2020,5.437755,9.291438,0.867649,66.5,0.573891,-0.121148,0.724264,0.524169,0.31063,
38,Algeria,Middle East and North Africa,2021,5.217018,9.309262,0.84071,66.599998,0.558487,-0.113483,0.7119,0.498022,0.257779,


In [19]:
# counting number of countries with missing values

df_null['Country Name'].value_counts()

United Arab Emirates         11
Saudi Arabia                 11
Taiwan Province of China     11
Kosovo                       11
China                        10
Turkiye                      10
Jordan                       10
State of Palestine            9
Vietnam                       9
Morocco                       8
Turkmenistan                  8
Algeria                       8
Bahrain                       8
Hong Kong S.A.R. of China     7
Kuwait                        7
Libya                         6
Cambodia                      5
Uzbekistan                    5
Egypt                         5
Laos                          5
South Sudan                   4
Rwanda                        4
Tajikistan                    4
Bangladesh                    3
Malta                         3
Iran                          3
Venezuela                     3
Lebanon                       3
Yemen                         3
Syria                         3
Myanmar                       2
Qatar   

In [20]:
# examining countries with missing 'Perceptions of Corruption' column as it was missing more than 5 percent of data

missing_perceptions = df_null[df_null['Perceptions Of Corruption'].isnull()]

missing_perceptions

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
32,Algeria,Middle East and North Africa,2014,6.354898,9.355415,0.818189,65.900002,,,,0.558359,0.176866,
33,Algeria,Middle East and North Africa,2016,5.340854,9.383312,0.748588,66.099998,,,,0.565026,0.377112,
126,Bahrain,Middle East and North Africa,2014,6.165134,10.801981,,65.959999,,,,,,
127,Bahrain,Middle East and North Africa,2015,6.007375,10.788364,0.852551,66.199997,0.849521,0.1075,,0.653345,0.302972,
128,Bahrain,Middle East and North Africa,2016,6.169673,10.789037,0.8627,66.125,0.888691,0.083302,,0.73622,0.283466,
129,Bahrain,Middle East and North Africa,2017,6.227321,10.798135,0.875747,66.050003,0.905859,0.130106,,0.754333,0.28976,
130,Bahrain,Middle East and North Africa,2019,7.098012,10.815147,0.877929,65.900002,0.906536,0.036391,,0.711386,0.317106,
131,Bahrain,Middle East and North Africa,2020,6.173176,10.775782,0.847745,65.824997,0.945233,0.117025,,0.72951,0.296835,
305,Cambodia,Southeast Asia,2018,5.121838,8.347027,0.794605,61.299999,0.958305,0.032343,,0.723344,0.414346,
387,China,East Asia,2012,5.094917,9.246742,0.787818,67.220001,0.808255,-0.187489,,0.68916,0.158703,


In [21]:
# counting countries with missing perceptions data

missing_perceptions['Country Name'].value_counts()

United Arab Emirates    11
Saudi Arabia            11
China                   10
Jordan                   9
Turkmenistan             8
Kuwait                   7
Bahrain                  6
Egypt                    5
Vietnam                  3
Algeria                  2
Qatar                    2
Libya                    2
Malta                    2
Yemen                    2
Maldives                 1
Laos                     1
Ethiopia                 1
Uzbekistan               1
Cambodia                 1
Name: Country Name, dtype: int64

In [22]:
# examining countries with missing 'Confidence In National Government' data

df_confidence = df_null[df_null['Confidence In National Government'].isnull()]

df_confidence

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
13,Afghanistan,South Asia,2022,1.281271,,0.228217,54.875,0.368377,,0.733198,0.205868,0.575512,
28,Albania,Central and Eastern Europe,2022,5.212213,9.626483,0.72409,69.175003,0.80225,-0.065987,0.845502,0.547126,0.254826,
31,Algeria,Middle East and North Africa,2012,5.604596,9.329962,0.839397,65.699997,0.586663,-0.176571,0.690116,0.540059,0.229716,
32,Algeria,Middle East and North Africa,2014,6.354898,9.355415,0.818189,65.900002,,,,0.558359,0.176866,
33,Algeria,Middle East and North Africa,2016,5.340854,9.383312,0.748588,66.099998,,,,0.565026,0.377112,
34,Algeria,Middle East and North Africa,2017,5.248912,9.376658,0.806754,66.199997,0.43667,-0.171471,0.699774,0.554529,0.28871,
35,Algeria,Middle East and North Africa,2018,5.043086,9.369554,0.798651,66.300003,0.583381,-0.150559,0.758704,0.533725,0.292946,
36,Algeria,Middle East and North Africa,2019,4.744627,9.36111,0.803259,66.400002,0.385083,0.000268,0.740609,0.544428,0.215198,
37,Algeria,Middle East and North Africa,2020,5.437755,9.291438,0.867649,66.5,0.573891,-0.121148,0.724264,0.524169,0.31063,
38,Algeria,Middle East and North Africa,2021,5.217018,9.309262,0.84071,66.599998,0.558487,-0.113483,0.7119,0.498022,0.257779,


In [23]:
# count of countries with missing confidence data

df_confidence['Country Name'].value_counts()

United Arab Emirates        11
Saudi Arabia                11
China                       10
Jordan                      10
Turkiye                     10
State of Palestine           9
Vietnam                      9
Turkmenistan                 8
Morocco                      8
Algeria                      8
Bahrain                      8
Kuwait                       7
Libya                        6
Cambodia                     5
Egypt                        5
Laos                         5
Uzbekistan                   4
Rwanda                       4
Tajikistan                   4
Lebanon                      3
Syria                        3
Iran                         2
Qatar                        2
Myanmar                      2
Pakistan                     2
Sudan                        2
Ethiopia                     2
Guinea                       2
Belgium                      2
Burundi                      2
Bangladesh                   2
Mali                         1
Norway  

In [35]:
# countries with both 'Confidence in National Government' and 'Perceptions of Corruption' data missing

missing_both = missing_perceptions[missing_perceptions.isin(df_confidence)]

missing_both

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
32,Algeria,Middle East and North Africa,2014.0,6.354898,9.355415,0.818189,65.900002,,,,0.558359,0.176866,
33,Algeria,Middle East and North Africa,2016.0,5.340854,9.383312,0.748588,66.099998,,,,0.565026,0.377112,
126,Bahrain,Middle East and North Africa,2014.0,6.165134,10.801981,,65.959999,,,,,,
127,Bahrain,Middle East and North Africa,2015.0,6.007375,10.788364,0.852551,66.199997,0.849521,0.1075,,0.653345,0.302972,
128,Bahrain,Middle East and North Africa,2016.0,6.169673,10.789037,0.8627,66.125,0.888691,0.083302,,0.73622,0.283466,
129,Bahrain,Middle East and North Africa,2017.0,6.227321,10.798135,0.875747,66.050003,0.905859,0.130106,,0.754333,0.28976,
130,Bahrain,Middle East and North Africa,2019.0,7.098012,10.815147,0.877929,65.900002,0.906536,0.036391,,0.711386,0.317106,
131,Bahrain,Middle East and North Africa,2020.0,6.173176,10.775782,0.847745,65.824997,0.945233,0.117025,,0.72951,0.296835,
305,Cambodia,Southeast Asia,2018.0,5.121838,8.347027,0.794605,61.299999,0.958305,0.032343,,0.723344,0.414346,
387,China,East Asia,2012.0,5.094917,9.246742,0.787818,67.220001,0.808255,-0.187489,,0.68916,0.158703,


In [37]:
# count of countries missing both columns

missing_both['Country Name'].value_counts()

Saudi Arabia            11
United Arab Emirates    11
China                   10
Jordan                   9
Turkmenistan             8
Kuwait                   7
Bahrain                  6
Egypt                    5
Vietnam                  3
Algeria                  2
Libya                    2
Qatar                    2
Cambodia                 1
Ethiopia                 1
Laos                     1
Maldives                 1
Name: Country Name, dtype: int64

Authoritarian states missing values in 'Confidence In National Government' and 'Perceptions of Corruption' columns the most. I'll keep the missing values because most columns with NaNs comprise less than 5 percent of the data. In addition, for countries that aren't reporting certain metrics for likely political reasons, I want to make note of them in my analysis or visually depict these states separately. A separate analysis of un-reported fields could yield some interesting insights as well.

## Duplicate Values

In [24]:
# checking for duplicate values

df.duplicated().value_counts()

False    1487
dtype: int64

No duplicate values

# 3. Consistency Checks

## Count of Countries

In [25]:
df['Country Name'].nunique()

161

In [26]:
df['Country Name'].value_counts()

Zimbabwe                     11
Spain                        11
Romania                      11
Italy                        11
Israel                       11
Ireland                      11
Russia                       11
Iran                         11
Indonesia                    11
India                        11
Saudi Arabia                 11
Hungary                      11
Senegal                      11
Guinea                       11
Slovenia                     11
Greece                       11
Ghana                        11
Germany                      11
Georgia                      11
France                       11
Finland                      11
Portugal                     11
Japan                        11
Jordan                       11
North Macedonia              11
Mongolia                     11
Moldova                      11
Mexico                       11
Netherlands                  11
New Zealand                  11
Malta                        11
Mali    

In [45]:
df['Country Name'].value_counts().sum()

1487

Countries like South Sudan and Somaliland region were either formed recently or their sovereignty is up for debate.

In [27]:
# calculating percentage of missing years for each country

(11- df['Country Name'].value_counts())/len(df)*100

Zimbabwe                     0.000000
Spain                        0.000000
Romania                      0.000000
Italy                        0.000000
Israel                       0.000000
Ireland                      0.000000
Russia                       0.000000
Iran                         0.000000
Indonesia                    0.000000
India                        0.000000
Saudi Arabia                 0.000000
Hungary                      0.000000
Senegal                      0.000000
Guinea                       0.000000
Slovenia                     0.000000
Greece                       0.000000
Ghana                        0.000000
Germany                      0.000000
Georgia                      0.000000
France                       0.000000
Finland                      0.000000
Portugal                     0.000000
Japan                        0.000000
Jordan                       0.000000
North Macedonia              0.000000
Mongolia                     0.000000
Moldova     

Max number of occurences for country is 11. Min is 1. If I were to do a time-series analysis, I may have to drop countries with less than 11 occurences in the data

## Count of Years

In [28]:
Years = df['Year'].value_counts()

Years

2017    147
2014    144
2019    143
2015    142
2012    141
2016    141
2018    141
2013    136
2021    122
2020    116
2022    114
Name: Year, dtype: int64

In [29]:
# resetting index

Years = Years.reset_index()

In [31]:
# re-naming columns

Years.columns = ['Year', 'Count of Countries']

Years

Unnamed: 0,Year,Count of Countries
0,2017,147
1,2014,144
2,2019,143
3,2015,142
4,2012,141
5,2016,141
6,2018,141
7,2013,136
8,2021,122
9,2020,116


In [32]:
# sorting by year

Years.sort_values(by = 'Year')

Unnamed: 0,Year,Count of Countries
4,2012,141
7,2013,136
1,2014,144
3,2015,142
5,2016,141
0,2017,147
6,2018,141
2,2019,143
9,2020,116
8,2021,122


In [34]:
# percentage of countries accounted for each year in the data

countries_accounted = lambda x: (x['Count of Countries'] / 161) * 100  # 161 is the total number of unique countries in the data since 2012

Years['% of Countries Accounted'] = Years.apply(countries_accounted, axis = 1)

Years.sort_values(by = 'Year')

Unnamed: 0,Year,Count of Countries,% of Countries Accounted
4,2012,141,87.57764
7,2013,136,84.47205
1,2014,144,89.440994
3,2015,142,88.198758
5,2016,141,87.57764
0,2017,147,91.304348
6,2018,141,87.57764
2,2019,143,88.819876
9,2020,116,72.049689
8,2021,122,75.776398


Last three years shows smaller numbers of countries accounted for in the data. Since the World Happiness Report is mainly self-reported, this could be partly due to government resources being stretched thin because of the pandemic.

## Count of Regional Indicators

In [38]:
df['Regional Indicator'].value_counts()

Sub-Saharan Africa                    335
Western Europe                        214
Latin America and Caribbean           211
Central and Eastern Europe            181
Middle East and North Africa          158
Commonwealth of Independent States    121
Southeast Asia                         96
South Asia                             66
East Asia                              61
North America and ANZ                  44
Name: Regional Indicator, dtype: int64

## Exporting New Data

In [39]:
path = r'C:\Users\Charles Yi\A6\02 Data\Prepared'

df.to_pickle(os.path.join(path, 'df_2012.pkl'))