In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
hp20 = pd.read_csv('data/2020.csv')
hp19 = pd.read_csv('data/2019.csv')

In [3]:
hp20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                153 non-null    object 
 1   Regional indicator                          153 non-null    object 
 2   Ladder score                                153 non-null    float64
 3   Standard error of ladder score              153 non-null    float64
 4   upperwhisker                                153 non-null    float64
 5   lowerwhisker                                153 non-null    float64
 6   Logged GDP per capita                       153 non-null    float64
 7   Social support                              153 non-null    float64
 8   Healthy life expectancy                     153 non-null    float64
 9   Freedom to make life choices                153 non-null    float64
 10  Generosity    

In [4]:
hp19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


### 사용할 속성
|20|19|
|---|---|
|Country name|Country or region|
|Regional indicator|.|
|Ladder score|Score|
|Logged GDP per capita|GDP per capita|
|Social support|Social support|
|Healthy life expectancy|Healthy life expectancy|
|Freedom to make life choices|Freedom to make life choices|
|Generosity|Generosity|
|Perceptions of corruption|Perceptions of corruption|

In [5]:
# 20년도의 대륙 데이터를 19년도에 이용해보기
print('20년도 나라 수 :', len(hp20['Country name'].unique()))
print('19년도 나라 수 :', len(hp19['Country or region'].unique()))

20년도 나라 수 : 153
19년도 나라 수 : 156


In [6]:
match_continent = {}
for name in hp19['Country or region']:
    tmp = hp20[hp20['Country name']==name]['Regional indicator'].tolist()
    if len(tmp):
        match_continent[name] = tmp[0]
match_continent

{'Finland': 'Western Europe',
 'Denmark': 'Western Europe',
 'Norway': 'Western Europe',
 'Iceland': 'Western Europe',
 'Netherlands': 'Western Europe',
 'Switzerland': 'Western Europe',
 'Sweden': 'Western Europe',
 'New Zealand': 'North America and ANZ',
 'Canada': 'North America and ANZ',
 'Austria': 'Western Europe',
 'Australia': 'North America and ANZ',
 'Costa Rica': 'Latin America and Caribbean',
 'Israel': 'Middle East and North Africa',
 'Luxembourg': 'Western Europe',
 'United Kingdom': 'Western Europe',
 'Ireland': 'Western Europe',
 'Germany': 'Western Europe',
 'Belgium': 'Western Europe',
 'United States': 'North America and ANZ',
 'Czech Republic': 'Central and Eastern Europe',
 'United Arab Emirates': 'Middle East and North Africa',
 'Malta': 'Western Europe',
 'Mexico': 'Latin America and Caribbean',
 'France': 'Western Europe',
 'Chile': 'Latin America and Caribbean',
 'Guatemala': 'Latin America and Caribbean',
 'Saudi Arabia': 'Middle East and North Africa',
 'Spai

In [7]:
match_df = pd.DataFrame.from_dict(match_continent, orient='index', columns=['Regional indicator'])
match_df = match_df.reset_index().rename(columns={'index':'Country or region'})

In [8]:
match_df['Country or region'][:5]

0        Finland
1        Denmark
2         Norway
3        Iceland
4    Netherlands
Name: Country or region, dtype: object

In [9]:
merged19 = pd.merge(hp19, match_df, on=['Country or region'], how='left')
merged19

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator
0,1,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393,Western Europe
1,2,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410,Western Europe
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,Western Europe
3,4,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118,Western Europe
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,Western Europe
...,...,...,...,...,...,...,...,...,...,...
151,152,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411,Sub-Saharan Africa
152,153,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147,Sub-Saharan Africa
153,154,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025,South Asia
154,155,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035,Sub-Saharan Africa


In [10]:
merged19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 0 to 155
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
 9   Regional indicator            147 non-null    object 
dtypes: float64(7), int64(1), object(2)
memory usage: 13.4+ KB


결측값!

In [11]:
merged19[merged19['Regional indicator'].isnull()]['Country or region']

24                Taiwan
28                 Qatar
38     Trinidad & Tobago
63       Northern Cyprus
75             Hong Kong
83       North Macedonia
94                Bhutan
111              Somalia
148                Syria
Name: Country or region, dtype: object

In [12]:
merged19['Regional indicator'].unique()

array(['Western Europe', 'North America and ANZ',
       'Latin America and Caribbean', 'Middle East and North Africa',
       'Central and Eastern Europe', nan, 'Southeast Asia',
       'Commonwealth of Independent States', 'East Asia',
       'Sub-Saharan Africa', 'South Asia'], dtype=object)

이렇게 해도 됨
```python
merged19['Regional indicator'].fillna('Others')
```
별로 없어서 직접 추가

In [13]:
# 주석은 위 Regional indecator에 존재하지 않았던 곳
merged19.loc[merged19['Country or region']=='Taiwan', 'Regional indicator'] = "East Asia"
#merged19.loc[merged19['Country or region']=='Qatar', 'Regional indicator'] = "West Asia"
merged19.loc[merged19['Country or region']=='Trinidad & Tobago', 'Regional indicator'] = "North America and ANZ"
#merged19.loc[merged19['Country or region']=='Northern Cyprus', 'Regional indicator'] = "West Asia"
merged19.loc[merged19['Country or region']=='Hong Kong', 'Regional indicator'] = "East Asia"
#merged19.loc[merged19['Country or region']=='North Macedonia', 'Regional indicator'] = "Southen Europe"
merged19.loc[merged19['Country or region']=='Bhutan', 'Regional indicator'] = "Southeast Asia"
merged19.loc[merged19['Country or region']=='Somalia', 'Regional indicator'] = "Sub-Saharan Africa"
#merged19.loc[merged19['Country or region']=='Syria', 'Regional indicator'] = "West Asia"
# 나머지는 others
merged19['Regional indicator'] = merged19['Regional indicator'].fillna('Others')

In [14]:
merged19['Regional indicator'].isnull().sum()

0

In [15]:
data19 = merged19.drop(['Overall rank'], axis=1)
data19['year'] = 2019
data19.head()

Unnamed: 0,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator,year
0,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,Western Europe,2019
1,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,Western Europe,2019
2,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,Western Europe,2019
3,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,Western Europe,2019
4,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,Western Europe,2019


In [16]:
# copy를 쓰지 않으면 경고 발생 : SettingWithCopyWarning
data20 = hp20[['Country name', 'Regional indicator', 'Ladder score', 'Logged GDP per capita', 
               'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 
               'Generosity', 'Perceptions of corruption']].copy()

data20.columns=['Country or region', 'Regional indicator', 'Score', 'GDP per capita', 'Social support',
                'Healthy life expectancy', 'Freedom to make life choices','Generosity', 
                'Perceptions of corruption']
data20['year'] = 2020
data20.head()

Unnamed: 0,Country or region,Regional indicator,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,year
0,Finland,Western Europe,7.8087,10.639267,0.95433,71.900825,0.949172,-0.059482,0.195445,2020
1,Denmark,Western Europe,7.6456,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489,2020
2,Switzerland,Western Europe,7.5599,10.979933,0.942847,74.102448,0.921337,0.105911,0.303728,2020
3,Iceland,Western Europe,7.5045,10.772559,0.97467,73.0,0.948892,0.246944,0.71171,2020
4,Norway,Western Europe,7.488,11.087804,0.952487,73.200783,0.95575,0.134533,0.263218,2020


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html 참고

In [17]:
data = pd.concat([data19, data20])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 309 entries, 0 to 152
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country or region             309 non-null    object 
 1   Score                         309 non-null    float64
 2   GDP per capita                309 non-null    float64
 3   Social support                309 non-null    float64
 4   Healthy life expectancy       309 non-null    float64
 5   Freedom to make life choices  309 non-null    float64
 6   Generosity                    309 non-null    float64
 7   Perceptions of corruption     309 non-null    float64
 8   Regional indicator            309 non-null    object 
 9   year                          309 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 26.6+ KB


In [18]:
hp18 = pd.read_csv('data/2018.csv')
hp18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [19]:
def add_cont(df, col):
    global data
    match_continent = {}
    for name in df[col]:
        tmp = data[data['Country or region']==name]['Regional indicator'].tolist()
        if len(tmp):
            match_continent[name] = tmp[0]
    
    match_df = pd.DataFrame.from_dict(match_continent, orient='index', columns=['Regional indicator'])
    match_df = match_df.reset_index().rename(columns={'index':col})
    merged_df = pd.merge(df, match_df, on=[col], how='left')
    return merged_df

In [20]:
data18 = add_cont(hp18, 'Country or region')
data18.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393,Western Europe
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34,Western Europe
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408,Western Europe
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138,Western Europe
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357,Western Europe


In [21]:
data18[data18['Regional indicator'].isnull()]['Country or region']

48     Belize
136     Sudan
141    Angola
Name: Country or region, dtype: object

In [22]:
data18.loc[data18['Country or region']=='Sudan', 'Regional indicator'] = "Middle East and North Africa"
data18.loc[data18['Country or region']=='Angola', 'Regional indicator'] = "Sub-Saharan Africa"
data18['Regional indicator'] = data18['Regional indicator'].fillna('Others')
data18.isnull().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       1
Regional indicator              0
dtype: int64

In [23]:
data18 = data18.drop(['Overall rank'], axis=1)
data18['year'] = 2018
data18.head()

Unnamed: 0,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator,year
0,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393,Western Europe,2018
1,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34,Western Europe,2018
2,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408,Western Europe,2018
3,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138,Western Europe,2018
4,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357,Western Europe,2018


In [24]:
data = pd.concat([data, data18])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 465 entries, 0 to 155
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country or region             465 non-null    object 
 1   Score                         465 non-null    float64
 2   GDP per capita                465 non-null    float64
 3   Social support                465 non-null    float64
 4   Healthy life expectancy       465 non-null    float64
 5   Freedom to make life choices  465 non-null    float64
 6   Generosity                    465 non-null    float64
 7   Perceptions of corruption     464 non-null    float64
 8   Regional indicator            465 non-null    object 
 9   year                          465 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 40.0+ KB


In [25]:
data.groupby('year')['Score'].mean()

year
2018    5.375917
2019    5.407096
2020    5.473240
Name: Score, dtype: float64

In [26]:
hp17 = pd.read_csv('data/2017.csv')
hp17.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 

In [27]:
data17 = add_cont(hp17, 'Country')
data17.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual,Regional indicator
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,Western Europe
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,Western Europe
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715,Western Europe
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716,Western Europe
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182,Western Europe


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 465 entries, 0 to 155
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country or region             465 non-null    object 
 1   Score                         465 non-null    float64
 2   GDP per capita                465 non-null    float64
 3   Social support                465 non-null    float64
 4   Healthy life expectancy       465 non-null    float64
 5   Freedom to make life choices  465 non-null    float64
 6   Generosity                    465 non-null    float64
 7   Perceptions of corruption     464 non-null    float64
 8   Regional indicator            465 non-null    object 
 9   year                          465 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 40.0+ KB


In [29]:
data17.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155 entries, 0 to 154
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
 12  Regional indicator             154 non-null    obj

In [30]:
data17 = data17.drop(['Happiness.Rank', 'Whisker.high', 'Whisker.low', 'Dystopia.Residual'], axis=1)
data17.columns=['Country or region', 'Score', 'GDP per capita', 'Social support',
                'Healthy life expectancy', 'Freedom to make life choices','Generosity', 
                'Perceptions of corruption', 'Regional indicator']
data17.head()

Unnamed: 0,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator
0,Norway,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,Western Europe
1,Denmark,7.522,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,Western Europe
2,Iceland,7.504,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,Western Europe
3,Switzerland,7.494,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,Western Europe
4,Finland,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,Western Europe


In [31]:
data17[data17['Regional indicator'].isnull()]['Country or region']

70    Hong Kong S.A.R., China
Name: Country or region, dtype: object

In [32]:
data17.loc[70, 'Country or region'] = "Hong Kong"
data17.loc[70, 'Regional indicator'] = "East Asia"
data17['Regional indicator'].isnull().sum()

0

In [33]:
data17['year'] = 2017
data = pd.concat([data, data17])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 620 entries, 0 to 154
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country or region             620 non-null    object 
 1   Score                         620 non-null    float64
 2   GDP per capita                620 non-null    float64
 3   Social support                620 non-null    float64
 4   Healthy life expectancy       620 non-null    float64
 5   Freedom to make life choices  620 non-null    float64
 6   Generosity                    620 non-null    float64
 7   Perceptions of corruption     619 non-null    float64
 8   Regional indicator            620 non-null    object 
 9   year                          620 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 53.3+ KB


In [34]:
hp16 = pd.read_csv('data/2016.csv')
hp16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

### 2016 파일의 컬럼의 이름이 직관적 -> 데이터파일 컬럼이름에 적용

In [35]:
data.columns=['Country', 'Score', 'Economy(GDP_per_Cap)', 'Family', 'Health(Life_Expectancy)',
              'Freedom', 'Generosity', 'Trust(Government_Corruption)', 'Region', 'Year']
data.head()

Unnamed: 0,Country,Score,Economy(GDP_per_Cap),Family,Health(Life_Expectancy),Freedom,Generosity,Trust(Government_Corruption),Region,Year
0,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,Western Europe,2019
1,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,Western Europe,2019
2,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,Western Europe,2019
3,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,Western Europe,2019
4,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,Western Europe,2019


In [36]:
data16 = hp16.drop(['Happiness Rank', 'Lower Confidence Interval', 'Upper Confidence Interval',
                    'Dystopia Residual'], axis=1)
data16.head()

Unnamed: 0,Country,Region,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,Denmark,Western Europe,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171
1,Switzerland,Western Europe,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083
2,Iceland,Western Europe,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678
3,Norway,Western Europe,7.498,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895
4,Finland,Western Europe,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492


In [37]:
data16 = data16.rename(columns={'Happiness Score':'Score', 
                                'Economy (GDP per Capita)':'Economy(GDP_per_Cap)',
                                'Health (Life Expectancy)':'Health(Life_Expectancy)',
                                'Trust (Government Corruption)':'Trust(Government_Corruption)'})
data16.head()

Unnamed: 0,Country,Region,Score,Economy(GDP_per_Cap),Family,Health(Life_Expectancy),Freedom,Trust(Government_Corruption),Generosity
0,Denmark,Western Europe,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171
1,Switzerland,Western Europe,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083
2,Iceland,Western Europe,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678
3,Norway,Western Europe,7.498,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895
4,Finland,Western Europe,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492


In [38]:
data16['Year'] = 2016
data = pd.concat([data, data16])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 777 entries, 0 to 156
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country                       777 non-null    object 
 1   Score                         777 non-null    float64
 2   Economy(GDP_per_Cap)          777 non-null    float64
 3   Family                        777 non-null    float64
 4   Health(Life_Expectancy)       777 non-null    float64
 5   Freedom                       777 non-null    float64
 6   Generosity                    777 non-null    float64
 7   Trust(Government_Corruption)  776 non-null    float64
 8   Region                        777 non-null    object 
 9   Year                          777 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 66.8+ KB


In [39]:
hp15 = pd.read_csv('data/2015.csv')
hp15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [40]:
data15 = hp15.drop(['Happiness Rank', 'Standard Error', 'Dystopia Residual'], axis=1)
data15 = data15.rename(columns={'Happiness Score':'Score', 
                                'Economy (GDP per Capita)':'Economy(GDP_per_Cap)',
                                'Health (Life Expectancy)':'Health(Life_Expectancy)',
                                'Trust (Government Corruption)':'Trust(Government_Corruption)'})
data15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country                       158 non-null    object 
 1   Region                        158 non-null    object 
 2   Score                         158 non-null    float64
 3   Economy(GDP_per_Cap)          158 non-null    float64
 4   Family                        158 non-null    float64
 5   Health(Life_Expectancy)       158 non-null    float64
 6   Freedom                       158 non-null    float64
 7   Trust(Government_Corruption)  158 non-null    float64
 8   Generosity                    158 non-null    float64
dtypes: float64(7), object(2)
memory usage: 11.2+ KB


In [41]:
data15['Year'] = 2015
data = pd.concat([data, data15])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 935 entries, 0 to 157
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country                       935 non-null    object 
 1   Score                         935 non-null    float64
 2   Economy(GDP_per_Cap)          935 non-null    float64
 3   Family                        935 non-null    float64
 4   Health(Life_Expectancy)       935 non-null    float64
 5   Freedom                       935 non-null    float64
 6   Generosity                    935 non-null    float64
 7   Trust(Government_Corruption)  934 non-null    float64
 8   Region                        935 non-null    object 
 9   Year                          935 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 80.4+ KB


In [42]:
data[data['Trust(Government_Corruption)'].isnull()]['Country']

19    United Arab Emirates
Name: Country, dtype: object

In [43]:
mean_uae_trust = data[data['Country']=='United Arab Emirates']['Trust(Government_Corruption)'].mean()
data.loc[19,'Trust(Government_Corruption)'] = mean_uae_trust
data.isnull().sum()

Country                         0
Score                           0
Economy(GDP_per_Cap)            0
Family                          0
Health(Life_Expectancy)         0
Freedom                         0
Generosity                      0
Trust(Government_Corruption)    0
Region                          0
Year                            0
dtype: int64

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 935 entries, 0 to 157
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country                       935 non-null    object 
 1   Score                         935 non-null    float64
 2   Economy(GDP_per_Cap)          935 non-null    float64
 3   Family                        935 non-null    float64
 4   Health(Life_Expectancy)       935 non-null    float64
 5   Freedom                       935 non-null    float64
 6   Generosity                    935 non-null    float64
 7   Trust(Government_Corruption)  935 non-null    float64
 8   Region                        935 non-null    object 
 9   Year                          935 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 96.5+ KB


In [45]:
hp21 = pd.read_csv('data/world-happiness-report-2021.csv')
hp21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                149 non-null    object 
 1   Regional indicator                          149 non-null    object 
 2   Ladder score                                149 non-null    float64
 3   Standard error of ladder score              149 non-null    float64
 4   upperwhisker                                149 non-null    float64
 5   lowerwhisker                                149 non-null    float64
 6   Logged GDP per capita                       149 non-null    float64
 7   Social support                              149 non-null    float64
 8   Healthy life expectancy                     149 non-null    float64
 9   Freedom to make life choices                149 non-null    float64
 10  Generosity    

In [46]:
data21 = hp21[['Country name', 'Ladder score', 'Logged GDP per capita', 'Social support', 
               'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 
               'Perceptions of corruption', 'Regional indicator']].copy()

data21.columns=['Country', 'Score', 'Economy(GDP_per_Cap)', 'Family', 'Health(Life_Expectancy)',
              'Freedom', 'Generosity', 'Trust(Government_Corruption)', 'Region']
data21.head()

Unnamed: 0,Country,Score,Economy(GDP_per_Cap),Family,Health(Life_Expectancy),Freedom,Generosity,Trust(Government_Corruption),Region
0,Finland,7.842,10.775,0.954,72.0,0.949,-0.098,0.186,Western Europe
1,Denmark,7.62,10.933,0.954,72.7,0.946,0.03,0.179,Western Europe
2,Switzerland,7.571,11.117,0.942,74.4,0.919,0.025,0.292,Western Europe
3,Iceland,7.554,10.878,0.983,73.0,0.955,0.16,0.673,Western Europe
4,Netherlands,7.464,10.932,0.942,72.4,0.913,0.175,0.338,Western Europe


In [47]:
data21['Year'] = 2021
data = pd.concat([data, data21])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1084 entries, 0 to 148
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country                       1084 non-null   object 
 1   Score                         1084 non-null   float64
 2   Economy(GDP_per_Cap)          1084 non-null   float64
 3   Family                        1084 non-null   float64
 4   Health(Life_Expectancy)       1084 non-null   float64
 5   Freedom                       1084 non-null   float64
 6   Generosity                    1084 non-null   float64
 7   Trust(Government_Corruption)  1084 non-null   float64
 8   Region                        1084 non-null   object 
 9   Year                          1084 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 93.2+ KB


In [48]:
data = data[['Year', 'Region', 'Country', 'Score', 'Economy(GDP_per_Cap)', 'Family', 
             'Health(Life_Expectancy)', 'Freedom', 'Generosity', 'Trust(Government_Corruption)']]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1084 entries, 0 to 148
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Year                          1084 non-null   int64  
 1   Region                        1084 non-null   object 
 2   Country                       1084 non-null   object 
 3   Score                         1084 non-null   float64
 4   Economy(GDP_per_Cap)          1084 non-null   float64
 5   Family                        1084 non-null   float64
 6   Health(Life_Expectancy)       1084 non-null   float64
 7   Freedom                       1084 non-null   float64
 8   Generosity                    1084 non-null   float64
 9   Trust(Government_Corruption)  1084 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 93.2+ KB


In [49]:
data.to_csv('happiness.csv', index=False)

In [50]:
data_check = pd.read_csv('happiness.csv')
data_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1084 entries, 0 to 1083
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Year                          1084 non-null   int64  
 1   Region                        1084 non-null   object 
 2   Country                       1084 non-null   object 
 3   Score                         1084 non-null   float64
 4   Economy(GDP_per_Cap)          1084 non-null   float64
 5   Family                        1084 non-null   float64
 6   Health(Life_Expectancy)       1084 non-null   float64
 7   Freedom                       1084 non-null   float64
 8   Generosity                    1084 non-null   float64
 9   Trust(Government_Corruption)  1084 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 84.8+ KB
