In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
hp20 = pd.read_csv('data/2020.csv')
hp19 = pd.read_csv('data/2019.csv')

In [3]:
hp20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                153 non-null    object 
 1   Regional indicator                          153 non-null    object 
 2   Ladder score                                153 non-null    float64
 3   Standard error of ladder score              153 non-null    float64
 4   upperwhisker                                153 non-null    float64
 5   lowerwhisker                                153 non-null    float64
 6   Logged GDP per capita                       153 non-null    float64
 7   Social support                              153 non-null    float64
 8   Healthy life expectancy                     153 non-null    float64
 9   Freedom to make life choices                153 non-null    float64
 10  Generosity    

In [4]:
hp19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


### 사용할 속성
|20|19|
|---|---|
|Country name|Country or region|
|Regional indicator|.|
|Ladder score|Score|
|Logged GDP per capita|GDP per capita|
|Social support|Social support|
|Healthy life expectancy|Healthy life expectancy|
|Freedom to make life choices|Freedom to make life choices|
|Generosity|Generosity|
|Perceptions of corruption|Perceptions of corruption|

In [5]:
# 20년도의 대륙 데이터를 19년도에 이용해보기
print('20년도 나라 수 :', len(hp20['Country name'].unique()))
print('19년도 나라 수 :', len(hp19['Country or region'].unique()))

20년도 나라 수 : 153
19년도 나라 수 : 156


In [6]:
match_continent = {}
for name in hp19['Country or region']:
    tmp = hp20[hp20['Country name']==name]['Regional indicator'].tolist()
    if len(tmp):
        match_continent[name] = tmp[0]
match_continent

{'Finland': 'Western Europe',
 'Denmark': 'Western Europe',
 'Norway': 'Western Europe',
 'Iceland': 'Western Europe',
 'Netherlands': 'Western Europe',
 'Switzerland': 'Western Europe',
 'Sweden': 'Western Europe',
 'New Zealand': 'North America and ANZ',
 'Canada': 'North America and ANZ',
 'Austria': 'Western Europe',
 'Australia': 'North America and ANZ',
 'Costa Rica': 'Latin America and Caribbean',
 'Israel': 'Middle East and North Africa',
 'Luxembourg': 'Western Europe',
 'United Kingdom': 'Western Europe',
 'Ireland': 'Western Europe',
 'Germany': 'Western Europe',
 'Belgium': 'Western Europe',
 'United States': 'North America and ANZ',
 'Czech Republic': 'Central and Eastern Europe',
 'United Arab Emirates': 'Middle East and North Africa',
 'Malta': 'Western Europe',
 'Mexico': 'Latin America and Caribbean',
 'France': 'Western Europe',
 'Chile': 'Latin America and Caribbean',
 'Guatemala': 'Latin America and Caribbean',
 'Saudi Arabia': 'Middle East and North Africa',
 'Spai

In [7]:
match_df = pd.DataFrame.from_dict(match_continent, orient='index', columns=['Regional indicator'])
match_df = match_df.reset_index().rename(columns={'index':'Country or region'})

In [8]:
match_df['Country or region'][:5]

0        Finland
1        Denmark
2         Norway
3        Iceland
4    Netherlands
Name: Country or region, dtype: object

In [9]:
merged19 = pd.merge(hp19, match_df, on=['Country or region'], how='left')
merged19

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator
0,1,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393,Western Europe
1,2,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410,Western Europe
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,Western Europe
3,4,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118,Western Europe
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,Western Europe
...,...,...,...,...,...,...,...,...,...,...
151,152,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411,Sub-Saharan Africa
152,153,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147,Sub-Saharan Africa
153,154,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025,South Asia
154,155,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035,Sub-Saharan Africa


In [10]:
merged19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 0 to 155
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
 9   Regional indicator            147 non-null    object 
dtypes: float64(7), int64(1), object(2)
memory usage: 13.4+ KB


결측값!

In [11]:
merged19[merged19['Regional indicator'].isnull()]['Country or region']

24                Taiwan
28                 Qatar
38     Trinidad & Tobago
63       Northern Cyprus
75             Hong Kong
83       North Macedonia
94                Bhutan
111              Somalia
148                Syria
Name: Country or region, dtype: object

In [12]:
merged19['Regional indicator'].unique()

array(['Western Europe', 'North America and ANZ',
       'Latin America and Caribbean', 'Middle East and North Africa',
       'Central and Eastern Europe', nan, 'Southeast Asia',
       'Commonwealth of Independent States', 'East Asia',
       'Sub-Saharan Africa', 'South Asia'], dtype=object)

이렇게 해도 됨
```python
merged19['Country or region'].fillna('others')
```
별로 없어서 직접 추가

In [13]:
merged19.loc[merged19['Country or region']=='Taiwan', 'Regional indicator'] = "East Asia"
merged19.loc[merged19['Country or region']=='Qatar', 'Regional indicator'] = "West Asia"
merged19.loc[merged19['Country or region']=='Trinidad & Tobago', 'Regional indicator'] = "North America and ANZ"
merged19.loc[merged19['Country or region']=='Northern Cyprus', 'Regional indicator'] = "West Asia"
merged19.loc[merged19['Country or region']=='Hong Kong', 'Regional indicator'] = "East Asia"
merged19.loc[merged19['Country or region']=='North Macedonia', 'Regional indicator'] = "Southen Europe"
merged19.loc[merged19['Country or region']=='Bhutan', 'Regional indicator'] = "Southeast Asia"
merged19.loc[merged19['Country or region']=='Somalia', 'Regional indicator'] = "East Africa"
merged19.loc[merged19['Country or region']=='Syria', 'Regional indicator'] = "West Asia"

In [14]:
merged19['Regional indicator'].isnull().sum()

0

In [15]:
data19 = merged19.drop(['Overall rank'], axis=1)
data19['year'] = 2019
data19.head()

Unnamed: 0,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator,year
0,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,Western Europe,2019
1,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,Western Europe,2019
2,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,Western Europe,2019
3,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,Western Europe,2019
4,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,Western Europe,2019


In [16]:
# copy를 쓰지 않으면 경고 발생 : SettingWithCopyWarning
data20 = hp20[['Country name', 'Regional indicator', 'Ladder score', 'Logged GDP per capita', 
               'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 
               'Generosity', 'Perceptions of corruption']].copy()

data20.columns=['Country or region', 'Regional indicator', 'Score', 'GDP per capita', 'Social support',
                'Healthy life expectancy', 'Freedom to make life choices','Generosity', 
                'Perceptions of corruption']
data20['year'] = 2020
data20.head()

Unnamed: 0,Country or region,Regional indicator,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,year
0,Finland,Western Europe,7.8087,10.639267,0.95433,71.900825,0.949172,-0.059482,0.195445,2020
1,Denmark,Western Europe,7.6456,10.774001,0.955991,72.402504,0.951444,0.066202,0.168489,2020
2,Switzerland,Western Europe,7.5599,10.979933,0.942847,74.102448,0.921337,0.105911,0.303728,2020
3,Iceland,Western Europe,7.5045,10.772559,0.97467,73.0,0.948892,0.246944,0.71171,2020
4,Norway,Western Europe,7.488,11.087804,0.952487,73.200783,0.95575,0.134533,0.263218,2020


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html 참고

In [17]:
data = pd.concat([data19, data20])
data

Unnamed: 0,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Regional indicator,year
0,Finland,7.7690,1.340000,1.587000,0.986000,0.596000,0.153000,0.393000,Western Europe,2019
1,Denmark,7.6000,1.383000,1.573000,0.996000,0.592000,0.252000,0.410000,Western Europe,2019
2,Norway,7.5540,1.488000,1.582000,1.028000,0.603000,0.271000,0.341000,Western Europe,2019
3,Iceland,7.4940,1.380000,1.624000,1.026000,0.591000,0.354000,0.118000,Western Europe,2019
4,Netherlands,7.4880,1.396000,1.522000,0.999000,0.557000,0.322000,0.298000,Western Europe,2019
...,...,...,...,...,...,...,...,...,...,...
148,Central African Republic,3.4759,6.625160,0.319460,45.200001,0.640881,0.082410,0.891807,Sub-Saharan Africa,2020
149,Rwanda,3.3123,7.600104,0.540835,61.098846,0.900589,0.055484,0.183541,Sub-Saharan Africa,2020
150,Zimbabwe,3.2992,7.865712,0.763093,55.617260,0.711458,-0.072064,0.810237,Sub-Saharan Africa,2020
151,South Sudan,2.8166,7.425360,0.553707,51.000000,0.451314,0.016519,0.763417,Sub-Saharan Africa,2020
