# 2020 Finland Data Exploration

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns
import datetime as dt

import warnings
# NOTE(review): with no category argument this silences every warning for all
# cells that follow (including the statsmodels import on the next line), so
# pandas warnings raised by the cleaning steps are hidden too — consider
# narrowing the filter.
warnings.simplefilter('ignore')
import statsmodels.formula.api as smf

## Purpose of this notebook

- Initial cleaning and inspection of the raw data files for 2020 happiness ratings and COVID-19 cases in Finland
- create cleaned versions of files for analysis notebook

## Load datasets into dataframes - Finland

In [2]:
# Locations of the two raw 2020 inputs, relative to this notebook.
hap_csv = '../data/raw_data/2020/finland.csv'
covid_csv = '../data/raw_data/2020/covid19_cases_global.csv'

# Read both files with pandas' default CSV settings.
finland_hap = pd.read_csv(hap_csv)
covid_global = pd.read_csv(covid_csv)

## Start with Finland dataset for happiness

In [3]:
# Preview the raw survey frame; pandas abbreviates the hidden rows/columns with "...".
finland_hap

Unnamed: 0,RecordNo,endtime,qweek,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,...,Soc2_6,Soc2_open,vac_1,vac_2,vac2_1,vac2_2,vac2_3,vac2_4,vac2_5,vac2_6
0,0,08/04/2020 14:14,week 1,2,2,0,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
1,1,08/04/2020 14:17,week 1,0,30,2,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
2,2,08/04/2020 14:16,week 1,1,0,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
3,3,08/04/2020 14:18,week 1,0,50,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
4,4,08/04/2020 14:25,week 1,0,3,2,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16058,16058,16/11/2020 22:45,week 16,1,3,2,"No, I have not","No, they have not",No,No,...,,__NA__,5 – Strongly disagree,3,3,1 - Strongly agree,2,3,3,3
16059,16059,16/11/2020 23:19,week 16,0,0,0,"No, I have not","No, they have not",No,No,...,,__NA__,3,3,2,2,4,3,4,3
16060,16060,17/11/2020 05:21,week 16,0,6,0,"No, I have not","No, they have not",No,No,...,,__NA__,3,3,1 - Strongly agree,1 - Strongly agree,3,3,3,3
16061,16061,17/11/2020 06:03,week 16,1,20,1,"No, I have not","No, they have not",No,No,...,,__NA__,2,1 - Strongly agree,2,2,1 - Strongly agree,2,5 – Strongly disagree,2


## Selecting and renaming columns relevant for this analysis

In [4]:
# Raw survey columns carried forward; the coded names are mapped to readable
# ones by the rename cell that follows.
survey_cols = [
    'RecordNo', 'endtime', 'region', 'qweek', 'i3_health', 'd1_health_12',
    'age', 'gender', 'WCRex2', 'CORE_B2_4', 'cantril_ladder',
    'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4', 'WCRex1',
    'r1_1', 'r1_7',
    'ox3_1', 'ox3_2', 'ox3_3', 'ox3_6', 'ox5_2', 'ox6',
    'w2', 'w3', 'w4_1', 'w4_2', 'w4_7', 'w4_8', 'w4_9',
    'w9_1', 'w9_2',
]
finland_hap_df = finland_hap[survey_cols]

In [5]:
# Mapping from the survey's coded column names to readable snake_case names.
# Columns not listed here ('region', 'age', 'gender') keep their original name.
col_dict = {
    # record identifiers / timing
    'RecordNo': 'record_no',
    'endtime': 'date',
    'qweek': 'week',
    # health items
    'i3_health': 'covid_test',
    'd1_health_12': 'mental_health_condition',
    # confidence / happiness items
    'WCRex2': 'confidence_NHS',
    'CORE_B2_4': 'happiness_2wk_comparison',
    'cantril_ladder': 'happiness_rating',
    # PHQ4_* items
    'PHQ4_1': 'little_interest',
    'PHQ4_2': 'negative_feelings',
    'PHQ4_3': 'anxiety',
    'PHQ4_4': 'worry',
    'WCRex1': 'gvt_rating',
    # r1_* items
    'r1_1': 'perceptions_covid',
    'r1_7': 'covid_effect',
    # ox* items
    'ox3_1': 'social_norms',
    'ox3_2': 'social_expectations',
    'ox3_3': 'appropriate_behaviors',
    'ox3_6': 'follow_norms',
    'ox5_2': 'trust_gvt',
    'ox6': 'public_sector',
    # w* items
    'w2': 'community_belonging',
    'w3': 'belonging_change',
    'w4_1': 'spoken_mental',
    'w4_2': 'medication_mental',
    'w4_7': 'seek_mental',
    'w4_8': 'professional_mental',
    'w4_9': 'readings_mental',
    'w9_1': 'cheerful',
    'w9_2': 'relaxed',
}

# rename() returns a new frame; only the column labels change.
finland_hap_df = finland_hap_df.rename(columns=col_dict)

In [6]:
# Check the selected columns under their new names.
finland_hap_df

Unnamed: 0,record_no,date,region,week,covid_test,mental_health_condition,age,gender,confidence_NHS,happiness_2wk_comparison,...,public_sector,community_belonging,belonging_change,spoken_mental,medication_mental,seek_mental,professional_mental,readings_mental,cheerful,relaxed
0,0,08/04/2020 14:14,Länsi-Suomi,week 1,"No, I have not",No,55,Male,,,...,,,,,,,,,,
1,1,08/04/2020 14:17,Pohjois- ja Itä-Suomi,week 1,"No, I have not",No,63,Male,,,...,,,,,,,,,,
2,2,08/04/2020 14:16,Etelä-Suomi,week 1,"No, I have not",No,65,Female,,,...,,,,,,,,,,
3,3,08/04/2020 14:18,Länsi-Suomi,week 1,"No, I have not",No,29,Male,,,...,,,,,,,,,,
4,4,08/04/2020 14:25,Etelä-Suomi,week 1,"No, I have not",No,64,Male,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16058,16058,16/11/2020 22:45,Pohjois- ja Itä-Suomi,week 16,"No, I have not",No,72,Female,A fair amount of confidence,,...,,,,,,,,,,
16059,16059,16/11/2020 23:19,Pohjois- ja Itä-Suomi,week 16,"No, I have not",No,40,Male,Don't know,,...,,,,,,,,,,
16060,16060,17/11/2020 05:21,Pohjois- ja Itä-Suomi,week 16,"No, I have not",Yes,60,Female,A fair amount of confidence,,...,,,,,,,,,,
16061,16061,17/11/2020 06:03,Pohjois- ja Itä-Suomi,week 16,"No, I have not",No,61,Female,A lot of confidence,,...,,,,,,,,,,


## Filtering columns

In [7]:
# Columns kept for the cleaned happiness dataset, in output order.
cols_filter = [
    'date',
    'week',
    'region',
    'mental_health_condition',
    'age',
    'gender',
    'happiness_rating',
]

In [8]:
# Keep only the analysis columns. .copy() gives an independent frame, because the
# cells below modify it (row removal, column reassignment) and those writes
# should not target a slice of finland_hap_df.
finland_hap_filter = finland_hap_df[cols_filter].copy()
finland_hap_filter

Unnamed: 0,date,week,region,mental_health_condition,age,gender,happiness_rating
0,08/04/2020 14:14,week 1,Länsi-Suomi,No,55,Male,
1,08/04/2020 14:17,week 1,Pohjois- ja Itä-Suomi,No,63,Male,
2,08/04/2020 14:16,week 1,Etelä-Suomi,No,65,Female,
3,08/04/2020 14:18,week 1,Länsi-Suomi,No,29,Male,
4,08/04/2020 14:25,week 1,Etelä-Suomi,No,64,Male,
...,...,...,...,...,...,...,...
16058,16/11/2020 22:45,week 16,Pohjois- ja Itä-Suomi,No,72,Female,8
16059,16/11/2020 23:19,week 16,Pohjois- ja Itä-Suomi,No,40,Male,4
16060,17/11/2020 05:21,week 16,Pohjois- ja Itä-Suomi,Yes,60,Female,2
16061,17/11/2020 06:03,week 16,Pohjois- ja Itä-Suomi,No,61,Female,8


In [9]:
## No happiness rating data is available for week 1, so exclude those rows.
# A boolean mask plus .copy() builds a new, independent frame (original index
# labels are kept) instead of dropping in place on a frame that was sliced from
# finland_hap_df.
finland_hap_filter = finland_hap_filter.loc[finland_hap_filter['week'] != 'week 1'].copy()
finland_hap_filter

Unnamed: 0,date,week,region,mental_health_condition,age,gender,happiness_rating
1008,27/04/2020 15:34,week 2,Etelä-Suomi,Yes,40,Male,5
1009,27/04/2020 15:36,week 2,Länsi-Suomi,No,43,Male,7
1010,27/04/2020 15:37,week 2,Etelä-Suomi,No,20,Female,8
1011,27/04/2020 15:39,week 2,Etelä-Suomi,No,52,Male,0
1012,27/04/2020 15:41,week 2,Etelä-Suomi,No,47,Male,7
...,...,...,...,...,...,...,...
16058,16/11/2020 22:45,week 16,Pohjois- ja Itä-Suomi,No,72,Female,8
16059,16/11/2020 23:19,week 16,Pohjois- ja Itä-Suomi,No,40,Male,4
16060,17/11/2020 05:21,week 16,Pohjois- ja Itä-Suomi,Yes,60,Female,2
16061,17/11/2020 06:03,week 16,Pohjois- ja Itä-Suomi,No,61,Female,8


In [10]:
# The survey timestamps are day-first strings such as '27/04/2020 15:34'.
# dayfirst=True makes ambiguous values (day <= 12) parse as DD/MM rather than
# with the month-first default. Only the calendar date is kept (.dt.date).
finland_hap_filter['date'] = pd.to_datetime(finland_hap_filter['date'], dayfirst=True).dt.date

In [11]:
# Store the ratings as floats in the same column.
ratings_as_float = finland_hap_filter['happiness_rating'].astype(float)
finland_hap_filter['happiness_rating'] = ratings_as_float

In [12]:
# Confirm the result of the date and rating conversions.
finland_hap_filter

Unnamed: 0,date,week,region,mental_health_condition,age,gender,happiness_rating
1008,2020-04-27,week 2,Etelä-Suomi,Yes,40,Male,5.0
1009,2020-04-27,week 2,Länsi-Suomi,No,43,Male,7.0
1010,2020-04-27,week 2,Etelä-Suomi,No,20,Female,8.0
1011,2020-04-27,week 2,Etelä-Suomi,No,52,Male,0.0
1012,2020-04-27,week 2,Etelä-Suomi,No,47,Male,7.0
...,...,...,...,...,...,...,...
16058,2020-11-16,week 16,Pohjois- ja Itä-Suomi,No,72,Female,8.0
16059,2020-11-16,week 16,Pohjois- ja Itä-Suomi,No,40,Male,4.0
16060,2020-11-17,week 16,Pohjois- ja Itä-Suomi,Yes,60,Female,2.0
16061,2020-11-17,week 16,Pohjois- ja Itä-Suomi,No,61,Female,8.0


## Exported to clean csv titled finland_hap_cleaned.csv

In [13]:
# Write the cleaned survey frame to disk; to_csv's defaults are used, so the
# row index is written as the first column.
hap_clean_path = '../data/cleaned_data/2020/finland_hap_cleaned.csv'
finland_hap_filter.to_csv(hap_clean_path)

## Now looking at COVID-19 cases data - Finland

In [14]:
# Preview the global case table: one column per date, starting at 1/22/20.
covid_global

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,45174,45384,45600,45723,45844,46116,46274,46717,46980,46718
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,34300,34944,35600,36245,36790,37625,38182,39014,39719,40501
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,77000,78025,79110,80168,81212,82221,83199,84152,85084,85927
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6351,6428,6534,6610,6610,6712,6745,6790,6842,6904
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,14742,14821,14920,15008,15087,15103,15139,15251,15319,15361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,75007,76727,78493,80429,81890,83585,85647,88004,90192,92708
267,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2114,2124,2137,2148,2160,2177,2191,2197,2217,2239
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17466,17535,17553,17569,17589,17608,17647,17665,17700,17730


## Selecting rows that correspond to Finland

In [15]:
# Boolean mask marking the rows whose Country/Region is exactly 'Finland'.
is_finland = covid_global['Country/Region'] == 'Finland'
finland_cov = covid_global.loc[is_finland]
finland_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
117,,Finland,61.92411,25.748151,0,0,0,0,0,0,...,22289,22652,23148,23766,24307,24629,24912,25462,25882,26422


In [16]:
# The first four columns (Province/State, Country/Region, Lat, Long) are
# identifiers; every column after them is a daily case count labelled by date.
n_id_cols = 4
date_cols = finland_cov.columns[n_id_cols:]
date_cols

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/24/20', '11/25/20', '11/26/20', '11/27/20', '11/28/20', '11/29/20',
       '11/30/20', '12/1/20', '12/2/20', '12/3/20'],
      dtype='object', length=317)

In [17]:
# Reshape wide -> long: the four leading identifier columns are repeated on
# every row, and each date column becomes one ('date', 'cases') row.
id_cols = finland_cov.columns[:4]
finland_cov = finland_cov.melt(
    id_vars=id_cols,
    value_vars=date_cols,
    var_name='date',
    value_name='cases',
)

In [18]:
# Inspect the long-format frame: one row per date with its case count.
finland_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,cases
0,,Finland,61.92411,25.748151,1/22/20,0
1,,Finland,61.92411,25.748151,1/23/20,0
2,,Finland,61.92411,25.748151,1/24/20,0
3,,Finland,61.92411,25.748151,1/25/20,0
4,,Finland,61.92411,25.748151,1/26/20,0
...,...,...,...,...,...,...
312,,Finland,61.92411,25.748151,11/29/20,24629
313,,Finland,61.92411,25.748151,11/30/20,24912
314,,Finland,61.92411,25.748151,12/1/20,25462
315,,Finland,61.92411,25.748151,12/2/20,25882


In [19]:
# The former column labels are month-first strings such as '1/22/20';
# convert them to pandas datetimes.
parsed_dates = pd.to_datetime(finland_cov['date'])
finland_cov['date'] = parsed_dates

## Exported to clean csv titled finland_covid.csv

In [20]:
# Write the long-format Finland case counts to disk; to_csv's defaults are
# used, so the row index is written as the first column.
covid_clean_path = '../data/cleaned_data/2020/finland_covid.csv'
finland_cov.to_csv(covid_clean_path)