# 2020 Denmark Data Exploration and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns

import warnings
warnings.simplefilter('ignore')
import statsmodels.formula.api as smf

## Purpose of this notebook

- Take a first look at, and clean, the raw data files: the 2020 happiness ratings for Denmark and the COVID-19 case counts for Denmark
- Create cleaned versions of these files for use in the analysis notebook

## Load datasets into dataframes

In [2]:
# Raw 2020 Denmark file that holds the happiness ratings.
denmark_hap = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/denmark.csv')
# Raw global COVID-19 case file; Denmark is extracted from it further down.
covid_global = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/covid19_cases_global.csv')

## Start with Denmark dataset for happiness ratings

In [3]:
# Preview the Denmark frame exactly as it was read from the raw CSV.
denmark_hap

Unnamed: 0,RecordNo,endtime,qweek,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,...,Soc2_6,Soc2_open,vac_1,vac_2,vac2_1,vac2_2,vac2_3,vac2_4,vac2_5,vac2_6
0,0,09/04/2020 10:42,week 1,0,3,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
1,1,09/04/2020 10:44,week 1,3,3,4,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
2,2,09/04/2020 10:44,week 1,2,2,3,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
3,3,09/04/2020 10:45,week 1,1,0,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
4,4,09/04/2020 10:46,week 1,5,5,5,"Yes, and I have not received my results from t...",,No,No,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16048,16048,16/11/2020 15:48,week 16,2,5,4,"No, I have not","Yes, and they tested negative",No,No,...,,__NA__,2,1 - Strongly agree,4,2,1 - Strongly agree,3,3,2
16049,16049,16/11/2020 15:50,week 16,1,9,1,"No, I have not","No, they have not",No,No,...,No,__NA__,2,1 - Strongly agree,2,4,2,3,2,1 - Strongly agree
16050,16050,16/11/2020 15:51,week 16,0,4,1,"No, I have not","No, they have not",No,No,...,,__NA__,1 - Strongly agree,1 - Strongly agree,4,3,4,1 - Strongly agree,1 - Strongly agree,1 - Strongly agree
16051,16051,16/11/2020 15:52,week 16,2,50,2,"No, I have not","Yes, and they tested negative",No,No,...,,__NA__,4,3,5 – Strongly disagree,3,2,2,3,5 – Strongly disagree


## Selecting and renaming columns relevant for this data analysis

In [4]:
# Keep only the columns used in this project; they still carry their
# original names at this point and are renamed in the next cell.
denmark_hap = denmark_hap.loc[:, [
    'RecordNo', 'endtime', 'region', 'qweek',
    'i3_health', 'd1_health_12',
    'age', 'gender',
    'WCRex2', 'CORE_B2_4', 'cantril_ladder',
    'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4',
    'WCRex1',
    'r1_1', 'r1_7',
    'ox3_1', 'ox3_2', 'ox3_3', 'ox3_6', 'ox5_2', 'ox6',
    'w2', 'w3',
    'w4_1', 'w4_2', 'w4_7', 'w4_8', 'w4_9',
    'w9_1', 'w9_2',
]]

In [5]:
# Mapping from original column name -> readable column name.
# Columns that are not listed (region, age, gender) keep their names.
col_dict = {
    # record identifiers and timing
    'RecordNo': 'record_no',
    'endtime': 'date',
    'qweek': 'week',
    # *_health items
    'i3_health': 'covid_test',
    'd1_health_12': 'mental_health_condition',
    # confidence / happiness items
    'WCRex2': 'confidence_NHS',
    'CORE_B2_4': 'happiness_2wk_comparison',
    'cantril_ladder': 'happiness_rating',
    # PHQ4_* items
    'PHQ4_1': 'little_interest',
    'PHQ4_2': 'negative_feelings',
    'PHQ4_3': 'anxiety',
    'PHQ4_4': 'worry',
    'WCRex1': 'gvt_rating',
    # r1_* items
    'r1_1': 'perceptions_covid',
    'r1_7': 'covid_effect',
    # ox* items
    'ox3_1': 'social_norms',
    'ox3_2': 'social_expectations',
    'ox3_3': 'appropriate_behaviors',
    'ox3_6': 'follow_norms',
    'ox5_2': 'trust_gvt',
    'ox6': 'public_sector',
    # w* items
    'w2': 'community_belonging',
    'w3': 'belonging_change',
    'w4_1': 'spoken_mental',
    'w4_2': 'medication_mental',
    'w4_7': 'seek_mental',
    'w4_8': 'professional_mental',
    'w4_9': 'readings_mental',
    'w9_1': 'cheerful',
    'w9_2': 'relaxed',
}
# Apply the mapping along the column axis; a new frame is returned and rebound.
denmark_hap = denmark_hap.rename(col_dict, axis='columns')

In [6]:
# Check the frame after column selection and renaming.
denmark_hap

Unnamed: 0,record_no,date,region,week,covid_test,mental_health_condition,age,gender,confidence_NHS,happiness_2wk_comparison,...,public_sector,community_belonging,belonging_change,spoken_mental,medication_mental,seek_mental,professional_mental,readings_mental,cheerful,relaxed
0,0,09/04/2020 10:42,Nordjylland,week 1,"No, I have not",No,25,Male,,,...,,,,,,,,,,
1,1,09/04/2020 10:44,Syddanmark,week 1,"No, I have not",No,42,Male,,,...,,,,,,,,,,
2,2,09/04/2020 10:44,Midtjylland,week 1,"No, I have not",No,63,Male,,,...,,,,,,,,,,
3,3,09/04/2020 10:45,Midtjylland,week 1,"No, I have not",No,30,Female,,,...,,,,,,,,,,
4,4,09/04/2020 10:46,Nordjylland,week 1,"Yes, and I have not received my results from t...",No,30,Male,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16048,16048,16/11/2020 15:48,Midtjylland,week 16,"No, I have not",No,56,Male,A fair amount of confidence,,...,,,,,,,,,,
16049,16049,16/11/2020 15:50,Midtjylland,week 16,"No, I have not",No,68,Female,A lot of confidence,,...,,,,,,,,,,
16050,16050,16/11/2020 15:51,Syddanmark,week 16,"No, I have not",No,56,Female,A lot of confidence,,...,,,,,,,,,,
16051,16051,16/11/2020 15:52,Sjælland,week 16,"No, I have not",No,20,Male,A lot of confidence,,...,,,,,,,,,,


## Filtering columns further 

In [7]:
# Columns kept for the cleaned happiness dataset.
cols_filter = ['date','week','region','mental_health_condition','age','gender','happiness_rating']
# Take an explicit copy: later cells drop rows from and reassign columns on
# this frame, and doing that on a bare selection of denmark_hap is the
# pattern pandas flags with SettingWithCopyWarning (silenced in this notebook
# by warnings.simplefilter('ignore')). The copy makes the frame independent.
denmark_hap_filter = denmark_hap[cols_filter].copy()
denmark_hap_filter

Unnamed: 0,date,week,region,mental_health_condition,age,gender,happiness_rating
0,09/04/2020 10:42,week 1,Nordjylland,No,25,Male,
1,09/04/2020 10:44,week 1,Syddanmark,No,42,Male,
2,09/04/2020 10:44,week 1,Midtjylland,No,63,Male,
3,09/04/2020 10:45,week 1,Midtjylland,No,30,Female,
4,09/04/2020 10:46,week 1,Nordjylland,No,30,Male,
...,...,...,...,...,...,...,...
16048,16/11/2020 15:48,week 16,Midtjylland,No,56,Male,8
16049,16/11/2020 15:50,week 16,Midtjylland,No,68,Female,9
16050,16/11/2020 15:51,week 16,Syddanmark,No,56,Female,7
16051,16/11/2020 15:52,week 16,Sjælland,No,20,Male,7


In [8]:
## Since no happiness rating data available for week 1, drop rows corresponding to this timeframe
# Keep every row whose week label is not 'week 1' and rebind the name, rather
# than dropping in place on a frame that was derived from denmark_hap. The
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a slice.
denmark_hap_filter = denmark_hap_filter.loc[denmark_hap_filter['week'] != 'week 1'].copy()

In [9]:
# The survey timestamps are written day-first (e.g. "16/11/2020 15:48").
# pd.to_datetime defaults to month-first, so without dayfirst=True an
# ambiguous value whose day is 12 or less can be read with day and month
# swapped. Parse day-first, then keep only the calendar date.
denmark_hap_filter['date'] = pd.to_datetime(denmark_hap_filter['date'], dayfirst=True).dt.date

In [10]:
# Cast the happiness rating column to float; all other columns are left as they are.
denmark_hap_filter = denmark_hap_filter.astype({'happiness_rating': float})

In [11]:
# Check the cleaned frame: week 1 rows removed, dates parsed, ratings as floats.
denmark_hap_filter

Unnamed: 0,date,week,region,mental_health_condition,age,gender,happiness_rating
1006,2020-04-28,week 2,Nordjylland,No,29,Female,7.0
1007,2020-04-28,week 2,Sjælland,No,34,Male,5.0
1008,2020-04-28,week 2,Nordjylland,No,33,Male,3.0
1009,2020-04-28,week 2,Syddanmark,No,56,Female,8.0
1010,2020-04-28,week 2,Hovedstaden,No,22,Female,7.0
...,...,...,...,...,...,...,...
16048,2020-11-16,week 16,Midtjylland,No,56,Male,8.0
16049,2020-11-16,week 16,Midtjylland,No,68,Female,9.0
16050,2020-11-16,week 16,Syddanmark,No,56,Female,7.0
16051,2020-11-16,week 16,Sjælland,No,20,Male,7.0


## Export the cleaned data to denmark_hap_cleaned.csv

In [12]:
# Write the cleaned happiness frame to disk for the analysis notebook.
denmark_hap_filter.to_csv(path_or_buf='../data/cleaned_data/2020/denmark_hap_cleaned.csv')

## Now looking at COVID-19 cases data

In [13]:
# Preview the global COVID-19 case frame as loaded from the raw CSV.
covid_global

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,45174,45384,45600,45723,45844,46116,46274,46717,46980,46718
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,34300,34944,35600,36245,36790,37625,38182,39014,39719,40501
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,77000,78025,79110,80168,81212,82221,83199,84152,85084,85927
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6351,6428,6534,6610,6610,6712,6745,6790,6842,6904
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,14742,14821,14920,15008,15087,15103,15139,15251,15319,15361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,75007,76727,78493,80429,81890,83585,85647,88004,90192,92708
267,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2114,2124,2137,2148,2160,2177,2191,2197,2217,2239
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17466,17535,17553,17569,17589,17608,17647,17665,17700,17730


## Selecting rows that correspond to Denmark and cleaning df

In [14]:
# Keep the rows whose 'Country/Region' value equals 'Denmark'.
# As the output below shows, this yields three rows: Faroe Islands,
# Greenland, and the row with no Province/State value.
denmark_cov = covid_global[covid_global['Country/Region'].eq('Denmark')]
denmark_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
101,Faroe Islands,Denmark,61.8926,-6.9118,0,0,0,0,0,0,...,500,500,500,500,502,502,503,503,503,505
102,Greenland,Denmark,71.7069,-42.6043,0,0,0,0,0,0,...,18,18,18,18,18,18,18,18,18,18
103,,Denmark,56.2639,9.5018,0,0,0,0,0,0,...,73021,74204,75395,76718,78354,79352,80481,81949,83535,85140


In [15]:
# Skip the first four columns (Province/State, Country/Region, Lat, Long);
# what remains are the per-day date labels ('1/22/20' ... '12/3/20').
date_cols = denmark_cov.columns[4:]
date_cols

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/24/20', '11/25/20', '11/26/20', '11/27/20', '11/28/20', '11/29/20',
       '11/30/20', '12/1/20', '12/2/20', '12/3/20'],
      dtype='object', length=317)

In [16]:
# Reshape from wide (one column per date) to long (one row per
# Province/State and date): the first four columns are kept as identifiers,
# the date labels go into 'date' and their values into 'cases'.
denmark_cov = pd.melt(
    denmark_cov,
    id_vars=denmark_cov.columns[:4],
    value_vars=date_cols,
    var_name='date',
    value_name='cases',
)
denmark_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,cases
0,Faroe Islands,Denmark,61.8926,-6.9118,1/22/20,0
1,Greenland,Denmark,71.7069,-42.6043,1/22/20,0
2,,Denmark,56.2639,9.5018,1/22/20,0
3,Faroe Islands,Denmark,61.8926,-6.9118,1/23/20,0
4,Greenland,Denmark,71.7069,-42.6043,1/23/20,0
...,...,...,...,...,...,...
946,Greenland,Denmark,71.7069,-42.6043,12/2/20,18
947,,Denmark,56.2639,9.5018,12/2/20,83535
948,Faroe Islands,Denmark,61.8926,-6.9118,12/3/20,505
949,Greenland,Denmark,71.7069,-42.6043,12/3/20,18


In [17]:
# The date labels are month/day/two-digit-year strings such as '1/22/20'.
# Stating the format explicitly avoids relying on pandas' format inference
# and makes an unexpected label raise instead of being guessed at.
denmark_cov['date'] = pd.to_datetime(denmark_cov['date'], format='%m/%d/%y')

## Export the cleaned data to denmark_covid.csv

In [18]:
# Write the long-format Denmark COVID-19 frame to disk for the analysis notebook.
denmark_cov.to_csv(path_or_buf='../data/cleaned_data/2020/denmark_covid.csv')