# 2020 Malaysia Data Exploration and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns

import warnings
warnings.simplefilter('ignore')
import statsmodels.formula.api as smf

## Purpose of this notebook

- Perform an initial cleaning and exploration of the raw data files: 2020 happiness ratings for Malaysia and COVID-19 case counts for Malaysia
- Create cleaned versions of these files for the analysis notebook

## Load datasets into dataframes

In [2]:
# Read both raw 2020 inputs: the Malaysia happiness file and the global COVID-19 case file.
malaysia_hap, covid_global = (
    pd.read_csv(raw_path)
    for raw_path in (
        '../data/raw_data/2020/malaysia.csv',
        '../data/raw_data/2020/covid19_cases_global.csv',
    )
)

## Start with Malaysia dataset for happiness ratings

In [3]:
# Preview the raw Malaysia frame exactly as loaded, before any column selection.
malaysia_hap

Unnamed: 0,RecordNo,endtime,qweek,i1_health,i2_health,i7a_health,i5a_health,i6_health,i8_health,i9_health,...,w5_99,w5_98,w6,w7,w9_1,w9_2,w9_3,w9_4,w9_5,disability
0,0,09/04/2020 10:48,week 1,0,0,0,,,,Yes,...,,,,,,,,,,
1,1,09/04/2020 10:49,week 1,0,5,1,,,,Yes,...,,,,,,,,,,
2,2,09/04/2020 10:51,week 1,4,3,1,,,,Yes,...,,,,,,,,,,
3,3,09/04/2020 10:55,week 1,8,5,1,No,Frequently,No,Yes,...,,,,,,,,,,
4,4,09/04/2020 10:57,week 1,6,0,2,,,,Yes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12128,12128,24/09/2020 01:50,week 12,8,100,2,Yes,Sometimes,,,...,,,Yes,3,Some of the time,Some of the time,Some of the time,Some of the time,Some of the time,"Yes, limited a little"
12129,12129,24/09/2020 01:54,week 12,8,8,2,,,,,...,,,No,4,At no time,Some of the time,Some of the time,At no time,At no time,No
12130,12130,24/09/2020 02:01,week 12,3,2,3,No,Rarely,,,...,,,Yes,5 Agree strongly,Most of the time,Most of the time,Most of the time,Most of the time,Most of the time,"Yes, limited a little"
12131,12131,24/09/2020 02:11,week 12,7,1,0,,,,,...,,,Don't Know,2,Most of the time,More than half of the time,Most of the time,More than half of the time,Most of the time,Prefer not to say / skipped


## Selecting and renaming columns relevant for this data analysis

In [4]:
# Keep only the raw columns of interest; the order listed here is the column
# order of the resulting frame.
malaysia_hap = malaysia_hap[[
    'RecordNo', 'endtime', 'qweek',
    'i3_health', 'd1_health_12',
    'age', 'gender',
    'CORE_B2_4', 'cantril_ladder',
    'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4',
    'r1_1', 'r1_7',
    'ox3_1', 'ox3_2', 'ox3_3', 'ox3_6', 'ox6',
    'w2', 'w3',
    'w4_1', 'w4_2', 'w4_7', 'w4_8', 'w4_9',
    'w9_1', 'w9_2',
]]

In [5]:
# Mapping from raw column code to the descriptive name used from here on.
# Columns not listed ('age', 'gender') keep their original names.
col_dict = {
    # record id and timing
    'RecordNo': 'record_no',
    'endtime': 'date',
    'qweek': 'week',
    # health items
    'i3_health': 'covid_test',
    'd1_health_12': 'mental_health_condition',
    # happiness items
    'CORE_B2_4': 'happiness_2wk_comparison',
    'cantril_ladder': 'happiness_rating',
    # PHQ4_* items
    'PHQ4_1': 'little_interest',
    'PHQ4_2': 'negative_feelings',
    'PHQ4_3': 'anxiety',
    'PHQ4_4': 'worry',
    # r1_* items
    'r1_1': 'perceptions_covid',
    'r1_7': 'covid_effect',
    # ox* items
    'ox3_1': 'social_norms',
    'ox3_2': 'social_expectations',
    'ox3_3': 'appropriate_behaviors',
    'ox3_6': 'follow_norms',
    # NOTE(review): 'ox5_2' is not among the columns selected just before this
    # cell, so rename skips this entry and no 'trust_gvt' column is produced.
    # Confirm whether 'ox5_2' was meant to be selected.
    'ox5_2': 'trust_gvt',
    'ox6': 'public_sector',
    # w* items
    'w2': 'community_belonging',
    'w3': 'belonging_change',
    'w4_1': 'spoken_mental',
    'w4_2': 'medication_mental',
    'w4_7': 'seek_mental',
    'w4_8': 'professional_mental',
    'w4_9': 'readings_mental',
    'w9_1': 'cheerful',
    'w9_2': 'relaxed',
}
malaysia_hap = malaysia_hap.rename(col_dict, axis='columns')

In [6]:
# Check the frame after column selection and renaming.
malaysia_hap

Unnamed: 0,record_no,date,week,covid_test,mental_health_condition,age,gender,happiness_2wk_comparison,happiness_rating,little_interest,...,public_sector,community_belonging,belonging_change,spoken_mental,medication_mental,seek_mental,professional_mental,readings_mental,cheerful,relaxed
0,0,09/04/2020 10:48,week 1,"No, I have not",No,20,Female,,,,...,,,,,,,,,,
1,1,09/04/2020 10:49,week 1,"No, I have not",No,29,Male,,,,...,,,,,,,,,,
2,2,09/04/2020 10:51,week 1,"No, I have not",No,56,Male,,,,...,,,,,,,,,,
3,3,09/04/2020 10:55,week 1,"Yes, and I tested negative",No,19,Male,,,,...,,,,,,,,,,
4,4,09/04/2020 10:57,week 1,"No, I have not",No,37,Male,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12128,12128,24/09/2020 01:50,week 12,"No, I have not",No,19,Male,,5,Several days,...,,Somewhat weak,Increased,No,No,Yes,No,Yes,Some of the time,Some of the time
12129,12129,24/09/2020 01:54,week 12,,No,24,Male,,5,Prefer not to say,...,,Somewhat strong,Increased,No,No,No,No,No,At no time,Some of the time
12130,12130,24/09/2020 02:01,week 12,"Yes, and I tested negative",No,21,Male,,9,Several days,...,,Very strong,Increased,No,Yes,No,No,No,Most of the time,Most of the time
12131,12131,24/09/2020 02:11,week 12,"No, I have not",No,20,Male,,8,Several days,...,,Very weak,No change,No,No,No,No,No,Most of the time,More than half of the time


## Filtering columns further 

In [7]:
# Columns carried forward into the cleaned happiness file.
cols_filter = ['date', 'week', 'mental_health_condition', 'age', 'gender', 'happiness_rating']

# Take an explicit copy rather than a slice of malaysia_hap: this frame is
# modified by later cells, and modifying a slice is the chained-assignment
# pattern pandas warns about (the warning is silenced globally in this notebook).
malaysia_hap_filter = malaysia_hap[cols_filter].copy()
malaysia_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
0,09/04/2020 10:48,week 1,No,20,Female,
1,09/04/2020 10:49,week 1,No,29,Male,
2,09/04/2020 10:51,week 1,No,56,Male,
3,09/04/2020 10:55,week 1,No,19,Male,
4,09/04/2020 10:57,week 1,No,37,Male,
...,...,...,...,...,...,...
12128,24/09/2020 01:50,week 12,No,19,Male,5
12129,24/09/2020 01:54,week 12,No,24,Male,5
12130,24/09/2020 02:01,week 12,No,21,Male,9
12131,24/09/2020 02:11,week 12,No,20,Male,8


In [8]:
## No happiness rating data is available for week 1, so keep only the rows from the other weeks.
# A boolean-mask filter is used instead of an in-place drop: it keeps the same
# rows but returns a new, independent frame, so the column assignments that
# follow never operate on a slice of another DataFrame.
malaysia_hap_filter = malaysia_hap_filter.loc[malaysia_hap_filter['week'] != 'week 1'].copy()

In [9]:
# The raw timestamps are day-first strings (e.g. '24/09/2020 01:50'). Parse them
# with dayfirst=True: the default month-first interpretation can swap day and
# month for any date whose day is 12 or less. Only the calendar date is kept;
# the time of day is discarded.
malaysia_hap_filter['date'] = pd.to_datetime(malaysia_hap_filter['date'], dayfirst=True).dt.date

In [10]:
# Cast the rating column to float; every other column keeps its current dtype.
malaysia_hap_filter = malaysia_hap_filter.astype({'happiness_rating': float})

In [11]:
# Inspect the cleaned frame: week 1 rows removed, dates parsed, ratings as floats.
malaysia_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
1016,2020-04-28,week 2,No,33,Female,10.0
1017,2020-04-28,week 2,No,28,Male,3.0
1018,2020-04-28,week 2,No,37,Male,1.0
1019,2020-04-28,week 2,No,37,Female,8.0
1020,2020-04-28,week 2,No,25,Female,5.0
...,...,...,...,...,...,...
12128,2020-09-24,week 12,No,19,Male,5.0
12129,2020-09-24,week 12,No,24,Male,5.0
12130,2020-09-24,week 12,No,21,Male,9.0
12131,2020-09-24,week 12,No,20,Male,8.0


## Export the cleaned happiness data to `malaysia_hap_cleaned.csv`

In [12]:
# Write the cleaned happiness frame to disk; the row index is written as the first column.
malaysia_hap_filter.to_csv(path_or_buf='../data/cleaned_data/2020/malaysia_hap_cleaned.csv')

## Now looking at COVID-19 cases data

In [13]:
# Preview the global case table: one row per country/region, one column per date.
covid_global

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,45174,45384,45600,45723,45844,46116,46274,46717,46980,46718
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,34300,34944,35600,36245,36790,37625,38182,39014,39719,40501
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,77000,78025,79110,80168,81212,82221,83199,84152,85084,85927
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6351,6428,6534,6610,6610,6712,6745,6790,6842,6904
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,14742,14821,14920,15008,15087,15103,15139,15251,15319,15361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,75007,76727,78493,80429,81890,83585,85647,88004,90192,92708
267,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2114,2124,2137,2148,2160,2177,2191,2197,2217,2239
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17466,17535,17553,17569,17589,17608,17647,17665,17700,17730


## Selecting the rows that correspond to Malaysia and cleaning the DataFrame

In [14]:
# Boolean-mask the global table down to the row(s) whose country is Malaysia.
malaysia_cov = covid_global[covid_global['Country/Region'].eq('Malaysia')]
malaysia_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
173,,Malaysia,4.210484,101.975766,0,0,0,3,4,4,...,58847,59817,60752,61861,63176,64485,65697,67169,68020,69095


In [15]:
# The first four columns (Province/State, Country/Region, Lat, Long) identify the
# location; every column after them is a per-date column.
date_cols = malaysia_cov.columns[4:]
date_cols

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/24/20', '11/25/20', '11/26/20', '11/27/20', '11/28/20', '11/29/20',
       '11/30/20', '12/1/20', '12/2/20', '12/3/20'],
      dtype='object', length=317)

In [16]:
# Reshape from wide to long: the first four columns are kept as identifiers and
# each date column becomes one row, with its label in 'date' and its value in 'cases'.
malaysia_cov = pd.melt(
    malaysia_cov,
    id_vars=malaysia_cov.columns[:4],
    value_vars=date_cols,
    var_name='date',
    value_name='cases',
)
malaysia_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,cases
0,,Malaysia,4.210484,101.975766,1/22/20,0
1,,Malaysia,4.210484,101.975766,1/23/20,0
2,,Malaysia,4.210484,101.975766,1/24/20,0
3,,Malaysia,4.210484,101.975766,1/25/20,3
4,,Malaysia,4.210484,101.975766,1/26/20,4
...,...,...,...,...,...,...
312,,Malaysia,4.210484,101.975766,11/29/20,64485
313,,Malaysia,4.210484,101.975766,11/30/20,65697
314,,Malaysia,4.210484,101.975766,12/1/20,67169
315,,Malaysia,4.210484,101.975766,12/2/20,68020


In [17]:
# Convert the date labels (strings such as '1/22/20') into pandas datetimes.
malaysia_cov = malaysia_cov.assign(date=pd.to_datetime(malaysia_cov['date']))

## Export the cleaned COVID-19 data to `malaysia_covid.csv`

In [18]:
# Write the long-format Malaysia case table to disk; the row index is written as the first column.
malaysia_cov.to_csv(path_or_buf='../data/cleaned_data/2020/malaysia_covid.csv')