# 2020 Saudi Arabia Data Exploration and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns

import warnings
warnings.simplefilter('ignore')
import statsmodels.formula.api as smf

## Purpose of this notebook

- Take an initial look at, and clean, the raw data files for 2020 happiness ratings in Saudi Arabia and COVID-19 cases in Saudi Arabia
- create cleaned versions of files for analysis notebook

## Load datasets into dataframes

In [2]:
# Raw inputs: the Saudi Arabia survey responses and the global COVID-19 case counts.
sa_hap = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/saudi_arabia.csv')
covid_global = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/covid19_cases_global.csv')

## Start with Saudi Arabia dataset for happiness ratings

In [3]:
sa_hap

Unnamed: 0,RecordNo,endtime,age,qweek,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,...,w5_99,w5_98,w6,w7,w9_1,w9_2,w9_3,w9_4,w9_5,disability
0,0,08/04/2020 15:48,28,week 1,0,1,1,"No, I have not","No, they have not",No,...,,,,,,,,,,
1,1,08/04/2020 15:50,36,week 1,0,0,1,"No, I have not","No, they have not",No,...,,,,,,,,,,
2,2,08/04/2020 15:49,35,week 1,5,4,3,"Yes, and I tested positive","Yes, and they tested positive",No,...,,,,,,,,,,
3,3,08/04/2020 15:50,26,week 1,2,1,1,"No, I have not","No, they have not",No,...,,,,,,,,,,
4,4,08/04/2020 15:52,52,week 1,7,4,1,"No, I have not","No, they have not",No,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11445,11445,29/09/2020 09:59,22,week 12,9,7,1,"Yes, and I tested positive","Yes, and they tested negative",No,...,,,Yes,3,More than half of the time,Some of the time,All the time,Less than half of the time,At no time,"Yes, limited a little"
11446,11446,29/09/2020 10:31,19,week 12,6,5,0,"No, I have not","No, they have not",No,...,,,Don't Know,3,Some of the time,More than half of the time,Some of the time,Some of the time,Some of the time,No
11447,11447,29/09/2020 10:38,55,week 12,4,0,1,"Yes, and I tested negative","Yes, and they tested negative",No,...,,,Yes,4,Most of the time,More than half of the time,More than half of the time,Most of the time,All the time,No
11448,11448,29/09/2020 16:15,48,week 12,8,12,0,"No, I have not","No, they have not",No,...,No,No,Yes,4,Some of the time,Some of the time,Some of the time,At no time,At no time,No


### Observations

The dataset is large: 11450 rows and 300 columns. Most of the columns are not of interest for this analysis, and the column names have to be looked up in a codebook to be interpreted.

## Selecting and renaming columns relevant for this data analysis

In [4]:
# Keep only the raw survey columns used in this analysis (raw codebook names).
sa_hap = sa_hap.loc[:, [
    'RecordNo', 'endtime', 'qweek', 'i3_health', 'd1_health_12', 'age', 'gender',
    'CORE_B2_4', 'cantril_ladder',
    'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4',
    'r1_1', 'r1_7',
    'ox3_1', 'ox3_2', 'ox3_3', 'ox3_6', 'ox6',
    'w2', 'w3',
    'w4_1', 'w4_2', 'w4_7', 'w4_8', 'w4_9',
    'w9_1', 'w9_2',
]]

In [5]:
# Mapping from raw codebook column names to readable names used in the rest
# of the notebook.
col_dict = {
    # record metadata
    'RecordNo': 'record_no',
    'endtime': 'date',
    'qweek': 'week',
    # health-related items
    'i3_health': 'covid_test',
    'd1_health_12': 'mental_health_condition',
    # happiness items
    'CORE_B2_4': 'happiness_2wk_comparison',
    'cantril_ladder': 'happiness_rating',
    # PHQ4_* items
    'PHQ4_1': 'little_interest',
    'PHQ4_2': 'negative_feelings',
    'PHQ4_3': 'anxiety',
    'PHQ4_4': 'worry',
    # r1_* items
    'r1_1': 'perceptions_covid',
    'r1_7': 'covid_effect',
    # ox* items
    'ox3_1': 'social_norms',
    'ox3_2': 'social_expectations',
    'ox3_3': 'appropriate_behaviors',
    'ox3_6': 'follow_norms',
    # NOTE(review): 'ox5_2' is not among the columns selected in the previous
    # cell, so this entry currently has no effect (DataFrame.rename ignores
    # keys that are not present by default). Confirm whether 'ox5_2' should
    # be selected as well.
    'ox5_2': 'trust_gvt',
    'ox6': 'public_sector',
    # w* items
    'w2': 'community_belonging',
    'w3': 'belonging_change',
    'w4_1': 'spoken_mental',
    'w4_2': 'medication_mental',
    'w4_7': 'seek_mental',
    'w4_8': 'professional_mental',
    'w4_9': 'readings_mental',
    'w9_1': 'cheerful',
    'w9_2': 'relaxed',
}
sa_hap = sa_hap.rename(columns=col_dict)

In [6]:
sa_hap

Unnamed: 0,record_no,date,week,covid_test,mental_health_condition,age,gender,happiness_2wk_comparison,happiness_rating,little_interest,...,public_sector,community_belonging,belonging_change,spoken_mental,medication_mental,seek_mental,professional_mental,readings_mental,cheerful,relaxed
0,0,08/04/2020 15:48,week 1,"No, I have not",No,28,Male,,,,...,,,,,,,,,,
1,1,08/04/2020 15:50,week 1,"No, I have not",No,36,Male,,,,...,,,,,,,,,,
2,2,08/04/2020 15:49,week 1,"Yes, and I tested positive",No,35,Male,,,,...,,,,,,,,,,
3,3,08/04/2020 15:50,week 1,"No, I have not",No,26,Female,,,,...,,,,,,,,,,
4,4,08/04/2020 15:52,week 1,"No, I have not",No,52,Male,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11445,11445,29/09/2020 09:59,week 12,"Yes, and I tested positive",No,22,Female,,7,Several days,...,,Somewhat weak,Increased,No,No,No,No,No,More than half of the time,Some of the time
11446,11446,29/09/2020 10:31,week 12,"No, I have not",No,19,Female,,5,Nearly every day,...,,Very weak,No change,No,No,No,No,Yes,Some of the time,More than half of the time
11447,11447,29/09/2020 10:38,week 12,"Yes, and I tested negative",No,55,Female,,7,Not at all,...,,Very strong,No change,No,No,No,No,No,Most of the time,More than half of the time
11448,11448,29/09/2020 16:15,week 12,"No, I have not",No,48,Female,,4,Several days,...,,Somewhat weak,Decreased,Yes,Yes,No,No,No,Some of the time,Some of the time


## Filtering columns further 

In [7]:
# Narrow the frame to the columns needed for the happiness analysis.
cols_filter = ['date', 'week', 'mental_health_condition', 'age', 'gender', 'happiness_rating']

# .copy() makes sa_hap_filter an independent frame: the following cells modify
# it (row drops, column reassignment), and doing that on a slice of sa_hap
# triggers pandas' SettingWithCopyWarning, which the global warnings filter
# in this notebook would otherwise hide.
sa_hap_filter = sa_hap[cols_filter].copy()
sa_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
0,08/04/2020 15:48,week 1,No,28,Male,
1,08/04/2020 15:50,week 1,No,36,Male,
2,08/04/2020 15:49,week 1,No,35,Male,
3,08/04/2020 15:50,week 1,No,26,Female,
4,08/04/2020 15:52,week 1,No,52,Male,
...,...,...,...,...,...,...
11445,29/09/2020 09:59,week 12,No,22,Female,7
11446,29/09/2020 10:31,week 12,No,19,Female,5
11447,29/09/2020 10:38,week 12,No,55,Female,7
11448,29/09/2020 16:15,week 12,No,48,Female,4


In [8]:
## Since no happiness rating data available for week 1, drop rows corresponding to this timeframe
# Boolean filtering + rebinding (instead of drop(..., inplace=True)) keeps the
# same rows while avoiding in-place mutation of a frame that may be a slice
# of sa_hap; the cell also stays safe to re-run.
sa_hap_filter = sa_hap_filter.loc[sa_hap_filter['week'] != 'week 1'].copy()

In [9]:
# The survey timestamps are written day-first (e.g. '29/09/2020 09:59').
# Without dayfirst=True, ambiguous values such as '08/04/2020' are read
# month-first (4 August instead of 8 April), so the parse must be told the
# day comes first. The time-of-day is then discarded, keeping only the
# calendar date.
sa_hap_filter = sa_hap_filter.assign(
    date=pd.to_datetime(sa_hap_filter['date'], dayfirst=True).dt.date
)

In [10]:
# Store the happiness rating as a float column.
sa_hap_filter = sa_hap_filter.astype({'happiness_rating': float})

In [11]:
sa_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
940,2020-04-28,week 2,No,30,Male,7.0
941,2020-04-28,week 2,No,40,Female,10.0
942,2020-04-28,week 2,No,53,Male,6.0
943,2020-04-28,week 2,No,35,Male,7.0
944,2020-04-28,week 2,No,33,Male,6.0
...,...,...,...,...,...,...
11445,2020-09-29,week 12,No,22,Female,7.0
11446,2020-09-29,week 12,No,19,Female,5.0
11447,2020-09-29,week 12,No,55,Female,7.0
11448,2020-09-29,week 12,No,48,Female,4.0


## Exported to clean csv titled saudi_arabia_hap_cleaned.csv

In [12]:
# Write the cleaned survey frame to disk for the analysis notebook.
# The DataFrame index is written as the first column (to_csv default).
sa_hap_filter.to_csv(path_or_buf='../data/cleaned_data/2020/saudi_arabia_hap_cleaned.csv')

## Now looking at COVID-19 cases data

In [13]:
covid_global

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,45174,45384,45600,45723,45844,46116,46274,46717,46980,46718
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,34300,34944,35600,36245,36790,37625,38182,39014,39719,40501
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,77000,78025,79110,80168,81212,82221,83199,84152,85084,85927
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6351,6428,6534,6610,6610,6712,6745,6790,6842,6904
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,14742,14821,14920,15008,15087,15103,15139,15251,15319,15361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,75007,76727,78493,80429,81890,83585,85647,88004,90192,92708
267,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2114,2124,2137,2148,2160,2177,2191,2197,2217,2239
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17466,17535,17553,17569,17589,17608,17647,17665,17700,17730


## Selecting rows that correspond to Saudi Arabia and cleaning df

In [14]:
# Keep only the row(s) of the global case table whose country is Saudi Arabia.
sa_cov = covid_global[covid_global['Country/Region'].eq('Saudi Arabia')]
sa_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
218,,Saudi Arabia,23.885942,45.079162,0,0,0,0,0,0,...,355741,356067,356389,356691,356911,357128,357360,357623,357872,358102


In [15]:
# Everything after the first four columns is one column per calendar date.
date_cols = sa_cov.iloc[:, 4:].columns
date_cols

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/24/20', '11/25/20', '11/26/20', '11/27/20', '11/28/20', '11/29/20',
       '11/30/20', '12/1/20', '12/2/20', '12/3/20'],
      dtype='object', length=317)

In [16]:
# Reshape from wide (one column per date) to long (one row per date):
# the first four columns are repeated on every row, the former column labels
# go into 'date' and the cell values into 'cases'.
sa_cov = pd.melt(
    sa_cov,
    id_vars=sa_cov.columns[:4],
    value_vars=date_cols,
    var_name='date',
    value_name='cases',
)
sa_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,cases
0,,Saudi Arabia,23.885942,45.079162,1/22/20,0
1,,Saudi Arabia,23.885942,45.079162,1/23/20,0
2,,Saudi Arabia,23.885942,45.079162,1/24/20,0
3,,Saudi Arabia,23.885942,45.079162,1/25/20,0
4,,Saudi Arabia,23.885942,45.079162,1/26/20,0
...,...,...,...,...,...,...
312,,Saudi Arabia,23.885942,45.079162,11/29/20,357128
313,,Saudi Arabia,23.885942,45.079162,11/30/20,357360
314,,Saudi Arabia,23.885942,45.079162,12/1/20,357623
315,,Saudi Arabia,23.885942,45.079162,12/2/20,357872


In [17]:
# The date labels come from the case table's column headers and are written
# month/day/two-digit-year (e.g. '1/22/20', '12/3/20'). Giving the format
# explicitly avoids relying on pandas' format inference and makes any
# unexpected label fail loudly instead of being silently misread.
sa_cov['date'] = pd.to_datetime(sa_cov['date'], format='%m/%d/%y')

## Exported to clean csv titled sa_covid.csv

In [18]:
# Write the long-format Saudi Arabia case counts to disk for the analysis notebook.
# The DataFrame index is written as the first column (to_csv default).
sa_cov.to_csv(path_or_buf='../data/cleaned_data/2020/sa_covid.csv')