# 2020 China Data Exploration and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session. This includes any
# warnings emitted by pandas, so problems it would normally flag will not be shown.
warnings.simplefilter('ignore')
import statsmodels.formula.api as smf

## Purpose of this notebook

- Initial cleaning and exploration of the raw data files: 2020 happiness ratings for China and COVID-19 case counts in China
- Create cleaned versions of these files for the analysis notebook

## Load datasets into dataframes

In [2]:
# Raw 2020 survey export for China (source of the happiness ratings).
china_hap = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/china.csv')
# Global COVID-19 case table: one row per country/province, one column per date.
covid_global = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/covid19_cases_global.csv')

## Start with China dataset for happiness ratings

In [3]:
# Preview the raw survey dataframe (the notebook truncates the rendered rows and columns).
china_hap

Unnamed: 0,RecordNo,endtime,qweek,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,...,w5_96,w5_99,w5_98,w7,w9_1,w9_2,w9_3,w9_4,w9_5,disability
0,0,02/04/2020 09:56,week 1,3,4,12,"No, I have not","No, they have not",Yes,No,...,,,,,,,,,,
1,1,02/04/2020 10:01,week 1,4,0,2,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
2,2,02/04/2020 10:18,week 1,3,2,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
3,3,02/04/2020 10:35,week 1,2,5,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
4,4,02/04/2020 10:41,week 1,2,0,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15980,15980,25/09/2020 11:05,week 16,0,4,2,"No, I have not","No, they have not",No,No,...,No,No,No,3,Less than half of the time,More than half of the time,More than half of the time,Less than half of the time,More than half of the time,我没有因此受到限制
15981,15981,25/09/2020 11:07,week 16,3,2,2,"No, I have not","Yes, and they tested negative",No,No,...,,,,5 Agree strongly,Most of the time,More than half of the time,More than half of the time,Most of the time,More than half of the time,我没有因此受到限制
15982,15982,25/09/2020 11:13,week 16,9,9,9,"No, I have not","No, they have not",No,No,...,,,,5 Agree strongly,All the time,All the time,All the time,All the time,All the time,"有的,限制很多"
15983,15983,25/09/2020 11:18,week 16,3,7,3,"Yes, and I tested negative","Yes, and they tested negative",Yes,No,...,,,,Don't know,Most of the time,Most of the time,Most of the time,Most of the time,Most of the time,我没有因此受到限制


## Selecting and renaming columns relevant for this data analysis

In [4]:
# Keep only the survey fields used in this analysis; all other columns are dropped.
china_hap = china_hap[[
    # record identifier and timing
    'RecordNo', 'endtime', 'qweek',
    # health status and demographics
    'i3_health', 'd1_health_12', 'age', 'gender',
    # happiness measures
    'CORE_B2_4', 'cantril_ladder',
    # PHQ4_* items
    'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4',
    # r1_* items
    'r1_1', 'r1_7',
    # w4_* items
    'w4_1', 'w4_2', 'w4_7', 'w4_8', 'w4_9',
    # w9_* items
    'w9_1', 'w9_2',
]]

In [5]:
# Mapping from raw survey column codes to descriptive snake_case names.
col_dict = dict(
    # record identifier and timing
    RecordNo='record_no',
    endtime='date',
    qweek='week',
    # health status
    i3_health='covid_test',
    d1_health_12='mental_health_condition',
    # happiness measures
    CORE_B2_4='happiness_2wk_comparison',
    cantril_ladder='happiness_rating',
    # PHQ4_* items
    PHQ4_1='little_interest',
    PHQ4_2='negative_feelings',
    PHQ4_3='anxiety',
    PHQ4_4='worry',
    # r1_* items
    r1_1='perceptions_covid',
    r1_7='covid_effect',
    # ox* items
    ox3_1='social_norms',
    ox3_6='follow_norms',
    ox5_2='trust_gvt',
    ox6='public_sector',
    # w2 / w3 items
    w2='community_belonging',
    w3='belonging_change',
    # w4_* items
    w4_1='spoken_mental',
    w4_2='medication_mental',
    w4_7='seek_mental',
    w4_8='professional_mental',
    w4_9='readings_mental',
    # w9_* items
    w9_1='cheerful',
    w9_2='relaxed',
)
# rename() leaves unmatched mapping keys alone, so entries for columns that are
# not present in china_hap have no effect; unmapped columns keep their names.
china_hap = china_hap.rename(columns=col_dict)

In [6]:
# Display the frame again to check the selected and renamed columns.
china_hap

Unnamed: 0,record_no,date,week,covid_test,mental_health_condition,age,gender,happiness_2wk_comparison,happiness_rating,little_interest,...,worry,perceptions_covid,covid_effect,spoken_mental,medication_mental,seek_mental,professional_mental,readings_mental,cheerful,relaxed
0,0,02/04/2020 09:56,week 1,"No, I have not",No,35,Male,,,,...,,,,,,,,,,
1,1,02/04/2020 10:01,week 1,"No, I have not",No,53,Male,,,,...,,,,,,,,,,
2,2,02/04/2020 10:18,week 1,"No, I have not",No,21,Female,,,,...,,,,,,,,,,
3,3,02/04/2020 10:35,week 1,"No, I have not",No,39,Female,,,,...,,,,,,,,,,
4,4,02/04/2020 10:41,week 1,"No, I have not",No,57,Female,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15980,15980,25/09/2020 11:05,week 16,"No, I have not",Yes,22,Male,,4,Nearly every day,...,Nearly every day,3,5,Yes,No,No,No,Yes,Less than half of the time,More than half of the time
15981,15981,25/09/2020 11:07,week 16,"No, I have not",No,30,Male,,6,More than half the days,...,Several days,6,6,No,No,No,Yes,No,Most of the time,More than half of the time
15982,15982,25/09/2020 11:13,week 16,"No, I have not",No,25,Male,,9,Nearly every day,...,Nearly every day,7 - Agree,7 - Agree,No,No,No,No,Yes,All the time,All the time
15983,15983,25/09/2020 11:18,week 16,"Yes, and I tested negative",No,31,Male,,7,Several days,...,Not at all,5,6,No,No,No,No,Yes,Most of the time,Most of the time


## Filtering columns further 

In [7]:
# Narrow the frame to the fields needed for the happiness analysis.
cols_filter = [
    'date',
    'week',
    'mental_health_condition',
    'age',
    'gender',
    'happiness_rating',
]
china_hap_filter = china_hap.loc[:, cols_filter]
china_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
0,02/04/2020 09:56,week 1,No,35,Male,
1,02/04/2020 10:01,week 1,No,53,Male,
2,02/04/2020 10:18,week 1,No,21,Female,
3,02/04/2020 10:35,week 1,No,39,Female,
4,02/04/2020 10:41,week 1,No,57,Female,
...,...,...,...,...,...,...
15980,25/09/2020 11:05,week 16,Yes,22,Male,4
15981,25/09/2020 11:07,week 16,No,30,Male,6
15982,25/09/2020 11:13,week 16,No,25,Male,9
15983,25/09/2020 11:18,week 16,No,31,Male,7


In [8]:
## Weeks 1-3 have no happiness rating data, so keep only the rows outside that timeframe
# (row labels are unique here, so masking keeps exactly the rows a label-based drop would keep)
china_hap_filter = china_hap_filter[~china_hap_filter['week'].isin(['week 1', 'week 2', 'week 3'])]

In [9]:
# The survey timestamps are written day-first (e.g. "25/09/2020 11:05"), so they
# must be parsed with dayfirst=True. pandas reads ambiguous strings month-first
# by default, which would turn "02/04/2020" into 4 February instead of 2 April.
# Only the calendar date is kept; the time of day is discarded.
# assign() builds a new frame rather than writing into one derived from china_hap.
china_hap_filter = china_hap_filter.assign(
    date=pd.to_datetime(china_hap_filter['date'], dayfirst=True).dt.date
)

## Export the cleaned data to `china_hap_cleaned.csv`

In [10]:
# Write the cleaned survey frame to disk.
# index=True is the pandas default: the row index is written as the first CSV column.
china_hap_filter.to_csv('../data/cleaned_data/2020/china_hap_cleaned.csv', index=True)

## Now looking at COVID-19 cases data

In [11]:
# Preview the global case table: location columns followed by one column per date.
covid_global

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,45174,45384,45600,45723,45844,46116,46274,46717,46980,46718
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,34300,34944,35600,36245,36790,37625,38182,39014,39719,40501
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,77000,78025,79110,80168,81212,82221,83199,84152,85084,85927
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6351,6428,6534,6610,6610,6712,6745,6790,6842,6904
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,14742,14821,14920,15008,15087,15103,15139,15251,15319,15361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,75007,76727,78493,80429,81890,83585,85647,88004,90192,92708
267,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2114,2124,2137,2148,2160,2177,2191,2197,2217,2239
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17466,17535,17553,17569,17589,17608,17647,17665,17700,17730


## Selecting the rows for China and cleaning the dataframe

In [12]:
# Keep only the rows whose Country/Region is China (one row per province).
china_cov = covid_global[covid_global['Country/Region'].eq('China')]
china_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
58,Anhui,China,31.8257,117.2264,1,9,15,39,60,70,...,992,992,992,992,992,992,992,992,992,992
59,Beijing,China,40.1824,116.4142,14,22,36,41,68,80,...,950,950,950,950,950,950,951,951,951,951
60,Chongqing,China,30.0572,107.874,6,9,27,57,75,110,...,590,590,590,590,590,590,590,590,590,590
61,Fujian,China,26.0789,117.9874,1,5,10,18,35,59,...,479,480,481,483,487,487,489,490,490,492
62,Gansu,China,35.7518,104.2861,0,2,2,4,7,14,...,181,181,181,181,182,182,182,182,182,182
63,Guangdong,China,23.3417,113.4244,26,32,53,78,111,151,...,1984,1988,1988,1988,1988,1989,1989,1992,1996,1997
64,Guangxi,China,23.8298,108.7881,2,5,23,23,36,46,...,263,263,263,263,263,263,263,263,263,263
65,Guizhou,China,26.8154,106.8748,1,3,3,4,5,7,...,147,147,147,147,147,147,147,147,147,147
66,Hainan,China,19.1959,109.7453,4,5,8,19,22,33,...,171,171,171,171,171,171,171,171,171,171
67,Hebei,China,39.549,116.1306,1,1,2,8,13,18,...,373,373,373,373,373,373,373,373,373,373


In [13]:
# The first four columns are Province/State, Country/Region, Lat and Long;
# every column after them is a date label.
date_cols = china_cov.iloc[:, 4:].columns
date_cols

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/24/20', '11/25/20', '11/26/20', '11/27/20', '11/28/20', '11/29/20',
       '11/30/20', '12/1/20', '12/2/20', '12/3/20'],
      dtype='object', length=317)

In [14]:
# Reshape from wide to long: the four location columns stay as identifiers and
# each date column becomes a row, with its label in `date` and its value in `cases`.
china_cov = pd.melt(
    china_cov,
    id_vars=china_cov.columns[:4],
    value_vars=date_cols,
    var_name='date',
    value_name='cases',
)
china_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,cases
0,Anhui,China,31.8257,117.2264,1/22/20,1
1,Beijing,China,40.1824,116.4142,1/22/20,14
2,Chongqing,China,30.0572,107.8740,1/22/20,6
3,Fujian,China,26.0789,117.9874,1/22/20,1
4,Gansu,China,35.7518,104.2861,1/22/20,0
...,...,...,...,...,...,...
10456,Tianjin,China,39.3054,117.3230,12/3/20,300
10457,Tibet,China,31.6927,88.0924,12/3/20,1
10458,Xinjiang,China,41.1129,85.2401,12/3/20,980
10459,Yunnan,China,24.9740,101.4870,12/3/20,219


In [15]:
# Convert the date labels (strings such as '1/22/20') into datetime values.
china_cov = china_cov.assign(date=pd.to_datetime(china_cov['date']))

## Export the cleaned data to `china_covid.csv`

In [16]:
# Write the long-format China case table to disk.
# index=True is the pandas default: the row index is written as the first CSV column.
china_cov.to_csv('../data/cleaned_data/2020/china_covid.csv', index=True)