# 2020 India Data Exploration and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns

import warnings
warnings.simplefilter('ignore')
import statsmodels.formula.api as smf

## Purpose of this notebook

- Initial cleaning and inspection of the raw data files for 2020 happiness ratings in India and COVID-19 cases in India
- Create cleaned versions of these files for the analysis notebook

## Load datasets into dataframes - India

In [2]:
# Read the two raw 2020 inputs: the India happiness dataset and the global
# COVID-19 case table.
india_hap = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/india.csv')
covid_global = pd.read_csv(filepath_or_buffer='../data/raw_data/2020/covid19_cases_global.csv')

## Start with India dataset for happiness

In [3]:
# Inspect the raw happiness frame exactly as loaded from india.csv.
india_hap

Unnamed: 0,RecordNo,endtime,gender,state,qweek,i1_health,i2_health,i7a_health,i3_health,i4_health,...,w5_96,w5_99,w6,w7,w9_1,w9_2,w9_3,w9_4,w9_5,disability
0,12,01/04/2020 16:13,Female,Gujarat,week 1,3,2,0,"No, I have not","No, they have not",...,,,,,,,,,,
1,13,01/04/2020 16:26,Female,Karnataka,week 1,1,2,0,,,...,,,,,,,,,,
2,14,01/04/2020 16:29,Male,Karnataka,week 1,0,0,0,,,...,,,,,,,,,,
3,15,01/04/2020 16:30,Male,Maharashtra,week 1,2,0,0,"No, I have not","No, they have not",...,,,,,,,,,,
4,16,01/04/2020 16:30,Female,Gujarat,week 1,5,4,24,"No, I have not","No, they have not",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16140,16152,20/09/2020 12:49,Female,West Bengal,week 16,2,1,2,"No, I have not","No, they have not",...,No,No,Yes,4,Most of the time,More than half of the time,Most of the time,More than half of the time,More than half of the time,"Yes, limited a little"
16141,16153,20/09/2020 12:51,Female,Delhi,week 16,0,0,0,"No, I have not",,...,,,Yes,Don't know,Some of the time,Some of the time,Less than half of the time,Some of the time,More than half of the time,Prefer not to say / skipped
16142,16154,20/09/2020 13:29,Female,Assam,week 16,4,0,0,"Yes, and I tested negative","Yes, and they tested negative",...,,,Don't Know,5 Agree strongly,Less than half of the time,Some of the time,More than half of the time,Most of the time,At no time,No
16143,16155,20/09/2020 14:00,Female,Jharkhand,week 16,3,7,1,"No, I have not","No, they have not",...,,,No,1 Disagree strongly,All the time,All the time,All the time,All the time,Most of the time,No


## Selecting and renaming columns relevant for analysis

In [4]:
# Columns of the raw frame that are carried forward; every other column is
# left behind at this step.
selected_cols = [
    'RecordNo', 'endtime', 'qweek', 'i3_health', 'd1_health_12', 'age', 'gender',
    'WCRex2', 'CORE_B2_4', 'cantril_ladder',
    'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4',
    'WCRex1', 'r1_1', 'r1_7',
    'ox3_1', 'ox3_2', 'ox3_3', 'ox3_6', 'ox5_2', 'ox6',
    'w2', 'w3', 'w4_1', 'w4_2', 'w4_7', 'w4_8', 'w4_9',
    'w9_1', 'w9_2',
]
india_hap_df = india_hap[selected_cols]

In [5]:
# Mapping from the raw column codes to descriptive names.
# 'age' and 'gender' are not listed here, so they keep their original labels.
col_dict = {
    # record bookkeeping
    'RecordNo': 'record_no',
    'endtime': 'date',
    'qweek': 'week',
    # *_health items
    'i3_health': 'covid_test',
    'd1_health_12': 'mental_health_condition',
    # confidence / happiness items
    'WCRex2': 'confidence_NHS',
    'CORE_B2_4': 'happiness_2wk_comparison',
    'cantril_ladder': 'happiness_rating',
    # PHQ4_* items
    'PHQ4_1': 'little_interest',
    'PHQ4_2': 'negative_feelings',
    'PHQ4_3': 'anxiety',
    'PHQ4_4': 'worry',
    # WCRex1 and r1_* items
    'WCRex1': 'gvt_rating',
    'r1_1': 'perceptions_covid',
    'r1_7': 'covid_effect',
    # ox* items
    'ox3_1': 'social_norms',
    'ox3_2': 'social_expectations',
    'ox3_3': 'appropriate_behaviors',
    'ox3_6': 'follow_norms',
    'ox5_2': 'trust_gvt',
    'ox6': 'public_sector',
    # w2 / w3 / w4_* items
    'w2': 'community_belonging',
    'w3': 'belonging_change',
    'w4_1': 'spoken_mental',
    'w4_2': 'medication_mental',
    'w4_7': 'seek_mental',
    'w4_8': 'professional_mental',
    'w4_9': 'readings_mental',
    # w9_* items
    'w9_1': 'cheerful',
    'w9_2': 'relaxed',
}

# rename() returns a new frame; labels missing from the mapping are left as-is.
india_hap_df = india_hap_df.rename(col_dict, axis='columns')

In [6]:
# Check that the selected columns now carry the renamed labels.
india_hap_df

Unnamed: 0,record_no,date,week,covid_test,mental_health_condition,age,gender,confidence_NHS,happiness_2wk_comparison,happiness_rating,...,public_sector,community_belonging,belonging_change,spoken_mental,medication_mental,seek_mental,professional_mental,readings_mental,cheerful,relaxed
0,12,01/04/2020 16:13,week 1,"No, I have not",,37,Female,,,,...,,,,,,,,,,
1,13,01/04/2020 16:26,week 1,,,36,Female,,,,...,,,,,,,,,,
2,14,01/04/2020 16:29,week 1,,,26,Male,,,,...,,,,,,,,,,
3,15,01/04/2020 16:30,week 1,"No, I have not",No,28,Male,,,,...,,,,,,,,,,
4,16,01/04/2020 16:30,week 1,"No, I have not",No,34,Female,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16140,16152,20/09/2020 12:49,week 16,"No, I have not",No,55,Female,A fair amount of confidence,,8,...,,Somewhat strong,Increased,Yes,Yes,No,No,No,Most of the time,More than half of the time
16141,16153,20/09/2020 12:51,week 16,"No, I have not",No,48,Female,A fair amount of confidence,,8,...,,Somewhat strong,No change,No,No,No,No,Yes,Some of the time,Some of the time
16142,16154,20/09/2020 13:29,week 16,"Yes, and I tested negative",No,50,Female,A fair amount of confidence,,3,...,,Somewhat strong,No change,No,No,No,No,No,Less than half of the time,Some of the time
16143,16155,20/09/2020 14:00,week 16,"No, I have not",No,51,Female,A lot of confidence,,10,...,,Very strong,Increased,No,No,No,No,No,All the time,All the time


## Filtering columns

In [7]:
# Narrow the renamed frame down to the fields listed in cols_filter.
cols_filter = [
    'date',
    'week',
    'mental_health_condition',
    'age',
    'gender',
    'happiness_rating',
]
india_hap_filter = india_hap_df[cols_filter]
india_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
0,01/04/2020 16:13,week 1,,37,Female,
1,01/04/2020 16:26,week 1,,36,Female,
2,01/04/2020 16:29,week 1,,26,Male,
3,01/04/2020 16:30,week 1,No,28,Male,
4,01/04/2020 16:30,week 1,No,34,Female,
...,...,...,...,...,...,...
16140,20/09/2020 12:49,week 16,No,55,Female,8
16141,20/09/2020 12:51,week 16,No,48,Female,8
16142,20/09/2020 13:29,week 16,No,50,Female,3
16143,20/09/2020 14:00,week 16,No,51,Female,10


In [8]:
# Weeks 1-4 have no happiness rating data, so their rows are removed.
weeks_without_rating = ['week 1', 'week 2', 'week 3', 'week 4']
in_unrated_week = india_hap_filter['week'].isin(weeks_without_rating)
unrated_index = india_hap_filter.index[in_unrated_week]
india_hap_filter = india_hap_filter.drop(index=unrated_index)
india_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
4004,30/04/2020 16:38,week 5,No,34,Female,5
4005,30/04/2020 16:38,week 5,No,31,Female,5
4006,30/04/2020 16:43,week 5,No,27,Female,8
4007,30/04/2020 16:43,week 5,No,28,Female,5
4008,30/04/2020 16:44,week 5,No,30,Male,8
...,...,...,...,...,...,...
16140,20/09/2020 12:49,week 16,No,55,Female,8
16141,20/09/2020 12:51,week 16,No,48,Female,8
16142,20/09/2020 13:29,week 16,No,50,Female,3
16143,20/09/2020 14:00,week 16,No,51,Female,10


In [9]:
# The timestamps are day-first strings (e.g. '30/04/2020 16:38'), so they are
# parsed with dayfirst=True. With the month-first default, and depending on the
# pandas version, an ambiguous value such as '01/05/2020' can be read as
# 5 January instead of 1 May, silently corrupting the dates.
# Only the calendar date is kept (the time of day is discarded).
# assign() builds a new frame instead of writing a column into a frame that was
# derived from a slice of india_hap_df, and keeps the cell safe to re-run.
india_hap_filter = india_hap_filter.assign(
    date=pd.to_datetime(india_hap_filter['date'], dayfirst=True).dt.date
)

In [10]:
# Cast the ratings to float; the dict form of astype touches only the named
# column and leaves every other column as it is.
india_hap_filter = india_hap_filter.astype({'happiness_rating': float})

In [11]:
# Confirm the date and happiness_rating conversions from the previous cells.
india_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
4004,2020-04-30,week 5,No,34,Female,5.0
4005,2020-04-30,week 5,No,31,Female,5.0
4006,2020-04-30,week 5,No,27,Female,8.0
4007,2020-04-30,week 5,No,28,Female,5.0
4008,2020-04-30,week 5,No,30,Male,8.0
...,...,...,...,...,...,...
16140,2020-09-20,week 16,No,55,Female,8.0
16141,2020-09-20,week 16,No,48,Female,8.0
16142,2020-09-20,week 16,No,50,Female,3.0
16143,2020-09-20,week 16,No,51,Female,10.0


## Exported to clean csv titled india_hap_cleaned.csv

In [12]:
# Write the cleaned happiness frame to disk for the analysis notebook.
# The row index is written as well (to_csv default).
india_hap_filter.to_csv(path_or_buf='../data/cleaned_data/2020/india_hap_cleaned.csv')

## Now looking at COVID-19 cases data - India

In [13]:
# Inspect the global COVID-19 case table as loaded from covid19_cases_global.csv.
covid_global

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,45174,45384,45600,45723,45844,46116,46274,46717,46980,46718
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,34300,34944,35600,36245,36790,37625,38182,39014,39719,40501
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,77000,78025,79110,80168,81212,82221,83199,84152,85084,85927
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6351,6428,6534,6610,6610,6712,6745,6790,6842,6904
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,14742,14821,14920,15008,15087,15103,15139,15251,15319,15361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,75007,76727,78493,80429,81890,83585,85647,88004,90192,92708
267,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2114,2124,2137,2148,2160,2177,2191,2197,2217,2239
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17466,17535,17553,17569,17589,17608,17647,17665,17700,17730


## Selecting rows that correspond to India

In [14]:
# Boolean mask that is True only where the country label is exactly 'India'.
is_india = covid_global['Country/Region'].eq('India')
india_cov = covid_global.loc[is_india]
india_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
145,,India,20.593684,78.96288,0,0,0,0,0,0,...,9222216,9266705,9309787,9351109,9392919,9431691,9462809,9499413,9534964,9571559


In [15]:
# The first four columns (Province/State, Country/Region, Lat, Long) identify
# the row; every column after them is taken as a per-day column.
n_id_cols = 4
date_cols = india_cov.columns[n_id_cols:]
date_cols

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/24/20', '11/25/20', '11/26/20', '11/27/20', '11/28/20', '11/29/20',
       '11/30/20', '12/1/20', '12/2/20', '12/3/20'],
      dtype='object', length=317)

In [16]:
# Reshape wide -> long: the four leading columns stay as identifiers, and each
# per-day column becomes one row holding its label ('date') and value ('cases').
id_cols = india_cov.columns[:4]
india_cov = pd.melt(
    india_cov,
    id_vars=id_cols,
    value_vars=date_cols,
    var_name='date',
    value_name='cases',
)

In [17]:
# Inspect the melted frame: one row per date column, with its value in 'cases'.
india_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,cases
0,,India,20.593684,78.96288,1/22/20,0
1,,India,20.593684,78.96288,1/23/20,0
2,,India,20.593684,78.96288,1/24/20,0
3,,India,20.593684,78.96288,1/25/20,0
4,,India,20.593684,78.96288,1/26/20,0
...,...,...,...,...,...,...
312,,India,20.593684,78.96288,11/29/20,9431691
313,,India,20.593684,78.96288,11/30/20,9462809
314,,India,20.593684,78.96288,12/1/20,9499413
315,,India,20.593684,78.96288,12/2/20,9534964


In [18]:
# Convert the date labels (e.g. '1/22/20') from strings into pandas datetimes.
parsed_dates = pd.to_datetime(india_cov['date'])
india_cov = india_cov.assign(date=parsed_dates)

## Exported to clean csv titled india_covid.csv

In [19]:
# Write the long-format India case data to disk for the analysis notebook.
# The row index is written as well (to_csv default).
india_cov.to_csv(path_or_buf='../data/cleaned_data/2020/india_covid.csv')