# 2020 Mexico Data Exploration and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns

import warnings
warnings.simplefilter('ignore')
import statsmodels.formula.api as smf

## Purpose of this notebook

- Take a first look at, and clean, the raw data files: 2020 happiness ratings for Mexico and COVID-19 case counts for Mexico
- Create cleaned versions of these files for the analysis notebook

## Load datasets into dataframes

In [2]:
# Read the two raw 2020 inputs: the Mexico happiness-ratings file and the
# global COVID-19 cases file.
mexico_hap = pd.read_csv(
    '../data/raw_data/2020/mexico.csv'
)
covid_global = pd.read_csv(
    '../data/raw_data/2020/covid19_cases_global.csv'
)

## Start with Mexico dataset for happiness ratings

In [3]:
# Show the raw Mexico frame to see which columns are available.
mexico_hap

Unnamed: 0,RecordNo,endtime,qweek,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,...,w5_96,w5_99,w6,w7,w9_1,w9_2,w9_3,w9_4,w9_5,disability
0,0,03/04/2020 14:30,week 1,3,15,2,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
1,1,03/04/2020 14:32,week 1,1,4,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
2,2,03/04/2020 14:35,week 1,3,4,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
3,3,03/04/2020 14:38,week 1,1,1,1,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
4,4,03/04/2020 14:38,week 1,1,0,0,"No, I have not","No, they have not",No,No,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12007,12007,24/09/2020 17:28,week 12,5,2,2,"Yes, and I have not received my results from t...","No, they have not",No,No,...,,,Yes,3,More than half of the time,More than half of the time,Less than half of the time,Less than half of the time,Most of the time,"Yes, limited a little"
12008,12008,24/09/2020 17:33,week 12,4,2,1,"No, I have not","No, they have not",No,No,...,No,No,Don't Know,5 Agree strongly,Most of the time,Most of the time,More than half of the time,Less than half of the time,Less than half of the time,No
12009,12009,24/09/2020 17:38,week 12,3,4,0,"No, I have not","Yes, and they tested negative",No,No,...,No,No,Don't Know,5 Agree strongly,More than half of the time,More than half of the time,Most of the time,Most of the time,Most of the time,No
12010,12010,24/09/2020 18:34,week 12,3,10,0,"No, I have not",Not sure,No,No,...,,,Don't Know,3,All the time,More than half of the time,More than half of the time,More than half of the time,More than half of the time,No


#### Observations

The columns need to be renamed, and I need to decide which of them will actually be used in the analysis.

## Selecting and renaming columns relevant for this data analysis

In [4]:
# Raw column names to carry forward; every other column is discarded.
keep_cols = [
    'RecordNo', 'endtime', 'qweek',
    'i3_health', 'd1_health_12',
    'age', 'gender',
    'CORE_B2_4', 'cantril_ladder',
    'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4',
    'r1_1', 'r1_7',
    'ox3_1', 'ox3_2', 'ox3_3', 'ox3_6',
    'ox6',
    'w2', 'w3',
    'w4_1', 'w4_2', 'w4_7', 'w4_8', 'w4_9',
    'w9_1', 'w9_2',
]
mexico_hap = mexico_hap[keep_cols]

In [5]:
# Mapping from raw column codes to readable snake_case names.
# Columns not listed here (e.g. 'age', 'gender') keep their original names.
col_dict = {
    # record metadata
    'RecordNo': 'record_no',
    'endtime': 'date',
    'qweek': 'week',
    # health questions
    'i3_health': 'covid_test',
    'd1_health_12': 'mental_health_condition',
    # happiness measures
    'CORE_B2_4': 'happiness_2wk_comparison',
    'cantril_ladder': 'happiness_rating',
    # PHQ4_* items
    'PHQ4_1': 'little_interest',
    'PHQ4_2': 'negative_feelings',
    'PHQ4_3': 'anxiety',
    'PHQ4_4': 'worry',
    # r1_* items
    'r1_1': 'perceptions_covid',
    'r1_7': 'covid_effect',
    # ox* items
    'ox3_1': 'social_norms',
    'ox3_2': 'social_expectations',
    'ox3_3': 'appropriate_behaviors',
    'ox3_6': 'follow_norms',
    # NOTE(review): 'ox5_2' is not among the columns selected in the previous
    # cell, so this entry currently renames nothing -- confirm whether the
    # column was meant to be kept.
    'ox5_2': 'trust_gvt',
    'ox6': 'public_sector',
    # w2 / w3 items
    'w2': 'community_belonging',
    'w3': 'belonging_change',
    # w4_* items
    'w4_1': 'spoken_mental',
    'w4_2': 'medication_mental',
    'w4_7': 'seek_mental',
    'w4_8': 'professional_mental',
    'w4_9': 'readings_mental',
    # w9_* items
    'w9_1': 'cheerful',
    'w9_2': 'relaxed',
}

# rename() returns a new frame; keys absent from the frame are ignored.
mexico_hap = mexico_hap.rename(columns=col_dict)

In [6]:
# Check the frame after column selection and renaming.
mexico_hap

Unnamed: 0,record_no,date,week,covid_test,mental_health_condition,age,gender,happiness_2wk_comparison,happiness_rating,little_interest,...,public_sector,community_belonging,belonging_change,spoken_mental,medication_mental,seek_mental,professional_mental,readings_mental,cheerful,relaxed
0,0,03/04/2020 14:30,week 1,"No, I have not",No,39,Male,,,,...,,,,,,,,,,
1,1,03/04/2020 14:32,week 1,"No, I have not",No,34,Female,,,,...,,,,,,,,,,
2,2,03/04/2020 14:35,week 1,"No, I have not",No,45,Female,,,,...,,,,,,,,,,
3,3,03/04/2020 14:38,week 1,"No, I have not",No,45,Male,,,,...,,,,,,,,,,
4,4,03/04/2020 14:38,week 1,"No, I have not",No,28,Female,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12007,12007,24/09/2020 17:28,week 12,"Yes, and I have not received my results from t...",No,20,Male,,8,More than half the days,...,,Somewhat strong,Increased,No,No,No,No,Yes,More than half of the time,More than half of the time
12008,12008,24/09/2020 17:33,week 12,"No, I have not",No,24,Male,,8,Not at all,...,,Somewhat weak,No change,Yes,No,No,No,Yes,Most of the time,Most of the time
12009,12009,24/09/2020 17:38,week 12,"No, I have not",No,64,Female,,8,Several days,...,,Very weak,Decreased,Yes,No,No,No,Yes,More than half of the time,More than half of the time
12010,12010,24/09/2020 18:34,week 12,"No, I have not",No,24,Male,,8,Not at all,...,,Very weak,No change,No,No,No,No,Yes,All the time,More than half of the time


## Filtering columns further 

In [7]:
# Keep only the fields needed for the happiness analysis.
cols_filter = [
    'date',
    'week',
    'mental_health_condition',
    'age',
    'gender',
    'happiness_rating',
]

# .copy() makes this an independent frame rather than a selection of
# mexico_hap, so the cleaning steps that follow never modify (or appear to
# modify) a view of the parent frame. Without it pandas raises
# SettingWithCopyWarning, which this notebook hides because warnings are
# silenced in the imports cell.
mexico_hap_filter = mexico_hap[cols_filter].copy()
mexico_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
0,03/04/2020 14:30,week 1,No,39,Male,
1,03/04/2020 14:32,week 1,No,34,Female,
2,03/04/2020 14:35,week 1,No,45,Female,
3,03/04/2020 14:38,week 1,No,45,Male,
4,03/04/2020 14:38,week 1,No,28,Female,
...,...,...,...,...,...,...
12007,24/09/2020 17:28,week 12,No,20,Male,8
12008,24/09/2020 17:33,week 12,No,24,Male,8
12009,24/09/2020 17:38,week 12,No,64,Female,8
12010,24/09/2020 18:34,week 12,No,24,Male,8


In [8]:
## Since no happiness rating data is available for week 1, drop the rows for that week.
# A boolean-mask filter keeps exactly the rows the index-based drop kept, but
# does not mutate the existing frame in place (it may be a selection of
# mexico_hap); .copy() makes the result an independent frame.
not_week_1 = mexico_hap_filter['week'] != 'week 1'
mexico_hap_filter = mexico_hap_filter.loc[not_week_1].copy()

In [9]:
# The raw timestamps are day-first (e.g. '24/09/2020 17:28'). pandas defaults
# to month-first, so without dayfirst=True an ambiguous value such as
# '05/06/2020' can be read as 6 May instead of 5 June.
parsed_dates = pd.to_datetime(mexico_hap_filter['date'], dayfirst=True)

# Keep only the calendar date (time of day is dropped). assign() returns a new
# frame instead of writing into a frame that may be a selection of mexico_hap.
mexico_hap_filter = mexico_hap_filter.assign(date=parsed_dates.dt.date)

In [10]:
# Cast the happiness ratings to float; all other columns are left untouched.
mexico_hap_filter = mexico_hap_filter.astype({'happiness_rating': float})

In [11]:
# Check the cleaned frame: week 1 removed, dates parsed, ratings as floats.
mexico_hap_filter

Unnamed: 0,date,week,mental_health_condition,age,gender,happiness_rating
1004,2020-04-27,week 2,No,20,Female,5.0
1005,2020-04-27,week 2,No,29,Female,8.0
1006,2020-04-27,week 2,No,69,Male,7.0
1007,2020-04-27,week 2,No,28,Female,6.0
1008,2020-04-27,week 2,No,51,Male,7.0
...,...,...,...,...,...,...
12007,2020-09-24,week 12,No,20,Male,8.0
12008,2020-09-24,week 12,No,24,Male,8.0
12009,2020-09-24,week 12,No,64,Female,8.0
12010,2020-09-24,week 12,No,24,Male,8.0


## Export the cleaned data to `mexico_hap_cleaned.csv`

In [12]:
# Write the cleaned happiness frame to the cleaned-data folder for the analysis notebook.
# No `index` argument is passed, so the row index is written as the first CSV column.
mexico_hap_filter.to_csv('../data/cleaned_data/2020/mexico_hap_cleaned.csv')

## Now looking at COVID-19 cases data

In [13]:
# Show the raw global COVID-19 frame: one row per region, one column per date.
covid_global

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,45174,45384,45600,45723,45844,46116,46274,46717,46980,46718
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,34300,34944,35600,36245,36790,37625,38182,39014,39719,40501
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,77000,78025,79110,80168,81212,82221,83199,84152,85084,85927
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6351,6428,6534,6610,6610,6712,6745,6790,6842,6904
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,14742,14821,14920,15008,15087,15103,15139,15251,15319,15361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,75007,76727,78493,80429,81890,83585,85647,88004,90192,92708
267,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
268,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2114,2124,2137,2148,2160,2177,2191,2197,2217,2239
269,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,17466,17535,17553,17569,17589,17608,17647,17665,17700,17730


## Selecting rows that correspond to Mexico and cleaning df

In [14]:
# Keep only the row(s) whose Country/Region is Mexico.
is_mexico = covid_global['Country/Region'].eq('Mexico')
mexico_cov = covid_global.loc[is_mexico]
mexico_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20
180,,Mexico,23.6345,-102.5528,0,0,0,0,0,0,...,1060152,1070487,1078594,1078594,1101403,1107071,1113543,1122362,1133613,1144643


In [15]:
# The first four columns (Province/State, Country/Region, Lat, Long) identify
# the region; every column after them is a daily date label.
n_id_cols = 4
date_cols = mexico_cov.columns[n_id_cols:]
date_cols

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/24/20', '11/25/20', '11/26/20', '11/27/20', '11/28/20', '11/29/20',
       '11/30/20', '12/1/20', '12/2/20', '12/3/20'],
      dtype='object', length=317)

In [16]:
# Reshape from wide (one column per day) to long (one row per day): the four
# identifier columns are kept as-is, and each date column becomes a row with
# the column label in 'date' and its value in 'cases'.
id_cols = mexico_cov.columns[:4]
mexico_cov = pd.melt(
    mexico_cov,
    id_vars=id_cols,
    value_vars=date_cols,
    var_name='date',
    value_name='cases',
)
mexico_cov

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,cases
0,,Mexico,23.6345,-102.5528,1/22/20,0
1,,Mexico,23.6345,-102.5528,1/23/20,0
2,,Mexico,23.6345,-102.5528,1/24/20,0
3,,Mexico,23.6345,-102.5528,1/25/20,0
4,,Mexico,23.6345,-102.5528,1/26/20,0
...,...,...,...,...,...,...
312,,Mexico,23.6345,-102.5528,11/29/20,1107071
313,,Mexico,23.6345,-102.5528,11/30/20,1113543
314,,Mexico,23.6345,-102.5528,12/1/20,1122362
315,,Mexico,23.6345,-102.5528,12/2/20,1133613


In [17]:
# The date labels are month/day/two-digit-year (e.g. '1/22/20', '12/3/20').
# Passing the format explicitly avoids per-value format inference and makes
# pandas raise on any label that does not match instead of guessing.
mexico_cov['date'] = pd.to_datetime(mexico_cov['date'], format='%m/%d/%y')

## Export the cleaned data to `mexico_covid.csv`

In [18]:
# Write the long-format Mexico COVID-19 frame to the cleaned-data folder for the analysis notebook.
# No `index` argument is passed, so the row index is written as the first CSV column.
mexico_cov.to_csv('../data/cleaned_data/2020/mexico_covid.csv')