## 15-minute Data Cleaning Exercise

In breakout rooms, inspect and clean the full ```PHHR71FL-labeled.csv``` dataset. Answer the following questions, and discuss your findings with the rest of your group.

- Clean the dataset. What are the steps you took to clean your data?
- What is the final shape of your clean dataset?
- Save the clean dataset as ```PHHR71FL-labeled-clean.csv```.

In [1]:
import pandas as pd
import numpy as np
import json

# Notebook-wide pandas display settings.
pd.options.display.max_columns = 100  # show up to 100 columns before truncating
pd.options.display.max_rows = None  # None removes the row limit on displayed output
pd.options.display.float_format = '{:,.4f}'.format  # thousands separator, 4 decimals

In [2]:
# Read every column as text so pandas performs no type inference on load;
# selected columns are converted to other dtypes further down.
raw_csv_path = 'data/PHHR71FL-labeled.csv'
df = pd.read_csv(raw_csv_path, dtype='unicode')

In [3]:
# Load the data dictionary: a JSON object mapping each column code
# (e.g. 'HV001') to its human-readable description.
# The encoding is pinned because JSON is UTF-8, whereas open() would otherwise
# fall back to the platform's locale encoding.
with open('data/PHHR71FL-data-dictionary.json', 'r', encoding='utf-8') as dict_file:
    col_dict = json.load(dict_file)

col_dict

{'HHID': 'Case Identification',
 'HV000': 'Country code and phase',
 'HV001': 'Cluster number',
 'HV002': 'Household number',
 'HV003': "Respondent's line number (answering Household questionnaire)",
 'HV004': 'Ultimate area unit',
 'HV005': 'Household sample weight (6 decimals)',
 'HV006': 'Month of interview',
 'HV007': 'Year of interview',
 'HV008': 'Date of interview (CMC)',
 'HV008A': 'Date of interview Century Day Code (CDC)',
 'HV009': 'Number of household members',
 'HV010': 'Number of eligible women in household',
 'HV011': 'NA - Number of eligible men in household',
 'HV012': 'Number of de jure members',
 'HV013': 'Number of de facto members',
 'HV014': 'Number of children 5 and under (de jure)',
 'HV015': 'Result of household interview',
 'HV016': 'Day of interview',
 'HV017': 'Number of visits',
 'HV018': 'Interviewer identification',
 'HV019': 'NA - Keyer identification',
 'HV020': 'Ever-married sample',
 'HV021': 'Primary sampling unit',
 'HV022': 'Sample strata for sampl

In [4]:
# Dimensions of the frame as loaded: (rows, columns).
df.shape

(27496, 2472)

In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['HHID', 'HV000', 'HV001', 'HV002', 'HV003', 'HV004', 'HV005', 'HV006',
       'HV007', 'HV008',
       ...
       'SH233X$1', 'SH233X$2', 'SH233X$3', 'SH233X$4', 'SH233X$5', 'SH233Z$1',
       'SH233Z$2', 'SH233Z$3', 'SH233Z$4', 'SH233Z$5'],
      dtype='object', length=2472)

In [6]:
# Preview the first five rows.
df.head()

Unnamed: 0,HHID,HV000,HV001,HV002,HV003,HV004,HV005,HV006,HV007,HV008,HV008A,HV009,HV010,HV011,HV012,HV013,HV014,HV015,HV016,HV017,HV018,HV019,HV020,HV021,HV022,HV023,HV024,HV025,HV026,HV027,HV028,HV030,HV031,HV032,HV035,HV040,HV041,HV042,HV044,HV045A,HV045B,HV045C,HV046,HV801,HV802,HV803,HV804,HV807D,HV807M,HV807Y,...,SH232A$1,SH232A$2,SH232A$3,SH232A$4,SH232A$5,SH232B$1,SH232B$2,SH232B$3,SH232B$4,SH232B$5,SH233A$1,SH233A$2,SH233A$3,SH233A$4,SH233A$5,SH233B$1,SH233B$2,SH233B$3,SH233B$4,SH233B$5,SH233C$1,SH233C$2,SH233C$3,SH233C$4,SH233C$5,SH233D$1,SH233D$2,SH233D$3,SH233D$4,SH233D$5,SH233E$1,SH233E$2,SH233E$3,SH233E$4,SH233E$5,SH233F$1,SH233F$2,SH233F$3,SH233F$4,SH233F$5,SH233X$1,SH233X$2,SH233X$3,SH233X$4,SH233X$5,SH233Z$1,SH233Z$2,SH233Z$3,SH233Z$4,SH233Z$5
0,10001,PH7,1.0,1.0,1.0,1.0,364301.0,8.0,2017.0,1412.0,42961.0,8.0,2.0,,8.0,8.0,1.0,Completed,14.0,1.0,8300.0,,All woman sample,1.0,BASILAN,"BUCAY, ABRA",Autonomous Region in Muslim Mindanao,Rural,,Not selected,0.0,8300.0,,,,,,Not selected,Household selected,Tagalog,Tagalog,Yakan,No,1448.0,1527.0,39.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10002,PH7,1.0,2.0,4.0,1.0,364301.0,8.0,2017.0,1412.0,42964.0,5.0,2.0,,5.0,5.0,1.0,Completed,17.0,1.0,8303.0,,All woman sample,1.0,BASILAN,"BUCAY, ABRA",Autonomous Region in Muslim Mindanao,Rural,,Not selected,0.0,8300.0,,,,,,Not selected,Household selected,Tagalog,Tagalog,Ivatan,No,1017.0,1045.0,28.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10004,PH7,1.0,4.0,2.0,1.0,364301.0,8.0,2017.0,1412.0,42963.0,4.0,1.0,,4.0,4.0,0.0,Completed,16.0,1.0,8303.0,,All woman sample,1.0,BASILAN,"BUCAY, ABRA",Autonomous Region in Muslim Mindanao,Rural,,Not selected,0.0,8300.0,,,,,,Not selected,Household selected,Tagalog,Tagalog,Maranao,No,1455.0,1513.0,18.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10007,PH7,1.0,7.0,2.0,1.0,364301.0,8.0,2017.0,1412.0,42962.0,5.0,1.0,,5.0,5.0,2.0,Completed,15.0,1.0,8302.0,,All woman sample,1.0,BASILAN,"BUCAY, ABRA",Autonomous Region in Muslim Mindanao,Rural,,Not selected,0.0,8300.0,,,,,,Not selected,Household selected,Tagalog,Tagalog,Tausog,No,911.0,937.0,26.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,10008,PH7,1.0,8.0,1.0,1.0,364301.0,8.0,2017.0,1412.0,42961.0,11.0,0.0,,11.0,9.0,0.0,Completed,14.0,1.0,8303.0,,All woman sample,1.0,BASILAN,"BUCAY, ABRA",Autonomous Region in Muslim Mindanao,Rural,,Not selected,0.0,8300.0,,,,,,Not selected,Household selected,Tagalog,Tagalog,Tausog,Yes,1432.0,1510.0,38.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
# Summary of the index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27496 entries, 0 to 27495
Columns: 2472 entries, HHID to SH233Z$5
dtypes: object(2472)
memory usage: 518.6+ MB


In [8]:
# Drop columns that contain no data at all (every value missing).
# The name is rebound rather than using inplace=True, which gives no performance
# benefit and prevents method chaining.
df = df.dropna(axis='columns', how='all')

In [9]:
# Dimensions after removing the columns that were entirely missing.
df.shape

(27496, 1639)

In [10]:
# Count the missing values in each remaining column.
df.isna().sum()

HHID              0
HV000             0
HV001             0
HV002             0
HV003             0
HV004             0
HV005             0
HV006             0
HV007             0
HV008             0
HV008A            0
HV009             0
HV010             0
HV012             0
HV013             0
HV014             0
HV015             0
HV016             0
HV017             0
HV018             0
HV020             0
HV021             0
HV022             0
HV023             0
HV024             0
HV025             0
HV027             0
HV028             0
HV030             0
HV042             0
HV044             0
HV045A            0
HV045B            0
HV045C            0
HV046             0
HV801             0
HV802             0
HV803             0
HV201             0
HV202         17874
HV201A         6589
HV204             0
HV205             0
HV206             0
HV207             0
HV208             0
HV209             0
HV210             0
HV211             0
HV212             0


In [11]:
# Keep only the columns whose name does not contain '$' (e.g. 'SH233X$1' is removed).
# regex=False matches the dollar sign literally, so no escape sequence is needed,
# and .copy() yields an independent frame so that later column assignments do not
# raise SettingWithCopyWarning.
has_dollar = df.columns.str.contains('$', regex=False)
df = df.loc[:, ~has_dollar].copy()

In [12]:
# Dimensions after removing the columns with '$' in their name.
df.shape

(27496, 338)

In [13]:
# Inspect and change dtype of other cols as well
for col in ['HHID', 'HV001', 'HV002', 'HV003', 'HV004']:
    df[col] = df[col].astype('category')
    
df[['HHID', 'HV001', 'HV002', 'HV003', 'HV004']].dtypes

HHID     category
HV001    category
HV002    category
HV003    category
HV004    category
dtype: object

In [14]:
# Show every row that is an exact duplicate of another row;
# keep=False flags all members of a duplicate group, not only the repeats.
is_duplicate_row = df.duplicated(keep=False)
df.loc[is_duplicate_row]

Unnamed: 0,HHID,HV000,HV001,HV002,HV003,HV004,HV005,HV006,HV007,HV008,HV008A,HV009,HV010,HV012,HV013,HV014,HV015,HV016,HV017,HV018,HV020,HV021,HV022,HV023,HV024,HV025,HV027,HV028,HV030,HV042,HV044,HV045A,HV045B,HV045C,HV046,HV801,HV802,HV803,HV201,HV202,HV201A,HV204,HV205,HV206,HV207,HV208,HV209,HV210,HV211,HV212,...,SH508X,SH509,SH510A,SH510B,SH510C,SH510D,SH510E,SH510F,SH510G,SH510H,SH510I,SH510J,SH510K,SH510L,SH510M,SH510N,SH510O,SH510X,SH510Z,SH511A,SH511B,SH511C,SH511D,SH511E,SH511F,SH511G,SH511H,SH511I,SH511J,SH511K,SH511L,SH511M,SH512H,SH512I,SH512J,SH511X,SH511Z,SH512A,SH512B,SH512C,SH512D,SH512E,SH512F,SH512G,SH512X,SH512Z,SH513,SH514,SH515,SHNUMDV


In [15]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
clean_csv_path = 'data/PHHR71FL-labeled-clean.csv'
df.to_csv(clean_csv_path, index=False)