In [1]:
import pandas as pd
import numpy as np
import glob

# Lift pandas' display truncation so frames are shown with every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Data to consolidate

In [2]:
# Sub-folders of data/ that belong to each survey week.
# Weeks 1 and 2 each live in a single folder; weeks 3-5 have one folder per day.
w1 = ['06_04 - 12_04']
w2 = ['13_04 - 18_04']
w3 = ['19_04', '20_04', '21_04', '22_04', '23_04', '24_04', '25_04']
w4 = ['26_04', '27_04', '28_04', '29_04', '30_04', '01_05', '02_05']
w5 = ['03_05', '04_05', '05_05', '06_05', '07_05', '08_05', '09_05']

# Every folder of weeks 1-5, in chronological order.
w1_w5 = w1 + w2 + w3 + w4 + w5

# combined_folders[i] lists the input folders consolidated into data/<folder_names[i]>/.
combined_folders = [w1, w2, w3, w4, w5, w1_w5]
folder_names = ['w1', 'w2', 'w3', 'w4', 'w5', 'w1_w5']

# Filled by the consolidation cells with one DataFrame per entry of combined_folders.
clean_data_files, issues_files = [], []

## Clean data

In [3]:
# Columns (and their order) of the consolidated clean-data exports.
# Note: eligible_general_ration and received_free_ration only in week 1 data
cols = ["ID", "time_created", "time_modified", "surveyer", "number_id", "call_type", "call_connected",
        "availability", "state", "district", "area_or_pincode", "area_name", "pincode",
        "need_meds", "comments_health", "have_food", "where_to_get_food", "comments_food",
        "eligible_mgnrega", "eligible_pm_kisan", "eligible_ujjwala", "eligible_jan_dhan",
        "eligible_old_age_widow", "eligible_registered_labor", "eligible_bpl_ration", "eligible_apl_ration",
        "eligible_antyodaya_ration", 'eligible_general_ration', "eligible_none",
        "received_mgnrega", "received_pm_kisan", "received_ujjwala", "received_jan_dhan",
        "received_old_age_widow", "received_registered_labor", "received_free_ration_bpl_antyodaya",
        "received_free_ration_apl", "received_free_ration_without_any", 'received_free_ration',
        "received_not_eligible", "received_none", "received_dont_know", "comments_benefits",
        "employment_status", "comments_additional", "need_to_revert", "follow_up_call_needs_food_support",
        "follow_up_call_needs_health_support", "follow_up_call_needs_public_services_support",
        "follow_up_call_issue_status", "follow_up_call_support_organization", "follow_up_call_comments",
        "end_call_status", "end_call_status_category", "end_call_status_needs_urgent_support"]

# Empty the registry first so that re-running this cell does not append duplicates.
clean_data_files.clear()

# Pair each output name with its input folders directly instead of looking the
# position up again with list.index().
for folder_name, combined_folder in zip(folder_names, combined_folders):
    # glob returns matches in arbitrary order; sorting within each folder keeps
    # the folder order and makes the row order of the export reproducible.
    files = [file
             for folder in combined_folder
             for file in sorted(glob.glob('data/' + folder + '/clean_data*.csv'))]

    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies all previously loaded rows for every additional file.
    frames = [
        pd.read_csv(file).rename(columns={"needs_urgent_support": "end_call_status_needs_urgent_support"})
        for file in files
    ]
    if frames:
        clean_data = pd.concat(frames, ignore_index=True, sort=False)
    else:
        # pd.concat raises on an empty list; fall back to a header-only frame.
        clean_data = pd.DataFrame(columns=cols)

    # Keep exactly `cols`, in that order: columns absent from the inputs are
    # added as all-NaN and columns not listed in `cols` are dropped.
    clean_data = clean_data.reindex(columns=cols)
    clean_data_files.append(clean_data)

    fname_clean_data = 'data/' + folder_name + '/clean_data_' + folder_name + '.csv'
    clean_data.to_csv(fname_clean_data, index=False)

## Issues

In [4]:
# Columns (and their order) of the consolidated issues exports.
# Note: eligible_general_ration and received_free_ration only in week 1 data
cols = ['issue', "ID", "time_created", "time_modified", "surveyer", "number_id", "call_type", "call_connected",
        "availability", "state", "district", "area_or_pincode", "area_name", "pincode",
        "need_meds", "comments_health", "have_food", "where_to_get_food", "comments_food",
        "eligible_mgnrega", "eligible_pm_kisan", "eligible_ujjwala", "eligible_jan_dhan",
        "eligible_old_age_widow", "eligible_registered_labor", "eligible_bpl_ration", "eligible_apl_ration",
        "eligible_antyodaya_ration", 'eligible_general_ration', "eligible_none",
        "received_mgnrega", "received_pm_kisan", "received_ujjwala", "received_jan_dhan",
        "received_old_age_widow", "received_registered_labor", "received_free_ration_bpl_antyodaya",
        "received_free_ration_apl", "received_free_ration_without_any", 'received_free_ration',
        "received_not_eligible", "received_none", "received_dont_know", "comments_benefits",
        "employment_status", "comments_additional", "need_to_revert", "follow_up_call_needs_food_support",
        "follow_up_call_needs_health_support", "follow_up_call_needs_public_services_support",
        "follow_up_call_issue_status", "follow_up_call_support_organization", "follow_up_call_comments",
        "end_call_status", "end_call_status_category", "end_call_status_needs_urgent_support"]

# Empty the registry first so that re-running this cell does not append duplicates.
issues_files.clear()

# Pair each output name with its input folders directly instead of looking the
# position up again with list.index().
for folder_name, combined_folder in zip(folder_names, combined_folders):
    # glob returns matches in arbitrary order; sorting within each folder keeps
    # the folder order and makes the row order of the export reproducible.
    files = [file
             for folder in combined_folder
             for file in sorted(glob.glob('data/' + folder + '/issues*.csv'))]

    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies all previously loaded rows for every additional file.
    frames = [
        pd.read_csv(file).rename(columns={"needs_urgent_support": "end_call_status_needs_urgent_support"})
        for file in files
    ]
    if frames:
        issues = pd.concat(frames, ignore_index=True, sort=False)
    else:
        # pd.concat raises on an empty list; fall back to a header-only frame.
        issues = pd.DataFrame(columns=cols)

    # Keep exactly `cols`, in that order. The filtered frame used to be bound to
    # a misspelled name ("isses"), so the column selection never reached the
    # exported file or issues_files.
    issues = issues.reindex(columns=cols)
    issues_files.append(issues)

    fname_issues = 'data/' + folder_name + '/issues_' + folder_name + '.csv'
    issues.to_csv(fname_issues, index=False)

## Data aggregation

In [5]:
def _count_equal(df, columns, value=True):
    """Return how many rows of ``df`` hold ``value`` in at least one of ``columns``."""
    mask = df[columns[0]] == value
    for column in columns[1:]:
        mask = mask | (df[column] == value)
    return int(mask.sum())


def _share(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if denominator > 0:
        return numerator / denominator
    return np.nan


def get_dashboard_row(df, agg_level, state, district, full=True):
    """Summarise a set of survey responses into one dashboard row.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses. ``time_created`` must be a datetime column (the
        ``.dt`` accessor is used on it); the eligibility / receipt flag columns
        are compared against ``True`` / ``False``, so missing values count as
        neither.
    agg_level, state, district
        Labels copied unchanged into the first three cells of the row.
    full : bool, default True
        When true the row holds counts and shares (41 values); otherwise only
        the shares (20 values).

    Returns
    -------
    list
        The labels, the latest ``time_created`` date formatted as
        ``dd/mm/YYYY``, the number of distinct survey days, the number of
        responses, and then the indicators. A share whose denominator is zero
        is NaN. For an empty ``df`` the date is NaN, the day and response
        counts are 0 and every share is NaN.

    Notes
    -----
    The per-scheme "received" counts are not restricted to respondents flagged
    as eligible for that scheme, so those shares can be greater than 1.
    """
    date_format = "%d/%m/%Y"
    num_surveyed = len(df)
    if num_surveyed > 0:
        date = df['time_created'].max().strftime(date_format)
        num_days = len(df['time_created'].dt.strftime(date_format).unique())
    else:
        # Nothing was surveyed: max() would be NaT (which cannot be formatted)
        # and the shares below would divide by zero.
        date = np.nan
        num_days = 0

    # Indicators expressed as a share of everyone surveyed.
    num_unemployed = _count_equal(df, ['employment_status'], 'unemployed')
    perc_unemployed = _share(num_unemployed, num_surveyed)

    num_unemployed_post_lockdown = _count_equal(df, ['employment_status'], 'unemployedPostLockdown')
    perc_unemployed_post_lockdown = _share(num_unemployed_post_lockdown, num_surveyed)

    num_eligible_jan_dhan = _count_equal(df, ['eligible_jan_dhan'])
    perc_eligible_jan_dhan = _share(num_eligible_jan_dhan, num_surveyed)

    num_need_med = _count_equal(df, ['need_meds'])
    perc_need_med = _share(num_need_med, num_surveyed)

    num_need_food = _count_equal(df, ['have_food'], False)
    perc_need_food = _share(num_need_food, num_surveyed)

    # Eligible for at least one scheme, but reported receiving nothing.
    num_eligible_atleast_one = _count_equal(df, ['eligible_none'], False)
    num_received_no_benefit = int(((df['received_none'] == True) & (df['eligible_none'] == False)).sum())
    perc_received_no_benefit = _share(num_received_no_benefit, num_eligible_atleast_one)

    # Per-scheme receipt relative to the number flagged as eligible.
    num_eligible_mgnrega = _count_equal(df, ['eligible_mgnrega'])
    num_received_mgnrega = _count_equal(df, ['received_mgnrega'])
    perc_received_mgnrega = _share(num_received_mgnrega, num_eligible_mgnrega)

    num_eligible_pm_kisan = _count_equal(df, ['eligible_pm_kisan'])
    num_received_pm_kisan = _count_equal(df, ['received_pm_kisan'])
    perc_received_pm_kisan = _share(num_received_pm_kisan, num_eligible_pm_kisan)

    num_eligible_ujjwala = _count_equal(df, ['eligible_ujjwala'])
    num_received_ujjwala = _count_equal(df, ['received_ujjwala'])
    perc_received_ujjwala = _share(num_received_ujjwala, num_eligible_ujjwala)

    num_received_jan_dhan = _count_equal(df, ['received_jan_dhan'])
    perc_received_jan_dhan = _share(num_received_jan_dhan, num_eligible_jan_dhan)

    num_eligible_old_age_widow_disability = _count_equal(df, ['eligible_old_age_widow'])
    num_received_old_age_widow_disability = _count_equal(df, ['received_old_age_widow'])
    perc_received_old_age_widow_disability = _share(num_received_old_age_widow_disability,
                                                    num_eligible_old_age_widow_disability)

    num_eligible_registered_labor = _count_equal(df, ['eligible_registered_labor'])
    num_received_registered_labor = _count_equal(df, ['received_registered_labor'])
    perc_received_registered_labor = _share(num_received_registered_labor, num_eligible_registered_labor)

    # Note: eligible_general_ration and received_free_ration only in week 1 data
    num_eligible_free_ration = _count_equal(df, ['eligible_bpl_ration', 'eligible_apl_ration',
                                                 'eligible_antyodaya_ration', 'eligible_general_ration'])
    num_received_free_ration = _count_equal(df, ['received_free_ration_bpl_antyodaya',
                                                 'received_free_ration_apl', 'received_free_ration'])
    perc_received_free_ration = _share(num_received_free_ration, num_eligible_free_ration)

    # Same as above, but also counting people who got a ration without any card,
    # and measured against everyone surveyed.
    num_received_free_ration_irrespective_card = _count_equal(
        df, ['received_free_ration_bpl_antyodaya', 'received_free_ration_apl',
             'received_free_ration_without_any', 'received_free_ration'])
    perc_received_free_ration_irrespective_card = _share(num_received_free_ration_irrespective_card,
                                                         num_surveyed)

    if full:
        return [agg_level, state, district, date, num_days, num_surveyed,
                num_unemployed, perc_unemployed, num_unemployed_post_lockdown, perc_unemployed_post_lockdown,
                num_eligible_jan_dhan, perc_eligible_jan_dhan, num_need_med, perc_need_med,
                num_need_food, perc_need_food,
                num_eligible_atleast_one, num_received_no_benefit, perc_received_no_benefit,
                num_eligible_mgnrega, num_received_mgnrega, perc_received_mgnrega,
                num_eligible_pm_kisan, num_received_pm_kisan, perc_received_pm_kisan,
                num_eligible_ujjwala, num_received_ujjwala, perc_received_ujjwala,
                num_received_jan_dhan, perc_received_jan_dhan,
                num_eligible_old_age_widow_disability, num_received_old_age_widow_disability,
                perc_received_old_age_widow_disability,
                num_eligible_registered_labor, num_received_registered_labor, perc_received_registered_labor,
                num_eligible_free_ration, num_received_free_ration, perc_received_free_ration,
                num_received_free_ration_irrespective_card, perc_received_free_ration_irrespective_card]

    return [agg_level, state, district, date, num_days, num_surveyed, perc_unemployed, perc_unemployed_post_lockdown,
            perc_eligible_jan_dhan, perc_need_med, perc_need_food, perc_received_no_benefit,
            perc_received_mgnrega, perc_received_pm_kisan, perc_received_ujjwala, perc_received_jan_dhan,
            perc_received_old_age_widow_disability, perc_received_registered_labor, perc_received_free_ration,
            perc_received_free_ration_irrespective_card]


## Exporting dashboard with both counts and percentages

In [6]:
# Column headers of the full dashboard export (counts and percentages).
# The export loop labels the list returned by get_dashboard_row(..., full=True)
# with these names purely by position, so the order here has to match the order
# of that returned list.
column_names = ['agg_level', 'state', 'district', 'date', 'num_days', 'num_surveyed', 
                'num_unemployed', 'perc_unemployed', 'num_unemployed_post_lockdown', 'perc_unemployed_post_lockdown',
                'num_eligible_jan_dhan', 'perc_eligible_jan_dhan', 'num_need_med', 'perc_need_med', 
                'num_need_food', 'perc_need_food', 
                'num_eligible_atleast_one', 'num_received_no_benefit', 'perc_received_no_benefit', 
                'num_eligible_mgnrega', 'num_received_mgnrega', 'perc_received_mgnrega', 
                'num_eligible_pm_kisan', 'num_received_pm_kisan', 'perc_received_pm_kisan',
                'num_eligible_ujjwala', 'num_received_ujjwala', 'perc_received_ujjwala',
                'num_received_jan_dhan', 'perc_received_jan_dhan',
                'num_eligible_old_age_widow_disability', 'num_received_old_age_widow_disability', 
                'perc_received_old_age_widow_disability',
                'num_eligible_registered_labor', 'num_received_registered_labor', 'perc_received_registered_labor',
                'num_eligible_free_ration', 'num_received_free_ration', 'perc_received_free_ration',
                'num_received_free_ration_irrespective_card', 'perc_received_free_ration_irrespective_card']

In [7]:
# Build one full dashboard (counts and percentages) per consolidated folder:
# a country-level row, then one row per state followed by that state's districts.
for folder_name in folder_names:

    fname = 'data/' + folder_name + '/clean_data_' + folder_name + '.csv'
    survey = pd.read_csv(fname)
    survey['time_created'] = pd.to_datetime(survey['time_created'])

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    rows = [get_dashboard_row(survey, 'Country', np.nan, np.nan)]

    # Missing states / districts are skipped; unique() keeps first-seen order.
    states = survey['state'].dropna().unique().tolist()
    for state in states:
        state_df = survey[survey['state'] == state]
        rows.append(get_dashboard_row(state_df, 'State', state, np.nan))

        districts = state_df['district'].dropna().unique().tolist()
        for district in districts:
            dist_df = state_df[state_df['district'] == district]
            rows.append(get_dashboard_row(dist_df, 'District', state, district))

    full_dashboard = pd.DataFrame(rows, columns=column_names)

    fname_full_dashboard = 'data/' + folder_name + '/full_dashboard_' + folder_name + '.csv'
    full_dashboard.to_csv(fname_full_dashboard, index=False)

## Exporting dashboard with percentages only

In [9]:
# Column headers of the percentages-only dashboard export.
# The export loop labels the list returned by get_dashboard_row(..., full=False)
# with these names purely by position, so the order here has to match the order
# of that returned list.
column_names = ['agg_level', 'state', 'district', 'date', 'num_days', 'num_surveyed', 'perc_unemployed', 
                'perc_unemployed_post_lockdown', 'perc_eligible_jan_dhan', 'perc_need_med', 'perc_need_food', 
                'perc_received_no_benefit', 'perc_received_mgnrega', 'perc_received_pm_kisan',
                'perc_received_ujjwala', 'perc_received_jan_dhan', 'perc_received_old_age_widow_disability',
                'perc_received_registered_labor', 'perc_received_free_ration', 
                'perc_received_free_ration_irrespective_card']

In [10]:
# Build one percentages-only dashboard per consolidated folder:
# a country-level row, then one row per state followed by that state's districts.
for folder_name in folder_names:

    fname = 'data/' + folder_name + '/clean_data_' + folder_name + '.csv'
    survey = pd.read_csv(fname)
    survey['time_created'] = pd.to_datetime(survey['time_created'])

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    rows = [get_dashboard_row(survey, 'Country', np.nan, np.nan, False)]

    # Missing states / districts are skipped; unique() keeps first-seen order.
    states = survey['state'].dropna().unique().tolist()
    for state in states:
        state_df = survey[survey['state'] == state]
        rows.append(get_dashboard_row(state_df, 'State', state, np.nan, False))

        districts = state_df['district'].dropna().unique().tolist()
        for district in districts:
            dist_df = state_df[state_df['district'] == district]
            rows.append(get_dashboard_row(dist_df, 'District', state, district, False))

    dashboard = pd.DataFrame(rows, columns=column_names)

    fname_dashboard = 'data/' + folder_name + '/dashboard_' + folder_name + '.csv'
    dashboard.to_csv(fname_dashboard, index=False)

## Exporting qualitative responses

In [11]:
# Identifier columns plus the free-text comment columns written to the
# qualitative export; columns absent from a survey file are skipped at export time.
cols = ['ID', 'number_id', 'time_created', 'surveyer', 'state', 'district', 'comments_health', 
        'comments_food', 'comments_benefits', 'comments_additional', "follow_up_call_comments"]

In [12]:
# Write the comment columns of every consolidated survey to its own CSV.
for folder_name in folder_names:
    folder_path = 'data/' + folder_name + '/'
    fname = folder_path + 'clean_data_' + folder_name + '.csv'
    fname_qualitative = folder_path + 'qualitative_' + folder_name + '.csv'

    survey = pd.read_csv(fname)
    # Only request columns that this survey file actually contains.
    available_cols = [col for col in cols if col in survey.columns]
    survey.to_csv(fname_qualitative, columns=available_cols, index=False)