In [32]:
import numpy as np
import pandas as pd
from datetime import timedelta
import Levenshtein
pd.options.mode.chained_assignment = None

## Problem statement
For each state, district and overall, find the following ratio: total number of persons vaccinated (both 1 and 2 doses) to total population. (If a district is absent in 2011 census, drop it from analysis.)
Output them in the following manner: districtid, vaccinateddose1ratio, vaccinateddose2ratio.

In [33]:
vacc_data1 = pd.read_csv('../data/cowin_vaccine_data_districtwise.csv', low_memory=False)
vacc_data1

Unnamed: 0,S No,State_Code,State,District_Key,Cowin Key,District,16/01/2021,16/01/2021.1,16/01/2021.2,16/01/2021.3,...,31/10/2021,31/10/2021.1,31/10/2021.2,31/10/2021.3,31/10/2021.4,31/10/2021.5,31/10/2021.6,31/10/2021.7,31/10/2021.8,31/10/2021.9
0,,,,,,,Total Individuals Registered,Sessions,Sites,First Dose Administered,...,Total Doses Administered,Sessions,Sites,First Dose Administered,Second Dose Administered,Male(Individuals Vaccinated),Female(Individuals Vaccinated),Transgender(Individuals Vaccinated),Covaxin (Doses Administered),CoviShield (Doses Administered)
1,1.0,AN,Andaman and Nicobar Islands,AN_Nicobars,Nicobar,Nicobars,745,0,0,0,...,,,,,,,,,,
2,2.0,AN,Andaman and Nicobar Islands,AN_North and Middle Andaman,North and Middle Andaman,North and Middle Andaman,1496,0,0,0,...,,,,,,,,,,
3,3.0,AN,Andaman and Nicobar Islands,AN_South Andaman,South Andaman,South Andaman,6028,2,2,23,...,,,,,,,,,,
4,4.0,AP,Andhra Pradesh,AP_Anantapur,Anantapur,Anantapur,20781,28,26,287,...,,,,,,,,,,
5,5.0,AP,Andhra Pradesh,AP_Chittoor,Chittoor,Chittoor,26285,63,31,424,...,,,,,,,,,,
6,6.0,AP,Andhra Pradesh,AP_East Godavari,East Godavari,East Godavari,23819,39,37,1012,...,,,,,,,,,,
7,7.0,AP,Andhra Pradesh,AP_Guntur,Guntur,Guntur,25859,31,31,303,...,,,,,,,,,,
8,8.0,AP,Andhra Pradesh,AP_Krishna,Krishna,Krishna,31419,42,31,144,...,,,,,,,,,,
9,9.0,AP,Andhra Pradesh,AP_Kurnool,Kurnool,Kurnool,18219,54,29,214,...,,,,,,,,,,


In [34]:
vacc_data1.iloc[0]

S No                                            NaN
State_Code                                      NaN
State                                           NaN
District_Key                                    NaN
Cowin Key                                       NaN
District                                        NaN
16/01/2021             Total Individuals Registered
16/01/2021.1                               Sessions
16/01/2021.2                                 Sites 
16/01/2021.3                First Dose Administered
16/01/2021.4               Second Dose Administered
16/01/2021.5           Male(Individuals Vaccinated)
16/01/2021.6         Female(Individuals Vaccinated)
16/01/2021.7    Transgender(Individuals Vaccinated)
16/01/2021.8           Covaxin (Doses Administered)
16/01/2021.9        CoviShield (Doses Administered)
17/01/2021             Total Individuals Registered
17/01/2021.1                               Sessions
17/01/2021.2                                 Sites 
17/01/2021.3

In [35]:
vacc_data = vacc_data1.iloc[1:]
vacc_data.fillna(0, inplace=True)
col = list(vacc_data.columns[6:])
vacc_data[col] = vacc_data[col].astype(int)
vacc_data

Unnamed: 0,S No,State_Code,State,District_Key,Cowin Key,District,16/01/2021,16/01/2021.1,16/01/2021.2,16/01/2021.3,...,31/10/2021,31/10/2021.1,31/10/2021.2,31/10/2021.3,31/10/2021.4,31/10/2021.5,31/10/2021.6,31/10/2021.7,31/10/2021.8,31/10/2021.9
1,1.0,AN,Andaman and Nicobar Islands,AN_Nicobars,Nicobar,Nicobars,745,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,AN,Andaman and Nicobar Islands,AN_North and Middle Andaman,North and Middle Andaman,North and Middle Andaman,1496,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,AN,Andaman and Nicobar Islands,AN_South Andaman,South Andaman,South Andaman,6028,2,2,23,...,0,0,0,0,0,0,0,0,0,0
4,4.0,AP,Andhra Pradesh,AP_Anantapur,Anantapur,Anantapur,20781,28,26,287,...,0,0,0,0,0,0,0,0,0,0
5,5.0,AP,Andhra Pradesh,AP_Chittoor,Chittoor,Chittoor,26285,63,31,424,...,0,0,0,0,0,0,0,0,0,0
6,6.0,AP,Andhra Pradesh,AP_East Godavari,East Godavari,East Godavari,23819,39,37,1012,...,0,0,0,0,0,0,0,0,0,0
7,7.0,AP,Andhra Pradesh,AP_Guntur,Guntur,Guntur,25859,31,31,303,...,0,0,0,0,0,0,0,0,0,0
8,8.0,AP,Andhra Pradesh,AP_Krishna,Krishna,Krishna,31419,42,31,144,...,0,0,0,0,0,0,0,0,0,0
9,9.0,AP,Andhra Pradesh,AP_Kurnool,Kurnool,Kurnool,18219,54,29,214,...,0,0,0,0,0,0,0,0,0,0
10,10.0,AP,Andhra Pradesh,AP_Prakasam,Prakasam,Prakasam,13015,17,16,202,...,0,0,0,0,0,0,0,0,0,0


In [36]:
distr_data = pd.read_csv('../output/edge-graph.csv', index_col='District_Key')
distr_data.drop(columns=['Unnamed: 0'], inplace=True)
districts_key = list(distr_data.index)
districts_key

['AP_Anantapur',
 'AP_Chittoor',
 'AP_East Godavari',
 'AP_Guntur',
 'AP_Krishna',
 'AP_Kurnool',
 'AP_Prakasam',
 'AP_S.P.S. Nellore',
 'AP_Srikakulam',
 'AP_Visakhapatnam',
 'AP_Vizianagaram',
 'AP_West Godavari',
 'AP_Y.S.R. Kadapa',
 'AR_Anjaw',
 'AR_Changlang',
 'AR_East Kameng',
 'AR_East Siang',
 'AR_Kamle',
 'AR_Kra Daadi',
 'AR_Kurung Kumey',
 'AR_Lepa Rada',
 'AR_Lohit',
 'AR_Longding',
 'AR_Lower Dibang Valley',
 'AR_Lower Siang',
 'AR_Lower Subansiri',
 'AR_Namsai',
 'AR_Pakke Kessang',
 'AR_Papum Pare',
 'AR_Shi Yomi',
 'AR_Siang',
 'AR_Tawang',
 'AR_Tirap',
 'AR_Upper Dibang Valley',
 'AR_Upper Siang',
 'AR_Upper Subansiri',
 'AR_West Kameng',
 'AR_West Siang',
 'AS_Baksa',
 'AS_Barpeta',
 'AS_Biswanath',
 'AS_Bongaigaon',
 'AS_Cachar',
 'AS_Charaideo',
 'AS_Chirang',
 'AS_Darrang',
 'AS_Dhemaji',
 'AS_Dhubri',
 'AS_Dibrugarh',
 'AS_Dima Hasao',
 'AS_Goalpara',
 'AS_Golaghat',
 'AS_Hailakandi',
 'AS_Hojai',
 'AS_Jorhat',
 'AS_Kamrup',
 'AS_Kamrup Metropolitan',
 'AS_Karbi

In [37]:
map_dkey_sname, map_dkey_dname = {}, {}
for dt in districts_key:
    map_dkey_sname[dt] = vacc_data[vacc_data['District_Key'] == dt]['State'].iloc[0]
    map_dkey_dname[dt] = vacc_data[vacc_data['District_Key'] == dt]['District'].iloc[0]
map_dkey_dname

{'AP_Anantapur': 'Anantapur',
 'AP_Chittoor': 'Chittoor',
 'AP_East Godavari': 'East Godavari',
 'AP_Guntur': 'Guntur',
 'AP_Krishna': 'Krishna',
 'AP_Kurnool': 'Kurnool',
 'AP_Prakasam': 'Prakasam',
 'AP_S.P.S. Nellore': 'S.P.S. Nellore',
 'AP_Srikakulam': 'Srikakulam',
 'AP_Visakhapatnam': 'Visakhapatnam',
 'AP_Vizianagaram': 'Vizianagaram',
 'AP_West Godavari': 'West Godavari',
 'AP_Y.S.R. Kadapa': 'Y.S.R. Kadapa',
 'AR_Anjaw': 'Anjaw',
 'AR_Changlang': 'Changlang',
 'AR_East Kameng': 'East Kameng',
 'AR_East Siang': 'East Siang',
 'AR_Kamle': 'Kamle',
 'AR_Kra Daadi': 'Kra Daadi',
 'AR_Kurung Kumey': 'Kurung Kumey',
 'AR_Lepa Rada': 'Lepa Rada',
 'AR_Lohit': 'Lohit',
 'AR_Longding': 'Longding',
 'AR_Lower Dibang Valley': 'Lower Dibang Valley',
 'AR_Lower Siang': 'Lower Siang',
 'AR_Lower Subansiri': 'Lower Subansiri',
 'AR_Namsai': 'Namsai',
 'AR_Pakke Kessang': 'Pakke Kessang',
 'AR_Papum Pare': 'Papum Pare',
 'AR_Shi Yomi': 'Shi Yomi',
 'AR_Siang': 'Siang',
 'AR_Tawang': 'Tawang'

In [38]:
pop_data = pd.read_excel('../data/DDW_PCA0000_2011_Indiastatedist.xlsx')
pop_data = pop_data[['Name', 'Level', 'TRU', 'TOT_P']]
pop_data = pop_data[pop_data['TRU'] == 'Total']
pop_data.drop(columns=['TRU'], inplace=True)
pop_data

Unnamed: 0,Name,Level,TOT_P
0,India,India,1210854977
3,JAMMU & KASHMIR,STATE,12541302
6,Kupwara,DISTRICT,870354
9,Badgam,DISTRICT,753745
12,Leh(Ladakh),DISTRICT,133487
15,Kargil,DISTRICT,140802
18,Punch,DISTRICT,476835
21,Rajouri,DISTRICT,642415
24,Kathua,DISTRICT,616435
27,Baramula,DISTRICT,1008039


In [39]:
overall_pop_data = pop_data[pop_data['Level'] == 'India']
overall_pop_data

Unnamed: 0,Name,Level,TOT_P
0,India,India,1210854977


In [40]:
vacc_data[vacc_data['District_Key'] == 'AP_Anantapur'].iloc[:, :100]

Unnamed: 0,S No,State_Code,State,District_Key,Cowin Key,District,16/01/2021,16/01/2021.1,16/01/2021.2,16/01/2021.3,...,24/01/2021.4,24/01/2021.5,24/01/2021.6,24/01/2021.7,24/01/2021.8,24/01/2021.9,25/01/2021,25/01/2021.1,25/01/2021.2,25/01/2021.3
4,4.0,AP,Andhra Pradesh,AP_Anantapur,Anantapur,Anantapur,20781,28,26,287,...,0,535,3861,0,0,4396,25439,529,111,4923


In [41]:
# This function will find the most similar district word from the given district word in the dataset.
def get_most_similar_word(df, w, name):
    '''
    inputs:
        df : input data containing all the state and districts code i.e DataFrame format
        w : district name
    output:
        output1 : most similar district
        output2 : similarity score between two given word input and most similar word
        
    '''
    temp_df = df.copy()
    temp_df = temp_df[1:]
    temp_df['Levenshtein_dist'] = temp_df[name].apply(lambda x: Levenshtein.distance(w, x))
    temp_df = temp_df.sort_values(by=['Levenshtein_dist'])
#     print(temp_df['District'].iloc[0])
    #print("with value: ", temp_df['Levenshtein_dist'].iloc[0])
    return temp_df[name].iloc[0], temp_df['Levenshtein_dist'].iloc[0]

In [42]:
def find_correct_match(distr):
    '''
    inputs:
        distr : district name for finding correct match if  Levenshtein_distance doesn't work
    output:
        output1 : correct match if available otherwise return empty string
        
    '''
    correct_match = {'Belgaum': 'Belagavi', 'Leh(Ladakh)':'Leh', 'Gurgaon':'Gurugram', 'Baleshwar': 'Balasore', 'Rangareddy': 'Ranga Reddy', 'Hugli':'Hooghly', 'Sahibzada Ajit Singh Nagar': 'S.A.S. Nagar', 
                 'Sant Ravidas Nagar':'Bhadohi','Faizabad':'Ayodhya', 'Muktsar': 'Sri Muktsar Sahib', 'Pashchim Champaran':'West Champaran','Pashchimi Singhbhum': 'West Singhbhum',
                'Bid': 'Beed', 'East Karbi Anglong':'Karbi Anglong', 'The Dangs':'Dang', 'Dantewada':'Dakshin Bastar Dantewada','Purbi Singhbhum':'East Singhbhum', 'Warangal':'Warangal Urban',
                'Jyotiba Phule Nagar': 'Amroha', 'Allahabad ':'Prayagraj', 'Khandwa (East Nimar)':'Khandwa', 'Dohad':'Dahod', 'Kachchh':'Kutch', 'Kheri':'Lakhimpur Kheri', 'Kaimur (Bhabua)':'Kaimur', 
                'Y.S.R.':'Y.S.R. Kadapa', "Sant Ravidas Nagar (Bhadohi)": "Bhadohi", 'Gulbarga':'Kalaburagi', 'Shimoga':'Shivamogga', 'Bangalore':'Bengaluru Urban'}
    if distr in correct_match.keys():
        return correct_match[distr]
    return ''

In [43]:
def not_included_district(dt):
    not_included = ['Mewat ', 'North West', 'North', 'North East', 'East', 
                    'Central', 'West', 'South West', 'South',
                    'Kanshiram Nagar', 'North  District','West District', 'Barddhaman ',
                    'South District', 'East District', 'Mumbai Suburban', 'Mahamaya Nagar']
    
    if dt not in not_included: return True
    return False

In [44]:
df = pop_data.copy()
df = df[df['Level'] == 'DISTRICT']
map_dt_cor_dt={}
map_dname_dkey={}
for dt in df['Name'].unique():
    if not not_included_district(dt): 
        print("Not including this district: ", dt)
        continue
    if dt not in map_dkey_dname.values():
        sm_dt, score = get_most_similar_word(vacc_data, dt, 'District')
        cor_match_dt = find_correct_match(dt)
        if len(cor_match_dt)>0:
            sm_dt = cor_match_dt
            map_dt_cor_dt[dt] = sm_dt
            map_dname_dkey[dt] = vacc_data[vacc_data['District'] == sm_dt]['District_Key'].iloc[0]
            print("District available: ", dt)
            print("Correct Matched found with: ", cor_match_dt)
        elif sm_dt in map_dkey_dname.values():
            print("District available: ", dt)
            print("Correct Matched with: ", sm_dt)
            map_dt_cor_dt[dt] = sm_dt
            map_dname_dkey[dt] = vacc_data[vacc_data['District'] == sm_dt]['District_Key'].iloc[0]
    else:
        map_dt_cor_dt[dt] = dt
        map_dname_dkey[dt] = vacc_data[vacc_data['District'] == dt]['District_Key'].iloc[0]

df['District'] = df['Name'].apply(lambda x: map_dt_cor_dt[x] if x in map_dt_cor_dt.keys() else np.nan)
df.dropna(subset=['District'], inplace=True)
df['District_Key'] = df['Name'].apply(lambda x: map_dname_dkey[x])
df.drop(columns=['Name', 'District', 'Level'], inplace=True)
df = df[['District_Key', 'TOT_P']]

District available:  Badgam
Correct Matched with:  Budgam
District available:  Leh(Ladakh)
Correct Matched found with:  Leh
District available:  Baramula
Correct Matched with:  Baramulla
District available:  Bandipore
Correct Matched with:  Bandipora
District available:  Shupiyan
Correct Matched with:  Shopiyan
District available:  Lahul & Spiti
Correct Matched with:  Lahaul and Spiti
District available:  Kapurthala 
Correct Matched with:  Kapurthala
District available:  Shahid Bhagat Singh Nagar 
Correct Matched with:  Shahid Bhagat Singh Nagar
District available:  Firozpur
Correct Matched with:  Ferozepur
District available:  Muktsar
Correct Matched found with:  Sri Muktsar Sahib
District available:  Amritsar 
Correct Matched with:  Amritsar
District available:  Sahibzada Ajit Singh Nagar
Correct Matched found with:  S.A.S. Nagar
District available:  Garhwal
Correct Matched with:  Garhwa
District available:  Almora

Correct Matched with:  Almora
District available:  Hardwar
Correct M

District available:  Gulbarga
Correct Matched found with:  Kalaburagi
District available:  Bangalore Rural
Correct Matched with:  Bengaluru Rural
District available:  Idukki 
Correct Matched with:  Idukki
District available:  Namakkal   
Correct Matched with:  Namakkal
District available:  The Nilgiris
Correct Matched with:  Nilgiris
District available:  Karur 
Correct Matched with:  Karur
District available:  Perambalur  
Correct Matched with:  Perambalur
District available:  Ariyalur  
Correct Matched with:  Ariyalur
District available:  Nagapattinam  
Correct Matched with:  Nagapattinam
District available:  Theni  
Correct Matched with:  Theni
District available:  Tirunelveli 
Correct Matched with:  Tirunelveli
District available:  Kanniyakumari
Correct Matched with:  Kanyakumari
District available:  Nicobars
Correct Matched with:  Nalbari


In [45]:
sdf = pop_data.copy()
sdf = sdf[sdf['Level'] == 'STATE']
is_available = {'DAMAN & DIU':'Dadra and Nagar Haveli and Daman and Diu', 
                'DADRA & NAGAR HAVELI': 'Dadra and Nagar Haveli and Daman and Diu'}
for st in sdf['Name'].unique():
    if st in is_available.keys():
        sdf.loc[sdf['Name'] == st, 'State_Code'] = vacc_data[vacc_data['State'] == is_available[st]]['State_Code'].iloc[0]
    else:
        sm_st, _ = get_most_similar_word(vacc_data, st.title(), 'State')
        sdf.loc[sdf['Name'] == st, 'State_Code'] = vacc_data[vacc_data['State'] == sm_st]['State_Code'].iloc[0]
sdf.drop(columns=['Name', 'Level'], inplace=True)
sdf = sdf.groupby('State_Code').sum()
sdf

Unnamed: 0_level_0,TOT_P
State_Code,Unnamed: 1_level_1
AN,380581
AP,84580777
AR,1383727
AS,31205576
BR,104099452
CH,1055450
CT,25545198
DL,16787941
DN,586956
GA,1458545


In [46]:
distr_wise = pd.merge(vacc_data, df, how ='inner', on =['District_Key'])
distr_wise.fillna(0, inplace=True)
distr_wise

Unnamed: 0,S No,State_Code,State,District_Key,Cowin Key,District,16/01/2021,16/01/2021.1,16/01/2021.2,16/01/2021.3,...,31/10/2021.1,31/10/2021.2,31/10/2021.3,31/10/2021.4,31/10/2021.5,31/10/2021.6,31/10/2021.7,31/10/2021.8,31/10/2021.9,TOT_P
0,4.0,AP,Andhra Pradesh,AP_Anantapur,Anantapur,Anantapur,20781,28,26,287,...,0,0,0,0,0,0,0,0,0,4081148
1,5.0,AP,Andhra Pradesh,AP_Chittoor,Chittoor,Chittoor,26285,63,31,424,...,0,0,0,0,0,0,0,0,0,4174064
2,6.0,AP,Andhra Pradesh,AP_East Godavari,East Godavari,East Godavari,23819,39,37,1012,...,0,0,0,0,0,0,0,0,0,5154296
3,7.0,AP,Andhra Pradesh,AP_Guntur,Guntur,Guntur,25859,31,31,303,...,0,0,0,0,0,0,0,0,0,4887813
4,8.0,AP,Andhra Pradesh,AP_Krishna,Krishna,Krishna,31419,42,31,144,...,0,0,0,0,0,0,0,0,0,4517398
5,9.0,AP,Andhra Pradesh,AP_Kurnool,Kurnool,Kurnool,18219,54,29,214,...,0,0,0,0,0,0,0,0,0,4053463
6,10.0,AP,Andhra Pradesh,AP_Prakasam,Prakasam,Prakasam,13015,17,16,202,...,0,0,0,0,0,0,0,0,0,3397448
7,11.0,AP,Andhra Pradesh,AP_S.P.S. Nellore,Sri Potti Sriramulu Nellore,S.P.S. Nellore,16944,64,26,277,...,0,0,0,0,0,0,0,0,0,2963557
8,12.0,AP,Andhra Pradesh,AP_Srikakulam,Srikakulam,Srikakulam,16335,35,18,165,...,0,0,0,0,0,0,0,0,0,2703114
9,13.0,AP,Andhra Pradesh,AP_Visakhapatnam,Visakhapatnam,Visakhapatnam,41378,64,35,527,...,0,0,0,0,0,0,0,0,0,4290589


In [47]:
state_wise_pop_data = sdf.copy()
state_wise_pop_data = state_wise_pop_data.reset_index()
state_wise_pop_data

Unnamed: 0,State_Code,TOT_P
0,AN,380581
1,AP,84580777
2,AR,1383727
3,AS,31205576
4,BR,104099452
5,CH,1055450
6,CT,25545198
7,DL,16787941
8,DN,586956
9,GA,1458545


In [48]:
ddf = distr_wise.copy()
start_date = pd.to_datetime('16/01/2021', format='%d/%m/%Y').date()
end_date = pd.to_datetime('14/08/2021', format='%d/%m/%Y').date()
vacc_col = []
get_original_date=[]
while start_date<=end_date:
    start_date = start_date.strftime('%d/%m/%Y')
    dose1 = str(start_date)+'.3'
    dose2 = str(start_date)+'.4'
    
    #Convert back to datetime format
    start_date = start_date.split('.')[0]
    start_date = pd.to_datetime(start_date, format='%d/%m/%Y').date()
    
    if dose1 not in ddf.columns or dose2 not in ddf.columns:
        print("Date not available: ", start_date)
        continue
    ddf[dose1] = ddf[dose1].astype(int)
    ddf[dose2] = ddf[dose2].astype(int)
    get_original_date.append(start_date.strftime('%d/%m/%Y'))
    vacc_col.append(start_date.strftime('%d/%m/%Y')+'_1')
    vacc_col.append(start_date.strftime('%d/%m/%Y')+'_2')
    ddf[start_date.strftime('%d/%m/%Y')+'_1'] = ddf[dose1] 
    ddf[start_date.strftime('%d/%m/%Y')+'_2'] = ddf[dose2] 
    start_date += timedelta(days=1)

In [49]:
ddf

Unnamed: 0,S No,State_Code,State,District_Key,Cowin Key,District,16/01/2021,16/01/2021.1,16/01/2021.2,16/01/2021.3,...,10/08/2021_1,10/08/2021_2,11/08/2021_1,11/08/2021_2,12/08/2021_1,12/08/2021_2,13/08/2021_1,13/08/2021_2,14/08/2021_1,14/08/2021_2
0,4.0,AP,Andhra Pradesh,AP_Anantapur,Anantapur,Anantapur,20781,28,26,287,...,1292062,563906,1298043,569328,1299214,571164,1357367,594250,1364374,597346
1,5.0,AP,Andhra Pradesh,AP_Chittoor,Chittoor,Chittoor,26285,63,31,424,...,1463591,640538,1464471,640668,1464566,640753,1544219,640773,1565591,640864
2,6.0,AP,Andhra Pradesh,AP_East Godavari,East Godavari,East Godavari,23819,39,37,1012,...,1879353,544608,1881360,545978,1881778,546816,1924155,551251,1927490,552473
3,7.0,AP,Andhra Pradesh,AP_Guntur,Guntur,Guntur,25859,31,31,303,...,1692429,564197,1705191,569098,1709624,571534,1747460,587644,1758535,592441
4,8.0,AP,Andhra Pradesh,AP_Krishna,Krishna,Krishna,31419,42,31,144,...,1707961,542621,1713851,545372,1721065,547797,1737195,552836,1756261,557589
5,9.0,AP,Andhra Pradesh,AP_Kurnool,Kurnool,Kurnool,18219,54,29,214,...,1182041,476527,1187902,478899,1189983,479770,1220728,487236,1242692,492628
6,10.0,AP,Andhra Pradesh,AP_Prakasam,Prakasam,Prakasam,13015,17,16,202,...,1194897,497477,1207963,504569,1211101,507244,1220659,510952,1231912,515791
7,11.0,AP,Andhra Pradesh,AP_S.P.S. Nellore,Sri Potti Sriramulu Nellore,S.P.S. Nellore,16944,64,26,277,...,1122921,517716,1125571,518370,1142085,523810,1146908,525613,1158076,528689
8,12.0,AP,Andhra Pradesh,AP_Srikakulam,Srikakulam,Srikakulam,16335,35,18,165,...,1018985,330517,1023127,333494,1024263,334773,1026409,336118,1028569,336972
9,13.0,AP,Andhra Pradesh,AP_Visakhapatnam,Visakhapatnam,Visakhapatnam,41378,64,35,527,...,1758550,590250,1784398,602216,1787851,603149,1851746,616498,1875574,623159


In [50]:
dkey_skey_col = ['State_Code', 'District_Key', 'TOT_P']
final_col = dkey_skey_col + vacc_col

final_df = ddf[final_col]
final_df.sort_values('District_Key', inplace=True)
final_df

Unnamed: 0,State_Code,District_Key,TOT_P,16/01/2021_1,16/01/2021_2,17/01/2021_1,17/01/2021_2,18/01/2021_1,18/01/2021_2,19/01/2021_1,...,10/08/2021_1,10/08/2021_2,11/08/2021_1,11/08/2021_2,12/08/2021_1,12/08/2021_2,13/08/2021_1,13/08/2021_2,14/08/2021_1,14/08/2021_2
0,AP,AP_Anantapur,4081148,287,0,504,0,778,0,1324,...,1292062,563906,1298043,569328,1299214,571164,1357367,594250,1364374,597346
1,AP,AP_Chittoor,4174064,424,0,1012,0,925,0,1798,...,1463591,640538,1464471,640668,1464566,640753,1544219,640773,1565591,640864
2,AP,AP_East Godavari,5154296,1012,0,2059,0,3126,0,9043,...,1879353,544608,1881360,545978,1881778,546816,1924155,551251,1927490,552473
3,AP,AP_Guntur,4887813,303,0,472,0,647,0,1176,...,1692429,564197,1705191,569098,1709624,571534,1747460,587644,1758535,592441
4,AP,AP_Krishna,4517398,144,0,229,0,319,0,490,...,1707961,542621,1713851,545372,1721065,547797,1737195,552836,1756261,557589
5,AP,AP_Kurnool,4053463,214,0,631,0,668,0,1184,...,1182041,476527,1187902,478899,1189983,479770,1220728,487236,1242692,492628
6,AP,AP_Prakasam,3397448,202,0,868,0,1295,0,1948,...,1194897,497477,1207963,504569,1211101,507244,1220659,510952,1231912,515791
7,AP,AP_S.P.S. Nellore,2963557,277,0,727,0,998,0,1907,...,1122921,517716,1125571,518370,1142085,523810,1146908,525613,1158076,528689
8,AP,AP_Srikakulam,2703114,165,0,404,0,529,0,1061,...,1018985,330517,1023127,333494,1024263,334773,1026409,336118,1028569,336972
9,AP,AP_Visakhapatnam,4290589,527,0,944,0,1328,0,2019,...,1758550,590250,1784398,602216,1787851,603149,1851746,616498,1875574,623159


In [51]:
def get_districtwise_df(final_df, dt):
    ddf = final_df.copy()
    ddf = ddf[ddf['District_Key'] == dt]

    d1 = [c for c in ddf.columns[3::2]]
    d2 = [c for c in ddf.columns[4::2]]
    dose1_df = ddf.loc[:, d1]
    dose2_df = ddf.loc[:, d2]
    
    return dose1_df.iloc[0][-1], dose2_df.iloc[0][-1]

def get_statewise_df(state_final_df, st):
    ddf = state_final_df.copy()
    ddf = ddf[ddf['State_Code'] == st]
    ddf = ddf.groupby('State_Code').sum()
    

    d1 = [c for c in ddf.columns[1::2]]
    d2 = [c for c in ddf.columns[2::2]]
    dose1_df = ddf.loc[:, d1]
    dose2_df = ddf.loc[:, d2]

    return dose1_df.iloc[0][-1], dose2_df.iloc[0][-1]


In [52]:
dt_df = final_df.copy()
for dt in dt_df['District_Key'].unique():
    print(dt)
    d1, d2 = get_districtwise_df(final_df, dt)
    dt_df.loc[dt_df['District_Key'] == dt, 'dose1'] = d1
    dt_df.loc[dt_df['District_Key'] == dt, 'dose2'] = d2

AP_Anantapur
AP_Chittoor
AP_East Godavari
AP_Guntur
AP_Krishna
AP_Kurnool
AP_Prakasam
AP_S.P.S. Nellore
AP_Srikakulam
AP_Visakhapatnam
AP_Vizianagaram
AP_West Godavari
AP_Y.S.R. Kadapa
AR_Anjaw
AR_Changlang
AR_East Kameng
AR_East Siang
AR_Kurung Kumey
AR_Lohit
AR_Lower Dibang Valley
AR_Lower Subansiri
AR_Papum Pare
AR_Tawang
AR_Tirap
AR_Upper Dibang Valley
AR_Upper Siang
AR_Upper Subansiri
AR_West Kameng
AR_West Siang
AS_Baksa
AS_Barpeta
AS_Bongaigaon
AS_Cachar
AS_Chirang
AS_Darrang
AS_Dhemaji
AS_Dhubri
AS_Dibrugarh
AS_Dima Hasao
AS_Goalpara
AS_Golaghat
AS_Hailakandi
AS_Hojai
AS_Jorhat
AS_Kamrup
AS_Kamrup Metropolitan
AS_Karbi Anglong
AS_Karimganj
AS_Kokrajhar
AS_Lakhimpur
AS_Morigaon
AS_Nagaon
AS_Nalbari
AS_Sivasagar
AS_Sonitpur
AS_Tinsukia
AS_Udalguri
BR_Araria
BR_Arwal
BR_Aurangabad
BR_Banka
BR_Begusarai
BR_Bhagalpur
BR_Bhojpur
BR_Buxar
BR_Darbhanga
BR_Gaya
BR_Gopalganj
BR_Jamui
BR_Jehanabad
BR_Kaimur
BR_Katihar
BR_Khagaria
BR_Kishanganj
BR_Lakhisarai
BR_Madhepura
BR_Madhubani
BR_Mu

In [53]:
state_final_df = final_df.copy()
for st in state_wise_pop_data['State_Code'].unique():
    if st not in state_final_df['State_Code'].unique(): continue
    d1, d2 = get_statewise_df(state_final_df, st)
    state_wise_pop_data.loc[state_wise_pop_data['State_Code'] == st, 'dose1'] = d1
    state_wise_pop_data.loc[state_wise_pop_data['State_Code'] == st, 'dose2'] = d2
state_wise_pop_data = state_wise_pop_data.dropna()
state_wise_pop_data

Unnamed: 0,State_Code,TOT_P,dose1,dose2
1,AP,84580777,18516141.0,6485212.0
2,AR,1383727,556214.0,161506.0
3,AS,31205576,10831255.0,2361667.0
4,BR,104099452,25337996.0,4821328.0
5,CH,1055450,733579.0,244351.0
6,CT,25545198,9096628.0,2805500.0
7,DL,16787941,648056.0,320117.0
8,DN,586956,598380.0,90421.0
9,GA,1458545,1112815.0,329909.0
10,GJ,60439692,26446345.0,8263804.0


In [54]:
new_df = dt_df.copy()
new_df = new_df[['District_Key', 'dose1', 'dose2', 'TOT_P']]
new_df

Unnamed: 0,District_Key,dose1,dose2,TOT_P
0,AP_Anantapur,1364374.0,597346.0,4081148
1,AP_Chittoor,1565591.0,640864.0,4174064
2,AP_East Godavari,1927490.0,552473.0,5154296
3,AP_Guntur,1758535.0,592441.0,4887813
4,AP_Krishna,1756261.0,557589.0,4517398
5,AP_Kurnool,1242692.0,492628.0,4053463
6,AP_Prakasam,1231912.0,515791.0,3397448
7,AP_S.P.S. Nellore,1158076.0,528689.0,2963557
8,AP_Srikakulam,1028569.0,336972.0,2703114
9,AP_Visakhapatnam,1875574.0,623159.0,4290589


In [55]:
st_df = state_wise_pop_data.copy()
st_df = st_df[['State_Code', 'dose1', 'dose2', 'TOT_P']]
st_df

Unnamed: 0,State_Code,dose1,dose2,TOT_P
1,AP,18516141.0,6485212.0,84580777
2,AR,556214.0,161506.0,1383727
3,AS,10831255.0,2361667.0,31205576
4,BR,25337996.0,4821328.0,104099452
5,CH,733579.0,244351.0,1055450
6,CT,9096628.0,2805500.0,25545198
7,DL,648056.0,320117.0,16787941
8,DN,598380.0,90421.0,586956
9,GA,1112815.0,329909.0,1458545
10,GJ,26446345.0,8263804.0,60439692


In [56]:
overall_pop_data['dose1'] = st_df['dose1'].sum()
overall_pop_data['dose2'] = st_df['dose2'].sum()
overall_pop_data

Unnamed: 0,Name,Level,TOT_P,dose1,dose2
0,India,India,1210854977,375349862.0,104814265.0


In [57]:
new_df['vaccinateddose1ratio'] = new_df['dose1']/new_df['TOT_P']
new_df['vaccinateddose2ratio'] = new_df['dose2']/new_df['TOT_P']
new_df = new_df[['District_Key', 'vaccinateddose1ratio', 'vaccinateddose2ratio']]
new_df.rename(columns={'District_Key':'districtid'}, inplace=True)
new_df = new_df.sort_values('vaccinateddose1ratio')
new_df = new_df.set_index('districtid')
district_ratio = new_df.copy()
district_ratio

Unnamed: 0_level_0,vaccinateddose1ratio,vaccinateddose2ratio
districtid,Unnamed: 1_level_1,Unnamed: 2_level_1
CT_Bijapur,0.036491,0.012894
TG_Adilabad,0.037492,0.012564
TG_Mahabubnagar,0.040173,0.015808
MN_Senapati,0.052756,0.010348
AS_Hojai,0.055612,0.012480
TG_Medak,0.057857,0.018973
MP_Harda,0.070311,0.011487
TN_Kancheepuram,0.076711,0.011720
PB_Ferozepur,0.089363,0.022857
CT_Balrampur,0.090650,0.018597


In [58]:
st_df['vaccinateddose1ratio'] = st_df['dose1']/st_df['TOT_P']
st_df['vaccinateddose2ratio'] = st_df['dose2']/st_df['TOT_P']
st_df = st_df[['State_Code', 'vaccinateddose1ratio', 'vaccinateddose2ratio']]
st_df.rename(columns={'State_Code':'stateid'}, inplace=True)
st_df = st_df.sort_values('vaccinateddose1ratio')
st_df = st_df.set_index('stateid')
state_ratio = st_df.copy()
state_ratio

Unnamed: 0_level_0,vaccinateddose1ratio,vaccinateddose2ratio
stateid,Unnamed: 1_level_1,Unnamed: 2_level_1
DL,0.038602,0.019068
WB,0.208623,0.077521
AP,0.218917,0.076675
UP,0.223232,0.042565
BR,0.243402,0.046315
ML,0.25967,0.071237
JH,0.267223,0.06448
TN,0.275305,0.062418
PB,0.28598,0.085877
MH,0.298834,0.105633


In [59]:
overall_df = overall_pop_data.copy()
overall_df['vaccinateddose1ratio'] = overall_df['dose1']/overall_df['TOT_P']
overall_df['vaccinateddose2ratio'] = overall_df['dose2']/overall_df['TOT_P']
overall_df = overall_df[['Name', 'vaccinateddose1ratio', 'vaccinateddose2ratio']]
overall_df.rename(columns={'Name':'id'}, inplace=True)
overall_df = overall_df.set_index('id')
overall_ratio = overall_df.copy()
overall_ratio

Unnamed: 0_level_0,vaccinateddose1ratio,vaccinateddose2ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1
India,0.309987,0.086562


In [60]:
district_ratio.to_csv('../output/district-vaccinated-dose-ratio.csv')

In [61]:
state_ratio.to_csv('../output/state-vaccinated-dose-ratio.csv')

In [62]:
overall_ratio.to_csv('../output/overall-vaccinated-dose-ratio.csv')