### Q8. (10 marks) For each state, district and overall, find the following ratio: total number of persons vaccinated (both 1 and 2 doses) to total population. (If a district is absent in 2011 census, drop it from analysis.) Output them in the following manner: districtid, vaccinateddose1ratio, vaccinateddose2ratio. Call this output file vaccinated-dose-ratio.csv and the script/program to generate this vaccinated-ratio-generator.sh. Sort the output by the dose-1 ratio.

# 1. Importing the necessary libraries

In [1]:
import json
import numpy as np
import pandas as pd
import datetime
from dateutil import relativedelta  # used for handling dates and doing relative arithmetic

# 2. Load the cleaned vaccination data (done in Q1)

In [2]:
# Load the cleaned district-wise vaccination data.
# Every column is read with the pandas 'string' dtype; numeric columns are converted below.
cowin_vaccine_data_districtwise = pd.read_csv('./dataset/cowin_vaccine_data_districtwise_clean.csv', dtype='string')

# Convert the columns from position 4 onwards to numeric wherever that is possible.
# A column that cannot be parsed as numbers is kept unchanged. This is the behaviour the
# deprecated `errors='ignore'` option of pd.to_numeric used to provide, spelled out explicitly.
for vaccine_column in cowin_vaccine_data_districtwise.columns[4:]:
    try:
        cowin_vaccine_data_districtwise[vaccine_column] = pd.to_numeric(
            cowin_vaccine_data_districtwise[vaccine_column])
    except (ValueError, TypeError):
        # not a numeric column - leave the original string values in place
        pass
print('The datatypes of columns containing numeric values has been changed from string to numeric')

The datatypes of columns containing numeric values has been changed from string to numeric


# 3. Load the cleaned census data (done in Q1)

In [3]:
# Load the cleaned census data; every column is read with the pandas 'string' dtype.
census_data = pd.read_csv('./dataset/census_data_clean.csv', dtype='string')
print(census_data.head())

# Convert the columns from position 3 onwards (TOT_P, TOT_M, TOT_F in the preview above)
# to numeric wherever that is possible.
# A column that cannot be parsed as numbers is kept unchanged. This is the behaviour the
# deprecated `errors='ignore'` option of pd.to_numeric used to provide, spelled out explicitly.
for census_column in census_data.columns[3:]:
    try:
        census_data[census_column] = pd.to_numeric(census_data[census_column])
    except (ValueError, TypeError):
        # not a numeric column - leave the original string values in place
        pass
print('The datatypes of columns containing numeric values has been changed from string to numeric')

  State     Level               Name       TOT_P      TOT_M      TOT_F
0    00     India              India  1210854977  623270258  587584719
1    01     STATE  Jammu and Kashmir    12541302    6640662    5900640
2    01  DISTRICT            kupwara      870354     474190     396164
3    01  DISTRICT             budgam      753745     398041     355704
4    01  DISTRICT                leh      133487      78971      54516
The datatypes of columns containing numeric values has been changed from string to numeric


# 4. Find the common districts between census data and vaccine data

In [4]:
# Distinct district names in the vaccination data, lower-cased for comparison
vaccine_district_values = cowin_vaccine_data_districtwise['District'].dropna().unique()
district_names_from_vaccine_data = [name.lower() for name in vaccine_district_values]
print('Number of unique districts in vaccine data =', len(district_names_from_vaccine_data))

# Distinct district names in the census data (rows whose Level is 'DISTRICT'), lower-cased
is_census_district_row = census_data['Level'] == 'DISTRICT'
census_district_values = census_data.loc[is_census_district_row, 'Name'].dropna().unique()
district_names_from_census_data = [name.lower() for name in census_district_values]
print('Number of unique districts in district census data =', len(district_names_from_census_data))

# Districts present in both datasets.
# NOTE(review): the comparison uses the district name only, not the state - districts that
# share a name across states cannot be told apart here; verify against the data.
common_districts_vaccine_and_census = set(district_names_from_census_data) & set(district_names_from_vaccine_data)
print('There are', len(common_districts_vaccine_and_census), 'districts common between the vaccine data and district census data')

Number of unique districts in vaccine data = 714
Number of unique districts in district census data = 625
There are 620 districts common between the vaccine data and district census data


# 5. Prepare district-vaccinated-dose-ratio.csv

In [5]:
# Build the district-level table (districtid, vaccinateddose1ratio, vaccinateddose2ratio)
# for every district name found in both the vaccination data and the census data,
# then write it to ./output/district-vaccinated-dose-ratio.csv sorted by the dose-1 ratio.

# The totals are read from the columns of the last date of interest (14 August 2021).
# NOTE(review): this presumes the per-date columns hold cumulative counts - confirm
# against the cleaning step done in Q1.
end_date = '14/08/2021'
first_dose_column = end_date + '-' + 'First Dose Administered'
second_dose_column = end_date + '-' + 'Second Dose Administered'

# Loop-invariant lookups, computed once instead of once per district
vaccine_district_names = cowin_vaccine_data_districtwise['District'].str.lower()
census_district_rows = census_data[census_data['Level'] == 'DISTRICT']
census_district_names = census_district_rows['Name'].str.lower()

# Rows are collected in a plain list and turned into a DataFrame once at the end,
# which avoids growing the DataFrame one row at a time.
district_ratio_rows = []

# sorted() gives a reproducible processing order (the iteration order of a set of strings is not stable)
for district in sorted(common_districts_vaccine_and_census):

    # vaccination rows and census rows for this district
    # NOTE(review): matching is by name only; when a name matches several rows,
    # only the first matching row of each dataset is used below - verify this is acceptable.
    district_data = cowin_vaccine_data_districtwise[vaccine_district_names == district]
    population_data = census_district_rows[census_district_names == district]

    # identifier written to the 'districtid' column
    district_key = district_data['District_Key'].values[0]

    # total first and second doses administered in this district
    total_first_dose_administered = district_data[first_dose_column].values[0]
    total_second_dose_administered = district_data[second_dose_column].values[0]

    # population of this district
    total_population = population_data['TOT_P'].values[0]

    # put NaN when the population is missing or zero, so no division error occurs
    if pd.isna(total_population) or total_population == 0:
        vaccinated_dose_1_ratio = float('NaN')
        vaccinated_dose_2_ratio = float('NaN')
    else:
        vaccinated_dose_1_ratio = total_first_dose_administered / total_population
        vaccinated_dose_2_ratio = total_second_dose_administered / total_population

    district_ratio_rows.append([district_key, vaccinated_dose_1_ratio, vaccinated_dose_2_ratio])

district_vaccinated_dose_ratio = pd.DataFrame(
    district_ratio_rows,
    columns=['districtid', 'vaccinateddose1ratio', 'vaccinateddose2ratio'])

# dump data to csv files
district_vaccinated_dose_ratio = district_vaccinated_dose_ratio.sort_values('vaccinateddose1ratio')
district_vaccinated_dose_ratio.to_csv('./output/district-vaccinated-dose-ratio.csv', index=False)
district_vaccinated_dose_ratio.head()

Unnamed: 0,districtid,vaccinateddose1ratio,vaccinateddose2ratio
602,TG_Adilabad,0.037492,0.012564
370,TG_Mahabubnagar,0.040173,0.015808
398,MN_Senapati,0.052756,0.010348
522,TG_Medak,0.057857,0.018973
144,TN_Kancheepuram,0.076711,0.01172


# 6. Find the common states between census data and vaccine data

In [6]:
# Distinct state names in the vaccination data, lower-cased for comparison
vaccine_state_values = cowin_vaccine_data_districtwise['State'].dropna().unique()
state_names_from_vaccine_data = [name.lower() for name in vaccine_state_values]
print('Number of unique state in vaccine data =', len(state_names_from_vaccine_data))

# Distinct state names in the census data (rows whose Level is 'STATE'), lower-cased
is_census_state_row = census_data['Level'] == 'STATE'
census_state_values = census_data.loc[is_census_state_row, 'Name'].dropna().unique()
state_names_from_census_data = [name.lower() for name in census_state_values]
print('Number of unique state in census data =', len(state_names_from_census_data))

# States present in both datasets
common_states_vaccine_and_census = set(state_names_from_census_data) & set(state_names_from_vaccine_data)
print('There are', len(common_states_vaccine_and_census), 'states common between the vaccine data and census data')

Number of unique state in vaccine data = 36
Number of unique state in census data = 35
There are 35 states common between the vaccine data and census data


# 7. Prepare state-vaccinated-dose-ratio.csv

In [7]:
# Build the state-level table (stateid, vaccinateddose1ratio, vaccinateddose2ratio)
# for every state name found in both the vaccination data and the census data,
# then write it to ./output/state-vaccinated-dose-ratio.csv sorted by the dose-1 ratio.

# The totals are read from the columns of the last date of interest (14 August 2021).
# NOTE(review): this presumes the per-date columns hold cumulative counts - confirm
# against the cleaning step done in Q1.
end_date = '14/08/2021'
first_dose_column = end_date + '-' + 'First Dose Administered'
second_dose_column = end_date + '-' + 'Second Dose Administered'

# Loop-invariant lookups, computed once instead of once per state
vaccine_state_names = cowin_vaccine_data_districtwise['State'].str.lower()
census_state_rows = census_data[census_data['Level'] == 'STATE']
census_state_names = census_state_rows['Name'].str.lower()

# Rows are collected in a plain list and turned into a DataFrame once at the end,
# which avoids growing the DataFrame one row at a time.
state_ratio_rows = []

# sorted() gives a reproducible processing order (the iteration order of a set of strings is not stable)
for state in sorted(common_states_vaccine_and_census):

    # all vaccination rows of this state, and the census row for this state
    state_data = cowin_vaccine_data_districtwise[vaccine_state_names == state]
    population_data = census_state_rows[census_state_names == state]

    # state code written to the 'stateid' column (taken from the first row of the state)
    state_code = state_data.iloc[0]['State_Code']

    # total first and second doses administered in this state, summed over its rows.
    # skipna=False keeps a missing value visible in the total instead of silently dropping it,
    # which is also what summing the column with Python's builtin sum() did.
    total_first_dose_administered = state_data[first_dose_column].sum(skipna=False)
    total_second_dose_administered = state_data[second_dose_column].sum(skipna=False)

    # population of this state
    total_population = population_data['TOT_P'].values[0]

    # put NaN when the population is missing or zero, so no division error occurs
    if pd.isna(total_population) or total_population == 0:
        vaccinated_dose_1_ratio = float('NaN')
        vaccinated_dose_2_ratio = float('NaN')
    else:
        vaccinated_dose_1_ratio = total_first_dose_administered / total_population
        vaccinated_dose_2_ratio = total_second_dose_administered / total_population

    state_ratio_rows.append([state_code, vaccinated_dose_1_ratio, vaccinated_dose_2_ratio])

state_vaccinated_dose_ratio = pd.DataFrame(
    state_ratio_rows,
    columns=['stateid', 'vaccinateddose1ratio', 'vaccinateddose2ratio'])

# dump data to csv files
state_vaccinated_dose_ratio = state_vaccinated_dose_ratio.sort_values('vaccinateddose1ratio')
state_vaccinated_dose_ratio.to_csv('./output/state-vaccinated-dose-ratio.csv', index=False)
state_vaccinated_dose_ratio.head()

Unnamed: 0,stateid,vaccinateddose1ratio,vaccinateddose2ratio
6,UP,0.241468,0.045944
2,BR,0.241763,0.046679
5,JH,0.2667,0.064718
19,WB,0.271873,0.102865
16,TN,0.301442,0.067908


# 8. Prepare overall-vaccinated-dose-ratio.csv

In [8]:
# Build the one-row overall table (overallid, vaccinateddose1ratio, vaccinateddose2ratio)
# for India and write it to ./output/overall-vaccinated-dose-ratio.csv.

# The totals are read from the columns of the last date of interest (14 August 2021).
# NOTE(review): this presumes the per-date columns hold cumulative counts - confirm
# against the cleaning step done in Q1.
end_date = '14/08/2021'

# total first and second doses administered in India: sum over every row of the vaccination data.
# skipna=False keeps a missing value visible in the total instead of silently dropping it,
# which is also what summing the column with Python's builtin sum() did.
total_first_dose_administered = cowin_vaccine_data_districtwise[end_date + '-' + 'First Dose Administered'].sum(skipna=False)
total_second_dose_administered = cowin_vaccine_data_districtwise[end_date + '-' + 'Second Dose Administered'].sum(skipna=False)

# population of India: the census row whose Level is 'India'
total_population = census_data[census_data['Level'] == 'India']['TOT_P'].values[0]

# put NaN when the population is missing or zero, so no division error occurs
if pd.isna(total_population) or total_population == 0:
    vaccinated_dose_1_ratio = float('NaN')
    vaccinated_dose_2_ratio = float('NaN')
else:
    vaccinated_dose_1_ratio = total_first_dose_administered / total_population
    vaccinated_dose_2_ratio = total_second_dose_administered / total_population

# single-row result, built directly instead of appending to an empty DataFrame
overall_vaccinated_dose_ratio = pd.DataFrame(
    [['India', vaccinated_dose_1_ratio, vaccinated_dose_2_ratio]],
    columns=['overallid', 'vaccinateddose1ratio', 'vaccinateddose2ratio'])

# dump data to csv files
overall_vaccinated_dose_ratio.to_csv('./output/overall-vaccinated-dose-ratio.csv', index=False)
overall_vaccinated_dose_ratio.head()

Unnamed: 0,overallid,vaccinateddose1ratio,vaccinateddose2ratio
0,India,0.346605,0.098849


--------------------------------------------------------------------------------- END of Q8 ---------------------------------------------------------------------------------------------