### Question 5. (10 marks) Find the number of people vaccinated with 1 or 2 doses of any vaccine, and sort the output file with district id and state id. Output this for all districts and all states weekly, monthly and overall in the following manner: districtid, timeid, dose1, dose2. Call this output file vaccinated-count-time.csv and the script/program to generate this vaccinated-count-generator.sh where time is week, month, and overall.


# 1. Importing the necessary libraries

In [1]:
import json
import numpy as np
import pandas as pd
import datetime
from dateutil import relativedelta  # used for handling dates and doing relative arithmetic

# 2. Load the cleaned vaccination data (done in Q1)

In [2]:
# Load the cowin_vaccine_data_districtwise.csv
cowin_vaccine_data_districtwise = pd.read_csv('./dataset/cowin_vaccine_data_districtwise_clean.csv', dtype='string')
cowin_vaccine_data_districtwise.head()

# convert number values columns to numeric
cowin_vaccine_data_districtwise.iloc[:, 4:] = cowin_vaccine_data_districtwise.iloc[:, 4:].apply(
                                                                    pd.to_numeric, errors='ignore')
print('The datatypes of columns containing numeric values has been changed from string to numeric')

The datatypes of columns containing numeric values has been changed from string to numeric


# 3. Find the unique district names in the vaccine data

In [3]:
# Find the unique district names in the vaccine data
district_names_from_vaccine_data = cowin_vaccine_data_districtwise['District'].dropna().unique()
district_names_from_vaccine_data = [district_name.lower() for district_name in district_names_from_vaccine_data]
print('Number of unique districts in vaccine data =', len(district_names_from_vaccine_data))

Number of unique districts in vaccine data = 714


# 4. Find vaccination data for each district

The time period of analysis is taken from 10 Jan 2021 to 14 Aug 2021 for the weekly data.

The time period of analysis is taken from 15 Jan 2021 to 14 Aug 2021 for the monthly data.

The time period of analysis is taken from 15 Jan 2021 to 14 Aug 2021 for the overall data.

In [4]:
def vaccinations_between_time(data, start_date, end_date):
    '''
    Helper function to extract the number of vaccinations in a given duration.
    Input: data, start_date, end_date
    Output: vaccinations in this duration for dose1 as well as dose2
    Logic: vaccinations = vaccinations on end_date - vaccinations on the day before start_date
    Note: The data is cumulative.
    '''
    # calculate the day before start date (will be useful since the data is cumulative)
    day_before_start_date = start_date - datetime.timedelta(days=1)
    # change date format to match the format in dataframe
    start_date = start_date.strftime('%d/%m/%Y')
    end_date = end_date.strftime('%d/%m/%Y')
    day_before_start_date = day_before_start_date.strftime('%d/%m/%Y')
    
    try:
        dose1_r = data[end_date + '-' + 'First Dose Administered'].values[0]
    except:
        # assign 0 if the data doesn't exist for that date
        dose1_r = 0
    try:
        dose1_l = data[day_before_start_date + '-' + 'First Dose Administered'].values[0]
    except:
        # assign 0 if the data doesn't exist for that date
        dose1_l = 0
    try:
        dose2_r = data[end_date + '-' + 'Second Dose Administered'].values[0]
    except:
        # assign 0 if the data doesn't exist for that date
        dose2_r = 0
    try:
        dose2_l = data[day_before_start_date + '-' + 'Second Dose Administered'].values[0]
    except:
        # assign 0 if the data doesn't exist for that date
        dose2_l = 0

    return (dose1_r - dose1_l, dose2_r - dose2_l)

In [5]:
# Prepare three files to store the vaccination data for each district weekly, monthly and overall
district_vaccinated_count_week = pd.DataFrame(columns=['districtid', 'weekid', 'dose1', 'dose2'])
district_vaccinated_count_month = pd.DataFrame(columns=['districtid', 'monthid', 'dose1', 'dose2'])
district_vaccinated_count_overall = pd.DataFrame(columns=['districtid', 'overallid', 'dose1', 'dose2'])

for district in list(district_names_from_vaccine_data):
    
    # find the vaccination data for this district
    district_data = cowin_vaccine_data_districtwise[cowin_vaccine_data_districtwise['District'].str.lower() == district]
    
    # find the district_key for this district
    district_key = district_data['District_Key'].values[0]
    
    # define start_date and end_date based on our time period of analysis
    start_date = datetime.datetime.strptime('10/01/2021', '%d/%m/%Y')
    end_date = datetime.datetime.strptime('14/08/2021', '%d/%m/%Y')
    
    # iterate from start_date to end_date with step size of one week
    # a week starts from sunday and runs till saturday
    # week1 is 15/03/2020-21/03/2020 and week 2 is 22/03/2020-28/03/20
    weekid = 1
    while start_date < end_date:
        # the current week ends on saturday (add 6 days to start_date)
        week_end_date = start_date + datetime.timedelta(days=6)
        # calculate the vaccinations for this week using a helper function we defined earlier
        vaccinations = vaccinations_between_time(district_data, start_date, week_end_date)
        # append data to our file
        district_vaccinated_count_week.loc[-1] = [district_key, weekid, vaccinations[0], vaccinations[1]]
        district_vaccinated_count_week.index += 1
        weekid += 1
        # update the start_date for the next week
        start_date = week_end_date + datetime.timedelta(days=1)
    
    # define start_date and end_date based on our time period of analysis
    start_date = datetime.datetime.strptime('15/01/2021', '%d/%m/%Y')
    end_date = datetime.datetime.strptime('14/08/2021', '%d/%m/%Y')
    
    # iterate from start_date to end_date with step size of one month
    # First month is 15/03/2020-14/04/2020
    # Last month is 15/07/2021-14/08/2021
    # Total number of vaccinations in a month = (vaccinations on last date) - (vaccinations on a day before first day)
    monthid = 1
    while start_date < end_date:
        # the current month ends on 14th of next month
        month_end_date = start_date + relativedelta.relativedelta(months=1) - datetime.timedelta(days=1)
        # calculate the vaccinations for this month using a helper function we defined earlier
        vaccinations = vaccinations_between_time(district_data, start_date, month_end_date)
        # append data to our file
        district_vaccinated_count_month.loc[-1] = [district_key, monthid, vaccinations[0], vaccinations[1]]
        district_vaccinated_count_month.index += 1
        # update the start_date for the next month
        start_date = month_end_date + datetime.timedelta(days=1)
        monthid += 1
        
    # define start_date and end_date based on our time period of analysis
    start_date = datetime.datetime.strptime('15/01/2021', '%d/%m/%Y')
    end_date = datetime.datetime.strptime('14/08/2021', '%d/%m/%Y')
    # change date format to match the format in dataframe
    start_date = start_date.strftime('%d/%m/%Y')
    end_date = end_date.strftime('%d/%m/%Y')
    
    # calculate overall vaccinations for this district
    vaccinations = district_data[end_date + '-' + 'First Dose Administered'].values[0], district_data[end_date + '-' + 'Second Dose Administered'].values[0]
    
    # append data to our file
    district_vaccinated_count_overall.loc[-1] = [district_key, 'overall', vaccinations[0], vaccinations[1]]
    district_vaccinated_count_overall.index += 1

# dump the data to csv files
district_vaccinated_count_week = district_vaccinated_count_week.sort_values('districtid')
district_vaccinated_count_month = district_vaccinated_count_month.sort_values('districtid')
district_vaccinated_count_overall = district_vaccinated_count_overall.sort_values('districtid')
district_vaccinated_count_week.to_csv('./output/district-vaccinated-count-week.csv', index=False)
district_vaccinated_count_month.to_csv('./output/district-vaccinated-count-month.csv', index=False)
district_vaccinated_count_overall.to_csv('./output/district-vaccinated-count-overall.csv', index=False)
print(district_vaccinated_count_week.head())
print(district_vaccinated_count_month.head())
print(district_vaccinated_count_overall.head())

        districtid weekid dose1 dose2
22133  AN_Nicobars      1     0     0
22103  AN_Nicobars     31   163   113
22104  AN_Nicobars     30   465    85
22105  AN_Nicobars     29  1048   286
22106  AN_Nicobars     28  1636   653
       districtid monthid dose1 dose2
4997  AN_Nicobars       1   360     0
4996  AN_Nicobars       2   387   304
4995  AN_Nicobars       3  4893   227
4994  AN_Nicobars       4  2713  1356
4993  AN_Nicobars       5   429    19
                      districtid overallid    dose1   dose2
713                  AN_Nicobars   overall    22081    8110
712  AN_North and Middle Andaman   overall    68043   29281
711             AN_South Andaman   overall   142585   59896
710                 AP_Anantapur   overall  1364374  597346
709                  AP_Chittoor   overall  1565591  640864


# 5. Find the unique states in vaccine data

In [6]:
# Find the unique state names in the vaccine data
state_names_from_vaccine_data = cowin_vaccine_data_districtwise['State'].dropna().unique()
state_names_from_vaccine_data = [state_name.lower() for state_name in state_names_from_vaccine_data]
print('Number of unique state in vaccine data =', len(state_names_from_vaccine_data))

Number of unique state in vaccine data = 36


# 6. Find vaccination data for each state

The time period of analysis is taken from 10 Jan 2021 to 14 Aug 2021 for the weekly data.

The time period of analysis is taken from 15 Jan 2021 to 14 Aug 2021 for the monthly data.

The time period of analysis is taken from 15 Jan 2021 to 14 Aug 2021 for the overall data.

In [7]:
def vaccinations_between_time_for_series(data, start_date, end_date):
    '''
    Helper function to extract the number of vaccinations in a given duration.
    Input: data, start_date, end_date
    Output: vaccinations in this duration for dose1 as well as dose2
    Logic: vaccinations = vaccinations on end_date - vaccinations on the day before start_date
    Note: The data is cumulative.
    '''
    # calculate the day before start date (will be useful since the data is cumulative)
    day_before_start_date = start_date - datetime.timedelta(days=1)
    # change date format to match the format in dataframe
    start_date = start_date.strftime('%d/%m/%Y')
    end_date = end_date.strftime('%d/%m/%Y')
    day_before_start_date = day_before_start_date.strftime('%d/%m/%Y')
    
    try:
        dose1_r = sum(data[end_date + '-' + 'First Dose Administered'])
    except:
        # assign 0 if the data doesn't exist for that date
        dose1_r = 0
    try:
        dose1_l = sum(data[day_before_start_date + '-' + 'First Dose Administered'])
    except:
        # assign 0 if the data doesn't exist for that date
        dose1_l = 0
    try:
        dose2_r = sum(data[end_date + '-' + 'Second Dose Administered'])
    except:
        # assign 0 if the data doesn't exist for that date
        dose2_r = 0
    try:
        dose2_l = sum(data[day_before_start_date + '-' + 'Second Dose Administered'])
    except:
        # assign 0 if the data doesn't exist for that date
        dose2_l = 0

    return (dose1_r - dose1_l, dose2_r - dose2_l)

In [8]:
# Prepare three files to store the vaccination data for each state weekly, monthly and overall
state_vaccinated_count_week = pd.DataFrame(columns=['stateid', 'weekid', 'dose1', 'dose2'])
state_vaccinated_count_month = pd.DataFrame(columns=['stateid', 'monthid', 'dose1', 'dose2'])
state_vaccinated_count_overall = pd.DataFrame(columns=['stateid', 'overallid', 'dose1', 'dose2'])

for state in list(state_names_from_vaccine_data):
    
    # find the vaccination data for this state
    state_data = cowin_vaccine_data_districtwise[cowin_vaccine_data_districtwise['State'].str.lower() == state]
    
    # find the state_code for this state
    state_code = state_data['State_Code'].values[0]
    
    # define start_date and end_date based on our time period of analysis
    start_date = datetime.datetime.strptime('10/01/2021', '%d/%m/%Y')
    end_date = datetime.datetime.strptime('14/08/2021', '%d/%m/%Y')
    
    # iterate from start_date to end_date with step size of one week
    # a week starts from sunday and runs till saturday
    # week1 is 15/03/2020-21/03/2020 and week 2 is 22/03/2020-28/03/20
    weekid = 1
    while start_date < end_date:
        # the current week ends on saturday (add 6 days to start_date)
        week_end_date = start_date + datetime.timedelta(days=6)
        # calculate the vaccinations for this week using a helper function we defined earlier
        vaccinations = vaccinations_between_time(state_data, start_date, week_end_date)
        # append data to our file
        state_vaccinated_count_week.loc[-1] = [state_code, weekid, vaccinations[0], vaccinations[1]]
        state_vaccinated_count_week.index += 1
        weekid += 1
        # update the start_date for the next week
        start_date = week_end_date + datetime.timedelta(days=1)
    
    # define start_date and end_date based on our time period of analysis
    start_date = datetime.datetime.strptime('15/01/2021', '%d/%m/%Y')
    end_date = datetime.datetime.strptime('14/08/2021', '%d/%m/%Y')
    
    # iterate from start_date to end_date with step size of one month
    # First month is 15/03/2020-14/04/2020
    # Last month is 15/07/2021-14/08/2021
    # Total number of vaccinations in a month = (vaccinations on last date) - (vaccinations on a day before first day)
    monthid = 1
    while start_date < end_date:
        # the current month ends on 14th of next month
        month_end_date = start_date + relativedelta.relativedelta(months=1) - datetime.timedelta(days=1)
        # calculate the vaccinations for this month using a helper function we defined earlier
        vaccinations = vaccinations_between_time(state_data, start_date, month_end_date)
        # append data to our file
        state_vaccinated_count_month.loc[-1] = [state_code, monthid, vaccinations[0], vaccinations[1]]
        state_vaccinated_count_month.index += 1
        # update the start_date for the next month
        start_date = month_end_date + datetime.timedelta(days=1)
        monthid += 1
        
    # define start_date and end_date based on our time period of analysis
    start_date = datetime.datetime.strptime('15/01/2021', '%d/%m/%Y')
    end_date = datetime.datetime.strptime('14/08/2021', '%d/%m/%Y')
    # change date format to match the format in dataframe
    start_date = start_date.strftime('%d/%m/%Y')
    end_date = end_date.strftime('%d/%m/%Y')
    
    # calculate overall vaccinations for this district
    vaccinations = sum(state_data[end_date + '-' + 'First Dose Administered']), sum(state_data[end_date + '-' + 'Second Dose Administered'])
    
    # append data to our file
    state_vaccinated_count_overall.loc[-1] = [state_code, 'overall', vaccinations[0], vaccinations[1]]
    state_vaccinated_count_overall.index += 1

# dump the data to csv files
state_vaccinated_count_week = state_vaccinated_count_week.sort_values('stateid')
state_vaccinated_count_month = state_vaccinated_count_month.sort_values('stateid')
state_vaccinated_count_overall = state_vaccinated_count_overall.sort_values('stateid')
state_vaccinated_count_week.to_csv('./output/state-vaccinated-count-week.csv', index=False)
state_vaccinated_count_month.to_csv('./output/state-vaccinated-count-month.csv', index=False)
state_vaccinated_count_overall.to_csv('./output/state-vaccinated-count-overall.csv', index=False)
print(state_vaccinated_count_week.head())
print(state_vaccinated_count_month.head())
print(state_vaccinated_count_overall.head())

     stateid weekid dose1 dose2
1115      AN      1     0     0
1085      AN     31   163   113
1086      AN     30   465    85
1087      AN     29  1048   286
1088      AN     28  1636   653
    stateid monthid dose1 dose2
251      AN       1   360     0
250      AN       2   387   304
249      AN       3  4893   227
248      AN       4  2713  1356
247      AN       5   429    19
   stateid overallid     dose1    dose2
35      AN   overall    232709    97287
34      AP   overall  18516141  6485212
33      AR   overall    700977   199737
32      AS   overall  11430053  2442900
31      BR   overall  25167378  4859267


--------------------------------------------------------------------------------- END of Q5 ---------------------------------------------------------------------------------------------