### Question 3. (10 marks) For every district i, find the number of cases from the Covid-19 portal. Take the time-period of analysis from 15th March, 2020 to 14th August, 2021. Output the total number of cases per week for every district in the following manner: districtid, timeid, cases, where timeid is the id of the time (week/month/overall) starting from 1. Call this output file cases-time.csv and the script/program to generate this case-generator.sh where time is week, month, and overall.

# 1. Importing the necessary libraries

In [1]:
import json
import numpy as np
import pandas as pd
import datetime
from dateutil import relativedelta  # used for handling dates and doing relative arithmetic

# 2. Load districts cases data and clean

In [2]:
# Read the district-wise cases file; every column is kept as text for now
districts_csv_path = './dataset/districts.csv'
districts_cases = pd.read_csv(districts_csv_path, dtype='string')
# preview the first five rows
districts_cases.head(5)

Unnamed: 0,Date,State,District,Confirmed,Recovered,Deceased,Other,Tested
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0,0,
1,2020-04-26,Andhra Pradesh,Anantapur,53,14,4,0,
2,2020-04-26,Andhra Pradesh,Chittoor,73,13,0,0,
3,2020-04-26,Andhra Pradesh,East Godavari,39,12,0,0,
4,2020-04-26,Andhra Pradesh,Guntur,214,29,8,0,


## (i) Drop the columns which are not required

In [3]:
# keep only the columns needed for counting confirmed cases
columns_not_required = ['Recovered', 'Deceased', 'Other', 'Tested']
districts_cases = districts_cases.drop(columns=columns_not_required)
# preview the first five rows of the trimmed dataframe
districts_cases.head(5)

Unnamed: 0,Date,State,District,Confirmed
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33
1,2020-04-26,Andhra Pradesh,Anantapur,53
2,2020-04-26,Andhra Pradesh,Chittoor,73
3,2020-04-26,Andhra Pradesh,East Godavari,39
4,2020-04-26,Andhra Pradesh,Guntur,214


## (ii) Convert the 'Confirmed' column to numeric type

In [4]:
# 'Confirmed' was read as a string column; Series.apply calls pd.to_numeric on
# each value in turn, forwarding errors='ignore' so that a value which cannot
# be parsed is returned unchanged instead of raising.
# NOTE(review): errors='ignore' is deprecated in recent pandas releases -
# confirm against the installed pandas version before upgrading.
districts_cases['Confirmed'] = districts_cases['Confirmed'].apply(pd.to_numeric, errors='ignore')
# print the dtype of every column to check the result of the conversion
print(districts_cases.dtypes)

Date         string
State        string
District     string
Confirmed     int64
dtype: object


# 3. Load the cleaned vaccination data (done in Q1)

In [5]:
# Load the cowin_vaccine_data_districtwise.csv
# (every column is read as a string)
cowin_vaccine_data_districtwise = pd.read_csv('./dataset/cowin_vaccine_data_districtwise_clean.csv', dtype='string')
# NOTE: this is not the last expression of the cell, so in a notebook the
# preview is computed but not displayed
cowin_vaccine_data_districtwise.head()

# convert number values columns to numeric
# Only the columns from position 4 onwards are converted; presumably the first
# four are identifier/text columns - verify against the cleaned file from Q1.
# DataFrame.apply passes each column to pd.to_numeric; errors='ignore' returns a
# column unchanged when it cannot be parsed.
# NOTE(review): whether assigning through .iloc actually changes the dtype of
# string columns depends on the pandas version - confirm with .dtypes.
cowin_vaccine_data_districtwise.iloc[:, 4:] = cowin_vaccine_data_districtwise.iloc[:, 4:].apply(
                                                                    pd.to_numeric, errors='ignore')
print('The datatypes of columns containing numeric values has been changed from string to numeric')

The datatypes of columns containing numeric values has been changed from string to numeric


# 4. Find the common district names between vaccine data and cases data

In [6]:
# Unique district names in the vaccine data, lower-cased so that the two
# datasets can be matched case-insensitively
district_names_from_vaccine_data = [
    name.lower()
    for name in cowin_vaccine_data_districtwise['District'].dropna().unique()
]
print('Number of unique districts in vaccine data =', len(district_names_from_vaccine_data))

# Same treatment for the district names of the districts cases data
district_names_from_districts_cases = [
    name.lower()
    for name in districts_cases['District'].dropna().unique()
]
print('Number of unique districts in cases data =', len(district_names_from_districts_cases))

# Districts whose (lower-cased) name appears in both datasets
common_districts_vaccine_and_cases = (
    set(district_names_from_districts_cases) & set(district_names_from_vaccine_data)
)
print('There are', len(common_districts_vaccine_and_cases), 'districts common between the vaccine data and cases data')

Number of unique districts in vaccine data = 714
Number of unique districts in cases data = 643
There are 626 districts common between the vaccine data and cases data


# 5. Prepare the output files

1. The time period of analysis is from 15th March, 2020 to 14th August, 2021

2. Prepare cases-overall.csv
    - The file 'cases-overall.csv' will contain the overall cases for each district.
3. Prepare cases-month.csv
    - The file 'cases-month.csv' will contain the monthly cases for each district.
    - A month starts on 15th and ends on 14th of next month.
4. Prepare cases-week.csv
    - The file 'cases-week.csv' will contain the weekly cases for each district.
    - A week starts on Sunday and ends on Saturday

In [7]:
def cases_between_time(data, start_date, end_date):
    '''
    Helper function to extract the number of cases that arise in a given duration.
    Input:
        data: dataframe with a 'Date' column holding 'YYYY-MM-DD' strings and a
              'Confirmed' column holding cumulative case counts
        start_date: first day of the duration (datetime, inclusive)
        end_date: last day of the duration (datetime, inclusive)
    Output: cases in this duration
    Logic: cases = cases on end_date - cases on the day before start_date
    Note: The data is cumulative. A date that has no row in data counts as 0,
          so the result can be negative when only the end_date row is missing.
    Raises: KeyError if data has no 'Date' or 'Confirmed' column (previously
            this was silently reported as 0 cases).
    '''
    def cumulative_cases_on(day):
        # change date format to match the format in dataframe
        day_as_text = day.strftime('%Y-%m-%d')
        confirmed_on_day = data[data['Date'] == day_as_text]['Confirmed']
        # assign 0 if the data doesn't exist for that date
        if confirmed_on_day.empty:
            return 0
        # when several rows share the date, the first one is used
        return confirmed_on_day.values[0]

    # the day before start date is needed since the data is cumulative
    day_before_start_date = start_date - datetime.timedelta(days=1)
    return cumulative_cases_on(end_date) - cumulative_cases_on(day_before_start_date)

In [8]:
# Build cases-overall.csv, cases-month.csv and cases-week.csv for every
# district that is present in both the vaccine data and the cases data.
# The 'Confirmed' column is cumulative, so:
#   - overall cases          = cumulative count on the last day of the analysis
#   - cases in a month/week  = (cases on last date) - (cases on a day before first day)

# Time period of analysis (both ends inclusive)
analysis_start = datetime.datetime.strptime('15/03/2020', '%d/%m/%Y')
analysis_end = datetime.datetime.strptime('14/08/2021', '%d/%m/%Y')
one_day = datetime.timedelta(days=1)

# The month and week windows are identical for every district, so they are
# built once here instead of once per district.

# First month is 15/03/2020-14/04/2020
# Last month is 15/07/2021-14/08/2021
month_windows = []
start_date = analysis_start
while start_date < analysis_end:
    # the current month ends on 14th of next month
    month_end_date = start_date + relativedelta.relativedelta(months=1) - one_day
    month_windows.append((start_date, month_end_date))
    # the next month starts on the following day
    start_date = month_end_date + one_day

# First week is 15/03/2020-21/03/2020
# Last week is 08/08/2021-14/08/2021
week_windows = []
start_date = analysis_start
while start_date < analysis_end:
    # the current week ends on saturday (add 6 days to start_date)
    week_end_date = start_date + datetime.timedelta(days=6)
    week_windows.append((start_date, week_end_date))
    # the next week starts on the following day
    start_date = week_end_date + one_day

# Lower-case the district columns once; they are compared against every
# common district name inside the loop.
# NOTE(review): districts are matched on their lower-cased name only, so two
# districts that share a name would be mixed together - verify against the data.
vaccine_district_names = cowin_vaccine_data_districtwise['District'].str.lower()
cases_district_names = districts_cases['District'].str.lower()

# last day of the analysis in the date format used by the dataframe
overall_end_date = analysis_end.strftime('%Y-%m-%d')

# Rows are collected in plain lists and turned into dataframes once at the
# end; growing a dataframe one row at a time copies it on every insertion.
overall_rows = []
month_rows = []
week_rows = []

for district in common_districts_vaccine_and_cases:

    # find the district_key for this district (first match in the vaccine data)
    district_key = cowin_vaccine_data_districtwise[vaccine_district_names == district]['District_Key'].values[0]

    # find the data for this district
    district_data = districts_cases[cases_district_names == district]

    ##### Calculate overall cases #####

    # since the data is cumulative, the count on the last day is the overall
    # count for the district (.values[0] raises IndexError if the district has
    # no row for that day)
    cases = district_data[district_data['Date'] == overall_end_date]['Confirmed'].values[0]
    overall_rows.append([district_key, 'overall', cases])

    ##### Calculate monthly cases #####

    for monthid, (start_date, month_end_date) in enumerate(month_windows, start=1):
        cases = cases_between_time(district_data, start_date, month_end_date)
        month_rows.append([district_key, monthid, cases])

    ##### Calculate weekly cases #####

    for weekid, (start_date, week_end_date) in enumerate(week_windows, start=1):
        cases = cases_between_time(district_data, start_date, week_end_date)
        week_rows.append([district_key, weekid, cases])

cases_overall = pd.DataFrame(overall_rows, columns=['districtid', 'overallid', 'cases'])
cases_month = pd.DataFrame(month_rows, columns=['districtid', 'monthid', 'cases'])
cases_week = pd.DataFrame(week_rows, columns=['districtid', 'weekid', 'cases'])

# dump the data to csv files
# a stable sort keeps the month/week ids of each district in increasing order
cases_overall = cases_overall.sort_values('districtid', kind='mergesort')
cases_month = cases_month.sort_values('districtid', kind='mergesort')
cases_week = cases_week.sort_values('districtid', kind='mergesort')
cases_overall.to_csv('./output/cases-overall.csv', index=False)
cases_month.to_csv('./output/cases-month.csv', index=False)
cases_week.to_csv('./output/cases-week.csv', index=False)

--------------------------------------------------------------------------------- END of Q3 ---------------------------------------------------------------------------------------------