In [1]:
import numpy as np
import pandas as pd
import json
from datetime import timedelta
pd.options.mode.chained_assignment = None

## Problem statement
For every district i, find the number of cases from the Covid-19 portal. Take the time-period of analysis from 15th March, 2020 to 14th August, 2021. Output the total number of cases per week for every district in the following manner: districtid, timeid, cases, where timeid is the id of the time (week/month/overall) starting from 1.

In [2]:
#Using previous results to get all the districts which has to be considered for this assignment
distr_data = pd.read_csv('../output/edge-graph.csv', index_col='District_Key')
distr_data.drop(columns=['Unnamed: 0'], inplace=True)
distr_data

Unnamed: 0_level_0,edge_1,edge_2,edge_3,edge_4,edge_5,edge_6,edge_7,edge_8,edge_9,edge_10,edge_11,edge_12,edge_13,edge_14
District_Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AP_Anantapur,KA_Tumakuru,KA_Ballari,KA_Chitradurga,AP_Y.S.R. Kadapa,AP_Kurnool,AP_Chittoor,KA_Chikkaballapura,,,,,,,
AP_Chittoor,KA_Kolar,TN_Thiruvallur,AP_Y.S.R. Kadapa,TN_Krishnagiri,AP_Anantapur,AP_S.P.S. Nellore,KA_Chikkaballapura,TN_Vellore,,,,,,
AP_East Godavari,CT_Sukma,TG_Bhadradri Kothagudem,AP_West Godavari,OR_Malkangiri,AP_Visakhapatnam,,,,,,,,,
AP_Guntur,TG_Nagarkurnool,TG_Suryapet,AP_Krishna,AP_Prakasam,TG_Nalgonda,,,,,,,,,
AP_Krishna,AP_West Godavari,TG_Suryapet,AP_Guntur,TG_Khammam,,,,,,,,,,
AP_Kurnool,KA_Raichur,KA_Ballari,TG_Nagarkurnool,TG_Wanaparthy,AP_Y.S.R. Kadapa,AP_Anantapur,AP_Prakasam,TG_Jogulamba Gadwal,,,,,,
AP_Prakasam,TG_Nagarkurnool,AP_Y.S.R. Kadapa,AP_Kurnool,AP_Guntur,AP_S.P.S. Nellore,,,,,,,,,
AP_S.P.S. Nellore,TN_Thiruvallur,AP_Y.S.R. Kadapa,AP_Chittoor,AP_Prakasam,,,,,,,,,,
AP_Srikakulam,OR_Ganjam,OR_Rayagada,OR_Gajapati,AP_Vizianagaram,,,,,,,,,,
AP_Visakhapatnam,OR_Malkangiri,AP_East Godavari,AP_Vizianagaram,OR_Koraput,,,,,,,,,,


In [3]:
districts_key = distr_data.index
print("Total districts available: ", len(districts_key))

Total districts available:  704


So now we have to find the number of cases during the period 15th March, 2020 to 14th August, 2021. Since 'districts.csv' doesn't contain 'District_Key' where as 'district_wise.csv' contains district keys so we have to map keys with district and state to get datas from 'districts.csv'

In [4]:
dt_wise_covid_data = pd.read_csv('../data/districts.csv', parse_dates=['Date'], index_col='Date')
dt_wise_covid_data = dt_wise_covid_data.loc['2020-03-15':'2021-08-14']
dt_wise_covid_data

Unnamed: 0_level_0,State,District,Confirmed,Recovered,Deceased,Other,Tested
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-26,Andaman and Nicobar Islands,Unknown,33,11,0,0,
2020-04-26,Andhra Pradesh,Anantapur,53,14,4,0,
2020-04-26,Andhra Pradesh,Chittoor,73,13,0,0,
2020-04-26,Andhra Pradesh,East Godavari,39,12,0,0,
2020-04-26,Andhra Pradesh,Guntur,214,29,8,0,
2020-04-26,Andhra Pradesh,Krishna,177,29,8,0,
2020-04-26,Andhra Pradesh,Kurnool,279,31,9,0,
2020-04-26,Andhra Pradesh,Prakasam,56,23,0,0,
2020-04-26,Andhra Pradesh,S.P.S. Nellore,72,23,2,0,
2020-04-26,Andhra Pradesh,Srikakulam,3,0,0,0,


In [5]:
distr_wise_data = pd.read_csv('../data/district_wise.csv')
distr_wise_data

Unnamed: 0,SlNo,State_Code,State,District_Key,District,Confirmed,Active,Recovered,Deceased,Migrated_Other,Delta_Confirmed,Delta_Active,Delta_Recovered,Delta_Deceased,District_Notes,Last_Updated
0,0,UN,State Unassigned,UN_Unassigned,Unassigned,0,0,0,0,0,0,0,0,0,,
1,1,AN,Andaman and Nicobar Islands,AN_Nicobars,Nicobars,0,0,0,0,0,0,0,0,0,District-wise numbers are out-dated as cumulat...,
2,2,AN,Andaman and Nicobar Islands,AN_North and Middle Andaman,North and Middle Andaman,1,0,1,0,0,0,0,0,0,District-wise numbers are out-dated as cumulat...,
3,3,AN,Andaman and Nicobar Islands,AN_South Andaman,South Andaman,51,19,32,0,0,0,0,0,0,District-wise numbers are out-dated as cumulat...,
4,0,AP,Andhra Pradesh,AP_Foreign Evacuees,Foreign Evacuees,434,0,434,0,0,0,0,0,0,,
5,4,AP,Andhra Pradesh,AP_Anantapur,Anantapur,157166,109,155966,1091,0,0,0,0,0,,
6,5,AP,Andhra Pradesh,AP_Chittoor,Chittoor,238338,1969,234521,1848,0,0,0,0,0,,
7,6,AP,Andhra Pradesh,AP_East Godavari,East Godavari,285839,2180,282400,1259,0,0,0,0,0,,
8,7,AP,Andhra Pradesh,AP_Guntur,Guntur,172614,1019,170412,1183,0,0,0,0,0,,
9,8,AP,Andhra Pradesh,AP_Krishna,Krishna,113859,2098,110446,1315,0,0,0,0,0,,


In [6]:
#dt_wise_covid_data.loc['2020-03-15':'2021-08-14']

Since 'districts.csv' contains data from '2020-04-26' onwards. So, I'm including raw data from this api https://data.covid19india.org/ to collect data between 15th March 2020 to 25th April 2020.

In [7]:
raw_data1 = pd.read_csv('../data/covid-raw-data/raw_data1.csv', parse_dates=['Date Announced'])
raw_data2 = pd.read_csv('../data/covid-raw-data/raw_data2.csv', parse_dates=['Date Announced'])

In [8]:
raw_data1 = raw_data1.sort_values(by=['Date Announced'])
raw_data1

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Estimated Onset Date,Age Bracket,Gender,Detected City,Detected District,Detected State,State code,...,Notes,Contracted from which Patient (Suspected),Nationality,Type of transmission,Status Change Date,Source_1,Source_2,Source_3,Backup Notes,Num Cases
1683,1668.0,,2020-01-04,,67,M,,Ahmedabad,Gujarat,GJ,...,Local Transmission,,India,Local,01/04/2020,https://twitter.com/ANI/status/124521978303481...,,,,1
1771,1762.0,TN-P150,2020-01-04,,40,M,Annaimalai,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1772,1763.0,TN-P151,2020-01-04,,34,M,Pollachi,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1773,1764.0,TN-P152,2020-01-04,,36,M,Annaimalai,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1774,1765.0,TN-P153,2020-01-04,,41,M,Annaimalai,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1775,1766.0,TN-P154,2020-01-04,,38,M,Annaimalai,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1776,1767.0,TN-P155,2020-01-04,,39,M,Pollachi,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1777,1768.0,TN-P156,2020-01-04,,40,M,Mettupalayam,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1778,1769.0,TN-P157,2020-01-04,,30,M,Mettupalayam,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1
1779,1770.0,TN-P158,2020-01-04,,45,M,Mettupalayam,Coimbatore,Tamil Nadu,TN,...,Travelled to Delhi,E0,,Local,01/04/2020,https://twitter.com/NHM_TN/status/124533236456...,,,,1


In [9]:
raw_data1 = raw_data1[(raw_data1['Date Announced'] >= '2020-03-15') & (raw_data1['Date Announced'] < '2020-04-20')]
raw_data1

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Estimated Onset Date,Age Bracket,Gender,Detected City,Detected District,Detected State,State code,...,Notes,Contracted from which Patient (Suspected),Nationality,Type of transmission,Status Change Date,Source_1,Source_2,Source_3,Backup Notes,Num Cases
111,113.0,,2020-03-15,,,M,Dehradun,Dehradun,Uttarakhand,UT,...,"Travelled from Spain, Italy and Russia March 11",,India,Imported,15/03/2020,https://www.firstpost.com/health/coronavirus-o...,,,"Probationer travelled to Spain, Italy and Russ...",1
110,112.0,,2020-03-15,,46,,Hyderabad,Hyderabad,Telangana,TG,...,Travelled from Netherlands March 9,,India,Imported,15/03/2020,https://www.businessinsider.in/india/news/coro...,,,Netherlands travel history March 9,1
109,110.0,,2020-03-15,,59,F,Aurangabad,Aurangabad,Maharashtra,MH,...,Travelled from Russia and Kazakhstan,,India,Imported,15/03/2020,https://twitter.com/timesofindia/status/123909...,https://www.indiatoday.in/india/story/coronavi...,,Travel history to Russia and Kazakhstan,1
108,109.0,KL-TV-R4,2020-03-15,,,M,Thiruvananthapuram,Thiruvananthapuram,Kerala,KL,...,"Travelled from Spain, Doctor",,India,Imported,02/04/2020,https://twitter.com/ANI/status/123918376494673...,https://www.thehindu.com/news/cities/Thiruvana...,http://dhs.kerala.gov.in/wp-content/uploads/20...,Doctor who had been to Spain,1
107,108.0,KL-ID-1,2020-03-15,,,M,Munnar,Idukki,Kerala,KL,...,"Travelled from Dubai, British citizen who had ...",,United Kingdom,Imported,28/03/2020,https://twitter.com/ANI/status/123918376494673...,https://economictimes.indiatimes.com/news/poli...,,British citizen who had resided in Munnar resort,1
106,103.0,,2020-03-15,,21,M,Pimpri-Chinchwad,Pune,Maharashtra,MH,...,Travelled from Thailand,,India,Imported,14/03/2020,https://twitter.com/ANI/status/123888256798766...,https://www.hindustantimes.com/india-news/5-ne...,,Travelled from Thailand,1
104,101.0,,2020-03-15,,,F,Pimpri-Chinchwad,Pune,Maharashtra,MH,...,First contacts or family members of the group ...,,India,Local,14/03/2020,https://twitter.com/ANI/status/123888256798766...,https://www.hindustantimes.com/india-news/5-ne...,,First contacts or family members of the group ...,1
103,100.0,,2020-03-15,,,F,Pimpri-Chinchwad,Pune,Maharashtra,MH,...,First contacts or family members of the group ...,,India,Local,14/03/2020,https://twitter.com/ANI/status/123888256798766...,https://www.hindustantimes.com/india-news/5-ne...,,First contacts or family members of the group ...,1
102,99.0,,2020-03-15,,,F,Pimpri-Chinchwad,Pune,Maharashtra,MH,...,First contacts or family members of the group ...,,India,Local,14/03/2020,https://twitter.com/ANI/status/123888256798766...,https://www.hindustantimes.com/india-news/5-ne...,,First contacts or family members of the group ...,1
105,102.0,,2020-03-15,,,M,Pimpri-Chinchwad,Pune,Maharashtra,MH,...,First contacts or family members of the group ...,,India,Local,14/03/2020,https://twitter.com/ANI/status/123888256798766...,https://www.hindustantimes.com/india-news/5-ne...,,First contacts or family members of the group ...,1


In [10]:
raw_data2 = raw_data2[raw_data2['Date Announced'] < '2020-04-26']
raw_data2

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Estimated Onset Date,Age Bracket,Gender,Detected City,Detected District,Detected State,State code,...,Notes,Contracted from which Patient (Suspected),Nationality,Type of transmission,Status Change Date,Source_1,Source_2,Source_3,Backup Notes,Num Cases
0,17307.0,,2020-04-20,,48.0,M,,Bhadrak,Odisha,OR,...,Details awaited,,,,20/04/2020,https://twitter.com/the_hindu/status/125205985...,,,,1
1,17308.0,,2020-04-20,,55.0,M,,Bhadrak,Odisha,OR,...,Details awaited,,,,20/04/2020,https://twitter.com/the_hindu/status/125205985...,,,,1
2,17309.0,,2020-04-20,,57.0,M,,Bhadrak,Odisha,OR,...,Details awaited,,,,20/04/2020,https://twitter.com/the_hindu/status/125205985...,,,,1
3,17310.0,,2020-04-20,,58.0,M,,Bhadrak,Odisha,OR,...,Details awaited,,,,20/04/2020,https://twitter.com/the_hindu/status/125205985...,,,,1
4,17311.0,,2020-04-20,,66.0,M,,Bhadrak,Odisha,OR,...,Details awaited,,,,20/04/2020,https://twitter.com/the_hindu/status/125205985...,,,,1
5,17312.0,,2020-04-20,,32.0,M,,Balasore,Odisha,OR,...,Details awaited,,,,20/04/2020,https://twitter.com/the_hindu/status/125205985...,,,,1
6,17313.0,,2020-04-20,,2.0,F,,Balasore,Odisha,OR,...,Details awaited,,,,20/04/2020,https://twitter.com/the_hindu/status/125205985...,,,,1
7,17314.0,,2020-04-20,,,,,,West Bengal,WB,...,Details awaited,,,,20/04/2020,mohfw.gov.in,,,,1
8,17315.0,,2020-04-20,,,,,,West Bengal,WB,...,Details awaited,,,,20/04/2020,mohfw.gov.in,,,,1
9,17316.0,,2020-04-20,,,,,,West Bengal,WB,...,Details awaited,,,,20/04/2020,mohfw.gov.in,,,,1


In [11]:
# raw_data1 = raw_data1.groupby(['Date Announced','Detected State', 'Detected District']).sum()
# raw_data1.drop(columns=['Patient Number', 'Estimated Onset Date'], inplace=True)
# raw_data1

In [12]:
# raw_data2 = raw_data2.groupby(['Date Announced', 'Detected State', 'Detected District']).sum()
# raw_data2.drop(columns=['Patient Number', 'Estimated Onset Date', 'Age Bracket', 'Nationality', 'Backup Notes'], inplace=True)
# raw_data2

In [13]:
final_raw_data = pd.concat([raw_data1, raw_data2])
final_raw_data = final_raw_data[["Date Announced", "Detected District", "Detected State", "Num Cases"]]
final_raw_data = final_raw_data.rename(columns={'Date Announced': 'Date', 'Detected District': 'District', 'Detected State':'State','Num Cases':'Confirmed'})
final_raw_data = final_raw_data[['Date', 'State', 'District', 'Confirmed']]
# final_raw_data

In [14]:
final_raw_data = final_raw_data.groupby(['Date', 'State', 'District']).agg('sum')
final_raw_data = final_raw_data.reset_index()
#final_raw_data = final_raw_data.set_index('Date')
final_raw_data

Unnamed: 0,Date,State,District,Confirmed
0,2020-03-15,Kerala,Idukki,1
1,2020-03-15,Kerala,Thiruvananthapuram,1
2,2020-03-15,Maharashtra,Aurangabad,1
3,2020-03-15,Maharashtra,Pune,5
4,2020-03-15,Telangana,Hyderabad,1
5,2020-03-15,Uttarakhand,Dehradun,1
6,2020-03-16,Jammu and Kashmir,Jammu,1
7,2020-03-16,Karnataka,Kalaburagi,1
8,2020-03-16,Kerala,Kasaragod,1
9,2020-03-16,Kerala,Malappuram,2


final_raw_data contains all the datas from 15th March 2020 to 25th April 2020 and we already have covid data from 26th August 2020 onwards

In [15]:
#Removing other columns 
df = dt_wise_covid_data.copy()
df.drop(columns=['Recovered', 'Deceased', 'Other', 'Tested'], inplace=True)
df = df.reset_index()
df

Unnamed: 0,Date,State,District,Confirmed
0,2020-04-26,Andaman and Nicobar Islands,Unknown,33
1,2020-04-26,Andhra Pradesh,Anantapur,53
2,2020-04-26,Andhra Pradesh,Chittoor,73
3,2020-04-26,Andhra Pradesh,East Godavari,39
4,2020-04-26,Andhra Pradesh,Guntur,214
5,2020-04-26,Andhra Pradesh,Krishna,177
6,2020-04-26,Andhra Pradesh,Kurnool,279
7,2020-04-26,Andhra Pradesh,Prakasam,56
8,2020-04-26,Andhra Pradesh,S.P.S. Nellore,72
9,2020-04-26,Andhra Pradesh,Srikakulam,3


In [16]:
final_df = pd.concat([final_raw_data, df])
final_df

Unnamed: 0,Date,State,District,Confirmed
0,2020-03-15,Kerala,Idukki,1
1,2020-03-15,Kerala,Thiruvananthapuram,1
2,2020-03-15,Maharashtra,Aurangabad,1
3,2020-03-15,Maharashtra,Pune,5
4,2020-03-15,Telangana,Hyderabad,1
5,2020-03-15,Uttarakhand,Dehradun,1
6,2020-03-16,Jammu and Kashmir,Jammu,1
7,2020-03-16,Karnataka,Kalaburagi,1
8,2020-03-16,Kerala,Kasaragod,1
9,2020-03-16,Kerala,Malappuram,2


In [17]:
# map_dkey_dname={}
# map_dkey_sname={}
# for d_key in districts_key:
#     dd_ff = distr_wise_data[distr_wise_data['District_Key'] == d_key]
#     d_name = dd_ff['District'].iloc[0]
#     s_name = dd_ff['State'].iloc[0]
#     map_dkey_dname[d_key] = d_name
#     map_dkey_sname[d_key] = s_name
# map_dkey_dname

In [18]:
# map_dkey_sname

Since 'districts.csv' data contains cumulative values in the 'Confirmed' column so for finding actual no. of cases in that day we need to subtract it from the previous day.

In [19]:
not_available_distr = []
for dt_key in districts_key:
    print(dt_key)
    #Getting district name and state corresponding to the districts key
    get_dt = distr_wise_data[distr_wise_data['District_Key'] == dt_key]
    original_dt_name = get_dt['District'].iloc[0]
    original_st_name = get_dt['State'].iloc[0]
    tt = final_df[(final_df['District'] == original_dt_name) & (final_df['State'] == original_st_name) & (final_df['Date'] >= '2020-04-25')]
    if len(tt)==0:
        not_available_distr.append(original_dt_name)
        continue
    tt.loc[: ,'Confirmed'] = tt['Confirmed'].diff().fillna(tt['Confirmed'].iloc[0])
    final_df.loc[((final_df['District']==original_dt_name) & (final_df['State'] == original_st_name) & (final_df['Date'] >= '2020-04-25')), 'Confirmed'] = tt[['Confirmed']]

AP_Anantapur
AP_Chittoor
AP_East Godavari
AP_Guntur
AP_Krishna
AP_Kurnool
AP_Prakasam
AP_S.P.S. Nellore
AP_Srikakulam
AP_Visakhapatnam
AP_Vizianagaram
AP_West Godavari
AP_Y.S.R. Kadapa
AR_Anjaw
AR_Changlang
AR_East Kameng
AR_East Siang
AR_Kamle
AR_Kra Daadi
AR_Kurung Kumey
AR_Lepa Rada
AR_Lohit
AR_Longding
AR_Lower Dibang Valley
AR_Lower Siang
AR_Lower Subansiri
AR_Namsai
AR_Pakke Kessang
AR_Papum Pare
AR_Shi Yomi
AR_Siang
AR_Tawang
AR_Tirap
AR_Upper Dibang Valley
AR_Upper Siang
AR_Upper Subansiri
AR_West Kameng
AR_West Siang
AS_Baksa
AS_Barpeta
AS_Biswanath
AS_Bongaigaon
AS_Cachar
AS_Charaideo
AS_Chirang
AS_Darrang
AS_Dhemaji
AS_Dhubri
AS_Dibrugarh
AS_Dima Hasao
AS_Goalpara
AS_Golaghat
AS_Hailakandi
AS_Hojai
AS_Jorhat
AS_Kamrup
AS_Kamrup Metropolitan
AS_Karbi Anglong
AS_Karimganj
AS_Kokrajhar
AS_Lakhimpur
AS_Majuli
AS_Morigaon
AS_Nagaon
AS_Nalbari
AS_Sivasagar
AS_Sonitpur
AS_South Salmara Mankachar
AS_Tinsukia
AS_Udalguri
AS_West Karbi Anglong
BR_Araria
BR_Arwal
BR_Aurangabad
BR_Banka

UP_Sant Kabir Nagar
UP_Shahjahanpur
UP_Shamli
UP_Shrawasti
UP_Siddharthnagar
UP_Sitapur
UP_Sonbhadra
UP_Sultanpur
UP_Unnao
UP_Varanasi
UT_Almora
UT_Bageshwar
UT_Chamoli
UT_Champawat
UT_Dehradun
UT_Haridwar
UT_Nainital
UT_Pauri Garhwal
UT_Pithoragarh
UT_Rudraprayag
UT_Tehri Garhwal
UT_Udham Singh Nagar
UT_Uttarkashi
WB_Alipurduar
WB_Bankura
WB_Birbhum
WB_Dakshin Dinajpur
WB_Darjeeling
WB_Hooghly
WB_Howrah
WB_Jalpaiguri
WB_Jhargram
WB_Kalimpong
WB_Kolkata
WB_Malda
WB_Murshidabad
WB_Nadia
WB_North 24 Parganas
WB_Paschim Bardhaman
WB_Paschim Medinipur
WB_Purba Bardhaman
WB_Purba Medinipur
WB_Purulia
WB_South 24 Parganas
WB_Uttar Dinajpur


In [None]:
districts_availble = distr_wise_data['District_Key'].unique()
for distr_key in districts_availble: 
    temp_df = distr_wise_data[distr_wise_data['District_Key'] == distr_key]
    dt_name = temp_df['District'].iloc[0]
    st_name = temp_df['State'].iloc[0]
    if distr_key not in list(districts_key):
        print("Dropping this district key: ", distr_key)
        idx_list = list(final_df[(final_df['District'] == dt_name) & (final_df['State'] == st_name)].index)
        if len(idx_list) > 0:
            final_df.drop(idx_list, inplace=True)
    else:
        final_df.loc[((final_df['District'] == dt_name) & (final_df['State'] == st_name)), 'District_Key'] = distr_key

Dropping this district key:  UN_Unassigned
Dropping this district key:  AN_Nicobars
Dropping this district key:  AN_North and Middle Andaman
Dropping this district key:  AN_South Andaman
Dropping this district key:  AP_Foreign Evacuees
Dropping this district key:  AP_Other State
Dropping this district key:  AS_Airport Quarantine
Dropping this district key:  AS_Other State
Dropping this district key:  CT_Other State
Dropping this district key:  CT_Gaurela Pendra Marwahi
Dropping this district key:  DN_Other State
Dropping this district key:  GA_Other State
Dropping this district key:  GJ_Other State
Dropping this district key:  HP_Bilaspur
Dropping this district key:  HR_Foreign Evacuees
Dropping this district key:  HR_Italians


In [None]:
final_df = final_df.set_index('Date')
final_df

In [None]:
# final_df['District_Key'] = final_df['District'].apply(lambda x: map_dname_dkey[x])

In [None]:
final_df = final_df[['State', 'District', 'District_Key', 'Confirmed']]
final_df

In [None]:
#These districts are not available in the districts.csv and raw_data.csv files
#So I'm dropping these districts from the final_df
not_available_distr

In [None]:
wdf = pd.DataFrame(index = final_df.index)
wdf = wdf.reset_index()
wdf['Date'] = pd.to_datetime(wdf['Date'], format='%d/%m/%Y')

In [None]:
def get_week_or_month_wise_data(wdf, start_date, end_date, timedelay):
    i=1
    wwdf=wdf.copy()
    start_date1 = start_date
    if timedelay==7:
        word='week'
    else: word='month'
    while start_date <= end_date:
        wwdf.loc[((wwdf['Date'] >= start_date) & (wwdf['Date'] < start_date+timedelta(days=timedelay))), word] = i
        start_date += timedelta(days=timedelay)
        start_date = pd.to_datetime(start_date,format='%d/%m/%Y')
        i+=1
    wwdf = wwdf.set_index('Date')
    return wwdf

In [None]:
# from datetime import timedelta
start_date = pd.to_datetime('15/03/2020', format='%d/%m/%Y')
end_date = pd.to_datetime('14/08/2021', format='%d/%m/%Y')
mdf = get_week_or_month_wise_data(wdf, start_date, end_date, 30)
wdf = get_week_or_month_wise_data(wdf, start_date, end_date, 7)
wdf

In [None]:
mdf

In [None]:
ddf = final_df.copy()
new_df = pd.concat([ddf, wdf, mdf], axis=1)
new_df = new_df[["State", "District", "District_Key", "week", "month", "Confirmed"]]
new_df

In [None]:
final_week_wise_output = new_df.groupby(["State", "District", "District_Key", "week"]).sum()
final_week_wise_output = final_week_wise_output.reset_index()
final_week_wise_output = final_week_wise_output[["District_Key", "week", "Confirmed"]]
final_week_wise_output.rename(columns={"District_Key":"districtid", "week":"timeid", "Confirmed":"cases"}, inplace=True)
final_week_wise_output = final_week_wise_output.astype({"timeid": int, "cases": int})
final_week_wise_output

In [None]:
final_month_wise_output = new_df.groupby(["State", "District", "District_Key", "month"]).sum()
final_month_wise_output = final_month_wise_output.reset_index()
final_month_wise_output = final_month_wise_output[["District_Key", "month", "Confirmed"]]
final_month_wise_output.rename(columns={"District_Key":"districtid", "month":"timeid", "Confirmed":"cases"}, inplace=True)

final_month_wise_output = final_month_wise_output.astype({"timeid": int, "cases": int})
final_month_wise_output

In [None]:
overall_df = new_df.groupby(["District_Key"]).sum()
overall_df = overall_df.reset_index()
overall_df = overall_df[["District_Key", "Confirmed"]]
overall_df = overall_df.astype({"Confirmed": int})
overall_df.rename(columns={"District_Key":"districtid", "Confirmed": "cases"}, inplace=True)
overall_df

In [None]:
final_week_wise_output = final_week_wise_output.set_index('districtid')
final_month_wise_output = final_month_wise_output.set_index('districtid')
overall_df = overall_df.set_index('districtid')

In [None]:
# final_week_wise_output = final_week_wise_output.sort_index()
# final_month_wise_output = final_month_wise_output.sort_index()
# overall_df = overall_df.sort_index()

In [None]:
# final_week_wise_output = final_week_wise_output.sort_values('timeid')
# final_month_wise_output = final_month_wise_output.sort_values('timeid')

In [None]:
final_week_wise_output.to_csv('../output/cases-week.csv')

In [None]:
final_month_wise_output.to_csv('../output/cases-month.csv')

In [None]:
overall_df.to_csv('../output/cases-overall.csv')