# Preparation for Data Assimilation

In [1]:
import pandas as pd
import numpy as np
from datetime import date
from datetime import timedelta
import time
from pathlib import Path 
import os

Note: I generally drop the current day's data in the daily analysis, since it is incomplete and makes the graphs inaccurate. For that reason, the code names the export file after the previous day's date. The exported file still contains the (partial) data for the current day, even though the file is named after the previous day.

In [2]:
# Work out yesterday's date and render it as a DD-MM-YYYY string
today = date.today()
yesterday = today + timedelta(days=-1)
ydate = format(yesterday, "%d-%m-%Y")

# Setting up directories and file paths

In [3]:
# Folder that the cleaned CSV is exported to; change this path to a location of your choice
data_folder = Path("../data")

# Download Data from API

In [4]:
# Source CSV endpoints: five raw_data files ...
(url_r1, url_r2, url_r3, url_r4, url_r5) = (
    "https://api.covid19india.org/csv/latest/raw_data1.csv",
    "https://api.covid19india.org/csv/latest/raw_data2.csv",
    "https://api.covid19india.org/csv/latest/raw_data3.csv",
    "https://api.covid19india.org/csv/latest/raw_data4.csv",
    "https://api.covid19india.org/csv/latest/raw_data5.csv",
)
# ... and two death_and_recovered files
(url_dnr1, url_dnr2) = (
    "https://api.covid19india.org/csv/latest/death_and_recovered1.csv",
    "https://api.covid19india.org/csv/latest/death_and_recovered2.csv",
)

In [5]:
# Download each CSV into its own DataFrame (files are fetched in the order listed)
raw_1, raw_2, raw_3, raw_4, raw_5 = [
    pd.read_csv(url) for url in (url_r1, url_r2, url_r3, url_r4, url_r5)
]
dnr_1, dnr_2 = [pd.read_csv(url) for url in (url_dnr1, url_dnr2)]

# Things to do

1. Join Raw1 and Raw2 (giving Raw12)
2. Set Current Status to "Hospitalized" for every row of Raw12
3. Join DNR1 and DNR2 (giving DNR12)
4. Match the DNR12 columns to those of Raw12
5. Append DNR12 to Raw12
6. Join Raw3, Raw4 and Raw5, and append the result to the above

# Make Changes as per Things to do

In [6]:
# Group the downloaded frames: raw 1 & 2, raw 3, 4 & 5, and dnr 1 & 2.
# Each group is cleaned separately in the cells that follow.
frame1 = [raw_1, raw_2]
frame2 = [raw_3, raw_4, raw_5]
frame3 = [dnr_1, dnr_2]

# Stack the frames of each group row-wise
raw_12, raw_345, dnr_12 = (
    pd.concat(group) for group in (frame1, frame2, frame3)
)

In [7]:
# Overwrite Current Status with "Hospitalized" on every row of raw 1-2
raw_12 = raw_12.assign(**{'Current Status': "Hospitalized"})

In [8]:
# Clean raw data 1-2: keep the analysis columns in a fixed order and rename them
# to the standard names (Date, Age, Gender, District, State, Status).
# The rename is chained and returns a new frame; renaming the column subset with
# inplace=True mutates a derived frame and can raise SettingWithCopyWarning.
raw_12 = raw_12[
    ['Date Announced', 'Age Bracket', 'Gender', 'Detected District', 'Detected State', 'Current Status']
].rename(
    columns={
        'Date Announced': 'Date',
        'Age Bracket': 'Age',
        'Detected District': 'District',
        'Detected State': 'State',
        'Current Status': 'Status',
    }
)

In [9]:
# Clean DNR data 1-2: keep the analysis columns in a fixed order and rename them
# to the standard names (Date, Age, Gender, District, State, Status).
# The rename is chained and returns a new frame; renaming the column subset with
# inplace=True mutates a derived frame and can raise SettingWithCopyWarning.
dnr_12 = dnr_12[
    ['Date', 'Age Bracket', 'Gender', 'District', 'State', 'Patient_Status']
].rename(
    columns={
        'Age Bracket': 'Age',
        'Patient_Status': 'Status',
    }
)

In [10]:
# Stack raw 1-2 on top of dnr 1-2; every row in these files counts as one case
frame4 = [raw_12, dnr_12]
preraw = pd.concat(frame4).assign(Num_Cases=1)

In [11]:
# Clean raw data 3-5: keep the analysis columns in a fixed order and rename them
# to the standard names (Date, Age, Gender, District, State, Status, Num_Cases).
# The rename is chained and returns a new frame; renaming the column subset with
# inplace=True mutates a derived frame and can raise SettingWithCopyWarning.
raw_345 = raw_345[
    ['Date Announced', 'Age Bracket', 'Gender', 'Detected District', 'Detected State', 'Current Status', 'Num Cases']
].rename(
    columns={
        'Date Announced': 'Date',
        'Age Bracket': 'Age',
        'Detected District': 'District',
        'Detected State': 'State',
        'Current Status': 'Status',
        'Num Cases': 'Num_Cases',
    }
)

In [12]:
# Combine preraw (raw 1-2 + dnr 1-2) with raw 3-5 into the final raw data set
frame5 = [preraw, raw_345]
raw_data = pd.concat(objs=frame5)

In [13]:
# Final cleaning, expressed as one chain:
#   1. drop rows that lack a case count or a state
#   2. parse Date (day-first) into datetimes and convert Num_Cases to the smallest integer dtype
#   3. renumber the index so it runs continuously (cosmetic only)
raw_data = (
    raw_data
    .dropna(subset=['Num_Cases', 'State'])
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True),
        Num_Cases=lambda frame: pd.to_numeric(frame['Num_Cases'], downcast='integer'),
    )
    .reset_index(drop=True)
)

# Export final cleaned data to data folder

In [14]:
# Export raw_data to "<yesterday's date>.csv" in the data folder.
# to_csv does not create missing directories, so make sure the folder exists first.
data_folder.mkdir(parents=True, exist_ok=True)
fname = ydate + ".csv"
fpath = data_folder / fname
raw_data.to_csv(fpath)  # the DataFrame index is written as the first, unnamed column
print('Done')

Done


This exported data can be used for further analysis and contains most of the important data required for time-series analysis. 

In [15]:
# Preview the last five rows of the cleaned data
raw_data.tail(5)

Unnamed: 0,Date,Age,Gender,District,State,Status,Num_Cases
72487,2020-05-29,,,Kamrup Metropolitan,Assam,Recovered,6
72488,2020-05-29,,,Nainital,Uttarakhand,Hospitalized,1
72489,2020-05-29,,,Dehradun,Uttarakhand,Recovered,3
72490,2020-05-29,,,Nainital,Uttarakhand,Recovered,-8
72491,2020-05-29,,,Udham Singh Nagar,Uttarakhand,Recovered,15
