## Data Source

https://api.covid19india.org/

## Import libraries

In [34]:
# to get web contents
import requests
# to parse json contents
import json
# to parse csv files
import csv

# for numerical operations
import numpy as np
# to store and analyze data in dataframes
import pandas as pd

## Get data

In [35]:
# fetch the raw patient-level JSON from the covid19india API;
# the timeout keeps the cell from hanging indefinitely if the server never answers
response = requests.get('https://api.covid19india.org/raw_data.json', timeout=30)
# fail loudly on a 4xx/5xx reply instead of handing an error page to the JSON parser
response.raise_for_status()
# get contents from the response
content = response.content
# parse the json file
parsed = json.loads(content)

In [36]:
# top-level keys of the parsed JSON object
parsed.keys()

dict_keys(['raw_data'])

## Load data into a dataframe

In [37]:
# load the records found under the 'raw_data' key into a dataframe
df = pd.DataFrame(data=parsed['raw_data'])
# preview: first 3 rows
df.head(n=3)

Unnamed: 0,agebracket,backupnotes,contractedfromwhichpatientsuspected,currentstatus,dateannounced,detectedcity,detecteddistrict,detectedstate,estimatedonsetdate,gender,nationality,notes,patientnumber,source1,source2,source3,statecode,statepatientnumber,statuschangedate,typeoftransmission
0,20.0,Student from Wuhan,,Recovered,30/01/2020,Thrissur,Thrissur,Kerala,,F,India,Travelled from Wuhan,1,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-TS-P1,14/02/2020,Imported
1,,Student from Wuhan,,Recovered,02/02/2020,Alappuzha,Alappuzha,Kerala,,,India,Travelled from Wuhan,2,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-AL-P1,14/02/2020,Imported
2,,Student from Wuhan,,Recovered,03/02/2020,Kasaragod,Kasaragod,Kerala,,,India,Travelled from Wuhan,3,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,KL,KL-KS-P1,14/02/2020,Imported


In [38]:
# shape of the dataframe as (rows, columns)
df.shape

(27891, 20)

In [39]:
# column labels of the dataframe
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')

In [40]:
# derive a patient id column by prefixing each patient number with 'P'
# ====================================================================

df['p_id'] = df['patientnumber'].apply(lambda number: f'P{number}')
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission', 'p_id'],
      dtype='object')

## Rearrange and rename columns

In [41]:
# original API column name -> snake_case name, listed in the desired column order
renamed = {
    'patientnumber': 'patient_number',
    'p_id': 'p_id',
    'statepatientnumber': 'state_patient_number',
    'dateannounced': 'date_announced',
    'agebracket': 'age_bracket',
    'gender': 'gender',
    'detectedcity': 'detected_city',
    'detecteddistrict': 'detected_district',
    'detectedstate': 'detected_state',
    'statecode': 'state_code',
    'nationality': 'nationality',
    'typeoftransmission': 'type_of_transmission',
    'contractedfromwhichpatientsuspected': 'contracted_from_which_patient_suspected',
    'statuschangedate': 'status_change_date',
    'currentstatus': 'current_status',
    'estimatedonsetdate': 'estimated_onset_date',
    'source1': 'source1',
    'source2': 'source2',
    'source3': 'source3',
    'notes': 'notes',
    'backupnotes': 'backup_notes',
}

# order of columns (dict keys keep their insertion order)
cols = list(renamed)

# rearrange columns, then relabel them position by position
df = df[cols]
df.columns = list(renamed.values())

# dataframe shape
df.shape

(27891, 21)

In [42]:
# preview the first 3 rows after reordering and renaming the columns
df.head(3)

Unnamed: 0,patient_number,p_id,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,...,type_of_transmission,contracted_from_which_patient_suspected,status_change_date,current_status,estimated_onset_date,source1,source2,source3,notes,backup_notes
0,1,P1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
1,2,P2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
2,3,P3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Travelled from Wuhan,Student from Wuhan


## Missing values

In [43]:
# per column: how many cells hold an empty string
# ===============================================

print(df.shape, '\n')

for column in df.columns:
    # boolean mask of the rows whose value in this column is exactly ''
    empty_mask = df[column] == ''
    print(column, '\t', df[empty_mask].shape[0])

(27891, 21) 

patient_number 	 0
p_id 	 0
state_patient_number 	 24894
date_announced 	 0
age_bracket 	 25545
gender 	 22578
detected_city 	 25544
detected_district 	 7640
detected_state 	 1
state_code 	 1
nationality 	 25184
type_of_transmission 	 24901
contracted_from_which_patient_suspected 	 26328
status_change_date 	 111
current_status 	 0
estimated_onset_date 	 27891
source1 	 384
source2 	 24018
source3 	 27439
notes 	 1336
backup_notes 	 27530


In [44]:
# per column: how many cells hold something other than an empty string
# ====================================================================

print(df.shape, '\n')

for column in df.columns:
    # boolean mask of the rows whose value in this column differs from ''
    filled_mask = df[column] != ''
    print(column, '\t', df[filled_mask].shape[0])

(27891, 21) 

patient_number 	 27891
p_id 	 27891
state_patient_number 	 2997
date_announced 	 27891
age_bracket 	 2346
gender 	 5313
detected_city 	 2347
detected_district 	 20251
detected_state 	 27890
state_code 	 27890
nationality 	 2707
type_of_transmission 	 2990
contracted_from_which_patient_suspected 	 1563
status_change_date 	 27780
current_status 	 27891
estimated_onset_date 	 0
source1 	 27507
source2 	 3873
source3 	 452
notes 	 26555
backup_notes 	 361


In [45]:
# replacing empty strings with np.nan
# ===================================

print(df.shape)

# literal match on cells that are exactly the empty string; no regex is needed,
# and an empty regex pattern would read as "match every string"
df = df.replace('', np.nan)
# missing values per column after the replacement
df.isna().sum()

(27891, 21)


patient_number                                 0
p_id                                           0
state_patient_number                       24894
date_announced                                 0
age_bracket                                25545
gender                                     22578
detected_city                              25544
detected_district                           7640
detected_state                                 1
state_code                                     1
nationality                                25184
type_of_transmission                       24901
contracted_from_which_patient_suspected    26328
status_change_date                           111
current_status                                 0
estimated_onset_date                       27891
source1                                      384
source2                                    24018
source3                                    27439
notes                                       1336
backup_notes        

In [46]:
# dropping empty rows (rows with just a row number but no patient entry)
# ======================================================================

print(df.shape)

# a row without a detected state is treated as an empty entry;
# reassign instead of mutating in place so the step reads as a plain transformation
df = df.dropna(subset=['detected_state'])

print(df.shape)
# missing values per column after dropping those rows
df.isna().sum()

(27891, 21)
(27890, 21)


patient_number                                 0
p_id                                           0
state_patient_number                       24893
date_announced                                 0
age_bracket                                25544
gender                                     22577
detected_city                              25543
detected_district                           7639
detected_state                                 0
state_code                                     0
nationality                                25183
type_of_transmission                       24900
contracted_from_which_patient_suspected    26327
status_change_date                           110
current_status                                 0
estimated_onset_date                       27890
source1                                      384
source2                                    24017
source3                                    27438
notes                                       1335
backup_notes        

## Save data

In [47]:
# write the cleaned dataframe to patients_data.csv, leaving out the index column
df.to_csv('patients_data.csv', index=False)

## State wise Daily

In [48]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

# df = pd.DataFrame(parsed['states_daily'])

In [49]:
# df = df.melt(id_vars = ['date', 'status'], 
#              value_vars = ['an', 'ap', 'ar', 'as', 'br', 'ch', 'ct', 'dd', 
#                     'dl', 'dn', 'ga', 'gj', 'hp', 'hr', 'jh', 'jk', 
#                     'ka', 'kl', 'la', 'ld', 'mh', 'ml', 'mn', 'mp',
#                     'mz', 'nl', 'or', 'pb', 'py', 'rj', 'sk', 'tg', 
#                     'tn', 'tr', 'tt', 'up', 'ut', 'wb'], 
#              var_name='state', value_name='count')

# df = df.set_index(['date', 'state'])

# df = df.pivot(columns = 'status').reset_index()

# df.columns = df.columns.droplevel(0)
# df.columns.name = ''

# df.columns = ['Date', 'State', 'Confirmed', 'Deceased', 'Recovered']
# df.head()

In [50]:
# response = requests.get('https://api.covid19india.org/csv/')
# parsed = response.content.decode('utf-8')
# parsed

# df = pd.DataFrame(parsed, sep=',')
# df.head()

In [51]:
# pd.DataFrame('http://api.covid19india.org/states_daily_csv/confirmed.csv')

In [52]:
# pd.read_csv('https://api.covid19india.org/csv/')

## States Daily changes

In [53]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

In [54]:
# pd.DataFrame(parsed['states_daily'])

## National time series, statewise stats and test counts

In [55]:
# response = requests.get('https://api.covid19india.org/data.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [56]:
# day_wise = pd.DataFrame(parsed['cases_time_series'])
# day_wise.head()

In [57]:
# state_wise = pd.DataFrame(parsed['statewise'])
# state_wise.head()

In [58]:
# tested = pd.DataFrame(parsed['tested'])
# tested.head()

## District wise

In [59]:
# response = requests.get('https://api.covid19india.org/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [60]:
# pd.DataFrame(parsed['Goa'])

In [61]:
# parsed['Goa'].keys()

In [62]:
# pd.DataFrame(parsed['Goa']['districtData'])

## District wise v2

In [63]:
# response = requests.get('https://api.covid19india.org/v2/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# len(parsed)

In [64]:
# pd.DataFrame(parsed[1]['districtData'])

## Travel history (no longer updated)

In [65]:
# response = requests.get('https://api.covid19india.org/travel_history.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [66]:
# th = pd.DataFrame(parsed['travel_history'])
# th.head()