In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# Read both raw CSV exports in one pass: df1 holds the state-wise testing
# details, df2 holds the per-state daily case records.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/StatewiseTestingDetails.csv",
        "./Data/covid_19_india.csv",
    )
)

In [3]:
# Preview the first rows of df1 (read from StatewiseTestingDetails.csv above).
df1.head()

Unnamed: 0,Date,State,TotalSamples,Negative,Positive
0,2020-04-17,Andaman and Nicobar Islands,1403.0,1210.0,12.0
1,2020-04-24,Andaman and Nicobar Islands,2679.0,,27.0
2,2020-04-27,Andaman and Nicobar Islands,2848.0,,33.0
3,2020-05-01,Andaman and Nicobar Islands,3754.0,,33.0
4,2020-05-16,Andaman and Nicobar Islands,6677.0,,33.0


In [4]:
# Preview the first rows of df2 (read from covid_19_india.csv above).
df2.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,30/01/20,6:00 PM,Kerala,1,0,0,0,1
1,2,31/01/20,6:00 PM,Kerala,1,0,0,0,1
2,3,01/02/20,6:00 PM,Kerala,2,0,0,0,2
3,4,02/02/20,6:00 PM,Kerala,3,0,0,0,3
4,5,03/02/20,6:00 PM,Kerala,3,0,0,0,3


In [5]:
# Gauge how much history is available by listing the rows recorded for a
# single state (Kerala), capped at 252 rows.
df2.loc[lambda frame: frame["State/UnionTerritory"].eq("Kerala")].head(252)

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,30/01/20,6:00 PM,Kerala,1,0,0,0,1
1,2,31/01/20,6:00 PM,Kerala,1,0,0,0,1
2,3,01/02/20,6:00 PM,Kerala,2,0,0,0,2
3,4,02/02/20,6:00 PM,Kerala,3,0,0,0,3
4,5,03/02/20,6:00 PM,Kerala,3,0,0,0,3
...,...,...,...,...,...,...,...,...,...
6927,6928,03/10/20,8:00 AM,Kerala,-,-,135144,791,213499
6962,6963,04/10/20,8:00 AM,Kerala,-,-,139620,813,221333
6997,6998,05/10/20,8:00 AM,Kerala,-,-,144471,836,229886
7032,7033,06/10/20,8:00 AM,Kerala,-,-,149111,859,234928


In [6]:
# Gather every distinct state / union-territory label present in df2, then
# show the labels followed by how many there are.
state_ut_names = [*set(df2["State/UnionTerritory"])]
print(state_ut_names, len(state_ut_names), sep="\n")

['Andhra Pradesh', 'Dadra and Nagar Haveli and Daman and Diu', 'Himachal Pradesh', 'Odisha', 'Madhya Pradesh', 'Nagaland', 'Manipur', 'Delhi', 'Mizoram', 'Telangana***', 'Gujarat', 'Maharashtra', 'Andaman and Nicobar Islands', 'Jammu and Kashmir', 'Kerala', 'Meghalaya', 'Goa', 'West Bengal', 'Ladakh', 'Uttarakhand', 'Haryana', 'Karnataka', 'Puducherry', 'Assam', 'Telengana***', 'Rajasthan', 'Unassigned', 'Uttar Pradesh', 'Jharkhand', 'Arunachal Pradesh', 'Cases being reassigned to states', 'Chhattisgarh', 'Tamil Nadu', 'Tripura', 'Dadar Nagar Haveli', 'Daman & Diu', 'Telengana', 'Sikkim', 'Bihar', 'Telangana', 'Chandigarh', 'Punjab']
42


# Cleaning up the dataset

In [7]:
# Normalise inconsistent spellings of region names.
# The replacement is scoped to the "State/UnionTerritory" column only: an
# unscoped DataFrame.replace(..., regex=True) is applied to every column, so a
# matching value in any other text column would be rewritten too.
#
# Every label starting with "Tel" (Telangana, Telengana, Telangana***, ...)
# becomes "Telangana".
df2 = df2.replace(
    to_replace={"State/UnionTerritory": r'^Tel.*$'},
    value={"State/UnionTerritory": 'Telangana'},
    regex=True,
)
# Every label containing "Daman" becomes "Daman & Diu".
# NOTE(review): this also folds "Dadra and Nagar Haveli and Daman and Diu" into
# "Daman & Diu", while "Dadar Nagar Haveli" stays a separate label - confirm
# that this grouping is intended.
df2 = df2.replace(
    to_replace={"State/UnionTerritory": r'.*Daman.*$'},
    value={"State/UnionTerritory": 'Daman & Diu'},
    regex=True,
)


In [8]:
# Rebuild the list of labels after normalisation.
# Sorting gives a stable order: iterating a set of strings can differ from one
# kernel to the next, which would reshuffle the printed list and the rows of
# the CSVs derived from it. key=str keeps the sort safe for any non-string value.
state_ut_names = sorted(set(df2["State/UnionTerritory"]), key=str)
print(state_ut_names)
print(len(state_ut_names))
# two extra categories have been added which will be dropped
# Filtering (rather than list.remove) keeps this cell re-runnable: remove()
# raises ValueError once these labels are no longer present in df2.
state_ut_names = [
    name
    for name in state_ut_names
    if name not in ("Unassigned", "Cases being reassigned to states")
]

['Andhra Pradesh', 'Himachal Pradesh', 'Odisha', 'Madhya Pradesh', 'Nagaland', 'Manipur', 'Delhi', 'Mizoram', 'Gujarat', 'Maharashtra', 'Andaman and Nicobar Islands', 'Jammu and Kashmir', 'Kerala', 'Meghalaya', 'Goa', 'West Bengal', 'Ladakh', 'Uttarakhand', 'Haryana', 'Karnataka', 'Puducherry', 'Assam', 'Rajasthan', 'Unassigned', 'Uttar Pradesh', 'Jharkhand', 'Arunachal Pradesh', 'Cases being reassigned to states', 'Chhattisgarh', 'Tamil Nadu', 'Tripura', 'Dadar Nagar Haveli', 'Daman & Diu', 'Sikkim', 'Bihar', 'Telangana', 'Chandigarh', 'Punjab']
38


In [9]:
# Show the cleaned labels and their count.
# Expected count: 28 states + 8 union territories
print(f"{state_ut_names}\n{len(state_ut_names)}")

['Andhra Pradesh', 'Himachal Pradesh', 'Odisha', 'Madhya Pradesh', 'Nagaland', 'Manipur', 'Delhi', 'Mizoram', 'Gujarat', 'Maharashtra', 'Andaman and Nicobar Islands', 'Jammu and Kashmir', 'Kerala', 'Meghalaya', 'Goa', 'West Bengal', 'Ladakh', 'Uttarakhand', 'Haryana', 'Karnataka', 'Puducherry', 'Assam', 'Rajasthan', 'Uttar Pradesh', 'Jharkhand', 'Arunachal Pradesh', 'Chhattisgarh', 'Tamil Nadu', 'Tripura', 'Dadar Nagar Haveli', 'Daman & Diu', 'Sikkim', 'Bihar', 'Telangana', 'Chandigarh', 'Punjab']
36


In [10]:
# Persist the cleaned dataset: keep only rows whose region is in
# state_ut_names and only the columns used by the trend-line cells,
# write the result to disk, then preview it.
selected_columns = ["Date","State/UnionTerritory","Cured","Deaths","Confirmed"]
df2 = df2.loc[df2["State/UnionTerritory"].isin(state_ut_names), selected_columns]
df2.to_csv('./Data/covid_19_india_cleaned.csv')
df2.head()


Unnamed: 0,Date,State/UnionTerritory,Cured,Deaths,Confirmed
0,30/01/20,Kerala,0,0,1
1,31/01/20,Kerala,0,0,1
2,01/02/20,Kerala,0,0,2
3,02/02/20,Kerala,0,0,3
4,03/02/20,Kerala,0,0,3


# Calculating trendline coefficients

In [11]:
# setup
# One accumulator per metric. Each maps a column name to a list that the
# per-metric loops fill in, one entry per state / union territory.
# The key spelling "State/UnionTerrirory" is kept exactly as-is because the
# loops look the lists up by this exact key.
cured_coeff, deaths_coeff, confirmed_coeff = (
    {"State/UnionTerrirory": [], "TrendLineCoeff": []}
    for _ in range(3)
)

In [12]:
# calculating cured coeff:
# Fit a straight line to each region's "Cured" series and record its slope.
# Reset the accumulators first so that re-running this cell replaces the
# results instead of appending a second copy of every region.
cured_coeff["State/UnionTerrirory"] = []
cured_coeff["TrendLineCoeff"] = []
for state_ut in state_ut_names:
    data = df2.loc[df2["State/UnionTerritory"] == state_ut, ["Date", "Cured"]]
    y = np.array(data["Cured"])
    # x is the 0-based row position within this region's rows (the Date column
    # is not parsed), so the slope is "change in Cured per recorded row".
    # NOTE(review): assumes rows are in chronological order, one per day - confirm.
    x = np.arange(len(y)).reshape(-1, 1)
    reg = LinearRegression().fit(x, y)
    cured_coeff["State/UnionTerrirory"].append(state_ut)
    cured_coeff["TrendLineCoeff"].append(reg.coef_.item())

In [15]:
# calculating death coeff:
# Fit a straight line to each region's "Deaths" series and record its slope.
# Reset the accumulators first so that re-running this cell replaces the
# results instead of appending a second copy of every region.
deaths_coeff["State/UnionTerrirory"] = []
deaths_coeff["TrendLineCoeff"] = []
for state_ut in state_ut_names:
    data = df2.loc[df2["State/UnionTerritory"] == state_ut, ["Date", "Deaths"]]
    y = np.array(data["Deaths"])
    # x is the 0-based row position within this region's rows (the Date column
    # is not parsed), so the slope is "change in Deaths per recorded row".
    # NOTE(review): assumes rows are in chronological order, one per day - confirm.
    x = np.arange(len(y)).reshape(-1, 1)
    reg = LinearRegression().fit(x, y)
    deaths_coeff["State/UnionTerrirory"].append(state_ut)
    deaths_coeff["TrendLineCoeff"].append(reg.coef_.item())

In [17]:
# calculating confirmed coeff:
# Fit a straight line to each region's "Confirmed" series and record its slope.
# Reset the accumulators first so that re-running this cell replaces the
# results instead of appending a second copy of every region.
confirmed_coeff["State/UnionTerrirory"] = []
confirmed_coeff["TrendLineCoeff"] = []
for state_ut in state_ut_names:
    data = df2.loc[df2["State/UnionTerritory"] == state_ut, ["Date", "Confirmed"]]
    y = np.array(data["Confirmed"])
    # x is the 0-based row position within this region's rows (the Date column
    # is not parsed), so the slope is "change in Confirmed per recorded row".
    # NOTE(review): assumes rows are in chronological order, one per day - confirm.
    x = np.arange(len(y)).reshape(-1, 1)
    reg = LinearRegression().fit(x, y)
    confirmed_coeff["State/UnionTerrirory"].append(state_ut)
    confirmed_coeff["TrendLineCoeff"].append(reg.coef_.item())

In [20]:
# Turn each accumulator dict into a two-column DataFrame:
# df_1 -> confirmed, df_2 -> deaths, df_3 -> cured.
df_1, df_2, df_3 = map(
    pd.DataFrame.from_dict,
    (confirmed_coeff, deaths_coeff, cured_coeff),
)

In [23]:
# Write each coefficient table to its own CSV file under ./Data.
for coeff_frame, csv_path in (
    (df_1, './Data/confirmed_trendLineCoeff.csv'),
    (df_2, './Data/deaths_trendLineCoeff.csv'),
    (df_3, './Data/cured_trendLineCoeff.csv'),
):
    coeff_frame.to_csv(csv_path)