In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

import statsmodels.api as sm  
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Raise pandas' display limits so wide/long frames and `.info()` listings are not truncated.
display_options = (
    ('display.max_columns', 500),
    ('display.max_rows', 500),
    ('display.max_colwidth', 100),
    ('max_info_columns', 200),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)

In [None]:
# Download the district-level COVID-19 CSV from the covid19india API (network access required).
covid19_district_data = pd.read_csv("https://api.covid19india.org/csv/latest/districts.csv")

In [None]:
# Column dtypes and non-null counts of the raw download.
covid19_district_data.info()

In [None]:
# Number of distinct values in the 'District' column.
covid19_district_data['District'].nunique()

In [None]:
# Preview the first rows of the raw download.
covid19_district_data.head()

In [None]:
from datetime import datetime, timedelta

# Snapshot date used to select one row per district, formatted like the 'Date' column
# (YYYY-MM-DD strings).
today = datetime.today().strftime("%Y-%m-%d")

# The wall-clock date is not guaranteed to be present in the feed (publication lag,
# timezone differences, or a feed that is no longer updated). Filtering on a missing
# date would yield an empty frame and break everything downstream, so fall back to the
# most recent date that is actually available. ISO-formatted date strings sort
# chronologically, so max() returns the latest one.
available_dates = covid19_district_data['Date']
if not (available_dates == today).any():
    today = available_dates.max()

In [None]:
# Keep only the rows dated `today`, restricted to the district name and the count columns.
is_snapshot_row = covid19_district_data['Date'] == today
snapshot_columns = ['District', 'Confirmed', 'Recovered', 'Deceased', 'Other', 'Tested']
covid_19_district_agg = covid19_district_data.loc[is_snapshot_row, snapshot_columns]

In [None]:
# Preview the per-district snapshot selected above.
covid_19_district_agg.head()

In [None]:
# Sum of the 'Confirmed' column over all rows of the snapshot.
covid_19_district_agg['Confirmed'].sum()

## Census 2011

In [None]:
# Load the 2011 district census table from the Kaggle input directory, using pandas'
# Python parsing engine.
census_2011 = pd.read_csv("../input/india-census/india-districts-census-2011.csv",engine='python')

In [None]:
# Preview the first rows of the census table.
census_2011.head()

## Mapping renamed/misspelled district names between the census and COVID datasets

In [None]:
# Maps a district name as spelled in the COVID feed (key) to the spelling used in the
# 2011 census table (value). It is inverted below to rename the census districts.
#
# Fixes relative to the previous version:
#   * "S.P.S. Nellore" was split across two physical lines inside the string literal,
#     which is a syntax error (and would otherwise embed a newline in the key).
#   * "Khandwa (East Nimar)," carried a stray trailing comma inside the string, so it
#     could never match a census name.
#
# NOTE(review): "Deogarh" -> "Deoghar" and "Raigad" -> "Raigarh" look like they may
# conflate different districts with similar names - confirm against both datasets.
# NOTE(review): "Y.S.R. Kadap" / "Y.S.R" look truncated - confirm the exact spellings
# used by the COVID feed and the census table.
old_dict = {
    "Ahmedabad": "Ahmadabad",
    "Ahmednagar": "Ahmadnagar",
    "Amroha": "Jyotiba Phule Nagar",
    "Angul": "Anugul",
    "Ayodhya": "Faizabad",
    "Bagalkote": "Bagalkot",
    "Balasore": "Baleshwar",
    "Ballari": "Bellary",
    "Banaskantha": "Banas Kantha",
    "Bandipora": "Bandipore",
    "Barabanki": "Bara Banki",
    "Baramulla": "Baramula",
    "Beed": "Bid",
    "Belagavi": "Belgaum",
    "Bengaluru Rural": "Bangalore Rural",
    "Bengaluru Urban": "Bangalore",
    "Bhadohi": "Sant Ravidas Nagar (Bhadohi)",
    "Boudh": "Baudh",
    "Budgam": "Badgam",
    "Buldhana": "Buldana",
    "Chamarajanagara": "Chamarajanagar",
    "Dadra and Nagar Haveli": "Dadra AND Nagar Haveli",
    "Dahod": "Dohad",
    "Dang": "The Dangs",
    "Darjeeling": "Darjiling",
    "Delhi": "New Delhi",
    "Deogarh": "Deoghar",
    "Dholpur": "Dhaulpur",
    "East Champaran": "Purba Champaran",
    "East Sikkim": "East District",
    "East Singhbhum": "Purbi Singhbhum",
    "Ferozepur": "Firozpur",
    "Gondia": "Gondiya",
    "Gurugram": "Gurgaon",
    "Haridwar": "Hardwar",
    "Hathras": "Mahamaya Nagar",
    "Hooghly": "Hugli",
    "Howrah": "Haora",
    "Jagatsinghpur": "Jagatsinghapur",
    "Jajpur": "Jajapur",
    "Jalore": "Jalor",
    "Janjgir Champa": "Janjgir - Champa",
    "Jhunjhunu": "Jhunjhunun",
    "Kaimur": "Kaimur (Bhabua)",
    "Kalaburagi": "Gulbarga",
    "Kanyakumari": "Kanniyakumari",
    "Kasganj": "Kanshiram Nagar",
    "Khandwa": "Khandwa (East Nimar)",
    "Khargone": "Khargone (West Nimar)",
    "Koderma": "Kodarma",
    "Kutch": "Kachchh",
    "Lahaul and Spiti": "Lahul AND Spiti",
    "Lakhimpur Kheri": "Kheri",
    "Leh": "Leh(Ladakh)",
    "Maharajganj": "Mahrajganj",
    "Malda": "Maldah",
    "Mehsana": "Mahesana",
    "Mysuru": "Mysore",
    "Narsinghpur": "Narsimhapur",
    "Nilgiris": "The Nilgiris",
    "North 24 Parganas": "North Twenty Four Parganas",
    "North Sikkim": "North District",
    "Nuh": "Mewat",
    "Panchmahal": "Panch Mahals",
    "Pauri Garhwal": "Garhwal",
    "Prayagraj": "Allahabad",
    "Puducherry": "PONDICHERRY",
    "Purulia": "Puruliya",
    "Raigad": "Raigarh",
    "S.A.S. Nagar": "Sahibzada Ajit Singh Nagar",
    "S.P.S. Nellore": "Sri Potti Sriramulu Nellore",
    "Sabarkantha": "Sabar Kantha",
    "Shivamogga": "Shimoga",
    "Shopiyan": "Shupiyan",
    "South 24 Parganas": "South Twenty Four Parganas",
    "South Sikkim": "South District",
    "Sri Muktsar Sahib": "Muktsar",
    "Tengnoupal": "Chandel",
    "Tumakuru": "Tumkur",
    "Vijayapura": "Bijapur",
    "West Champaran": "Pashchim Champaran",
    "West Sikkim": "West District",
    "West Singhbhum": "Pashchimi Singhbhum",
    "Y.S.R. Kadap": "Y.S.R",
}

In [None]:
# Invert the mapping: census spelling -> COVID-feed spelling.
new_dict = {census_name: covid_name for covid_name, census_name in old_dict.items()}

In [None]:
# Rename census districts to the spellings used in the COVID feed so the two tables can
# be joined on the district name.
# The result is assigned back to the column on purpose: calling
# `.replace(..., inplace=True)` on the Series returned by `census_2011['District name']`
# is a chained in-place update. Pandas warns about it in 2.x, and it leaves the
# DataFrame unchanged once Copy-on-Write is enabled.
census_2011['District name'] = census_2011['District name'].replace(new_dict)

In [None]:
# Inner-join census rows with the COVID snapshot on the district name; districts present
# in only one of the two tables are dropped.
# NOTE(review): the join key is the district name alone - if a name occurs in more than
# one state the join will produce duplicated rows; confirm against the data.
final_df = census_2011.merge(
    covid_19_district_agg,
    how='inner',
    left_on='District name',
    right_on='District',
)

## Dropping some columns

In [None]:
# Remove identifier columns; 'District' duplicates 'District name' after the join.
final_df = final_df.drop(columns=['District code', 'State name', 'District'])

In [None]:
# Remove the remaining COVID count columns so 'Confirmed' is the only one left.
final_df = final_df.drop(columns=['Recovered', 'Deceased', 'Other', 'Tested'])

In [None]:
# Preview the joined table after the column drops.
final_df.head()

In [None]:
# (rows, columns) of the joined table.
final_df.shape

## Removing skewness of the target column

In [None]:
# Distribution of the raw target. `sns.distplot` is deprecated in seaborn; the
# documented replacement for its histogram + KDE output is
# `histplot(..., kde=True, stat="density")`.
sns.histplot(final_df['Confirmed'], kde=True, stat="density");

In [None]:
# Sample skewness of the raw target.
final_df['Confirmed'].skew()

In [None]:
# Try log(1 + x) on the target and check how much skewness is left.
# The +1 keeps a zero count finite.
confirmed_plus_one = final_df['Confirmed'] + 1
transform_confirmed_case = np.log(confirmed_plus_one)

transform_confirmed_case.skew()

In [None]:
# Distribution of the log-transformed target. `sns.distplot` is deprecated in seaborn;
# the documented replacement for its histogram + KDE output is
# `histplot(..., kde=True, stat="density")`.
sns.histplot(transform_confirmed_case, kde=True, stat="density");

In [None]:
# Replace the target with log(1 + Confirmed).
# Not idempotent: re-running this cell applies the log a second time.
final_df = final_df.assign(Confirmed=np.log(final_df['Confirmed'] + 1))

In [None]:
# Modelling frame: everything except the district label.
df = final_df.drop(columns=['District name'])

In [None]:
# Preview final_df again; 'Confirmed' now holds the log-transformed values.
final_df.head()

## Log-transforming each feature column

In [None]:
# Apply log(1 + x) to every feature column.
# The target 'Confirmed' was already log-transformed on final_df, so it is excluded -
# by name rather than by position, so the cell stays correct if 'Confirmed' ever stops
# being the last column. The whole block is transformed in one vectorised call.
feature_cols = df.columns.drop('Confirmed')
df[feature_cols] = np.log(df[feature_cols] + 1)

# Model Building

In [None]:
# Fix NumPy's global seed, then make a reproducible 70/30 train/test split
# (the split itself is pinned by random_state).
np.random.seed(0)
df_train, df_test = train_test_split(
    df,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [None]:
# (rows, columns) of the training split.
df_train.shape

In [None]:
# Columns to min-max scale: those whose training max plus training min exceeds 1.
# The target column is not excluded here, so it is selected whenever it passes the test.
# NOTE(review): `max + min > 1` is an unusual criterion - presumably meant to skip
# columns already within [0, 1]; confirm the intent.
train_max = df_train.max()
train_min = df_train.min()
scale_cols = [
    name for name in df_train.columns
    if (train_max[name] + train_min[name]) > 1
]

In [None]:
# Number of columns selected for scaling.
len(scale_cols)

In [None]:
# Summary statistics of the to-be-scaled columns, before scaling.
df_train[scale_cols].describe()

## Scaling

In [None]:
# Min-max scaler; it is fitted on the training split and reused for the test split.
scaler = MinMaxScaler()

In [None]:
# Learn each column's min/max from the training split and rescale it in place.
scaled_train_values = scaler.fit_transform(df_train[scale_cols])
df_train[scale_cols] = scaled_train_values

In [None]:
# Summary statistics of the training split after scaling.
df_train.describe()

### Divide into X and Y

In [None]:
# Extract the target; `pop` also removes the column from df_train, so this cell cannot
# be re-run without re-creating df_train.
y_train = df_train.pop('Confirmed')

In [None]:
# With the target popped, the remaining columns are the features.
# X_train is an alias of df_train, not a copy.
X_train = df_train

In [None]:
# (rows, feature columns) of the training features.
X_train.shape

## RFE for feature selection

In [None]:
# Recursive feature elimination: repeatedly fit a linear regression and discard the
# weakest feature until 10 remain.
# `n_features_to_select` is passed by keyword because current scikit-learn releases no
# longer accept it positionally. The estimator is not fitted beforehand: RFE fits its
# own clone, so a prior `lm.fit(...)` has no effect on the selection.
lm = LinearRegression()

rfe = RFE(lm, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Names of the features RFE kept (`support_` is the boolean selection mask).
# `col` is reused below to subset both the train and the test features.
col = X_train.columns[rfe.support_]
col

### Building Model using Linear regression

In [None]:
# Training features restricted to the RFE-selected columns.
X_train_rfe = X_train[col]

In [None]:
# linear regression
# Ordinary least squares on the RFE-selected features.
lm = LinearRegression().fit(X_train_rfe, y_train)

# In-sample fit quality: R^2 on the training split, rounded to two decimals.
y_train_pred = lm.predict(X_train_rfe)
train_r2 = metrics.r2_score(y_true=y_train, y_pred=y_train_pred)
round(train_r2, 2)

## Making Prediction

In [None]:
# Rescale the test split with the min/max learned from the training split (no refit).
scaled_test_values = scaler.transform(df_test[scale_cols])
df_test[scale_cols] = scaled_test_values

In [None]:
# Extract the test target; `pop` also removes the column from df_test, so this cell
# cannot be re-run without re-creating df_test.
y_test = df_test.pop('Confirmed')

In [None]:
# Test features restricted to the same RFE-selected columns as the training features.
X_test = df_test[col]

In [None]:
# Summary statistics of the scaled test features.
X_test.describe()

In [None]:
# Out-of-sample fit quality: R^2 of the linear model on the test split.
y_pred = lm.predict(X_test)
r_squared = metrics.r2_score(y_true=y_test, y_pred=y_pred)
round(r_squared, 2)

Linear Regression model parameters

In [None]:

# Intercept followed by the coefficients, each rounded to three decimals.
model_parameters = [round(value, 3) for value in [lm.intercept_, *lm.coef_]]

# Matching labels: "constant" for the intercept, then the feature names.
cols = X_train_rfe.columns.insert(0, "constant")

# Keep the terms whose rounded value is non-zero and show them sorted by value.
final_list = [
    (label, value)
    for label, value in zip(cols, model_parameters)
    if value != 0
]
sorted(final_list, key=lambda pair: pair[1])

# Advanced Regression
## Lasso

In [None]:
# (rows, selected features) of the training matrix used by the regularised models.
X_train_rfe.shape

In [None]:
# (rows, selected features) of the test matrix.
X_test.shape

In [None]:
# Summary statistics of the selected training features.
X_train_rfe.describe()

## GridSearchCV to find the optimal hyper-parameter

In [None]:
# set up cross validation scheme
l_folds = KFold(n_splits = 5, shuffle = True, random_state = 101)

# specify range of hyperparameters
l_params = {'alpha': [0.0001,0.0004,0.0005,0.0008,0.001,0.01, 1.0, 5.0, 10.0]}

# grid search
# lasso model
l_model = Lasso(max_iter=1000000)
l_model_cv = GridSearchCV(estimator = l_model, param_grid = l_params, 
                        scoring= 'r2', 
                        cv = l_folds, 
                        return_train_score=True,
                          verbose = 1)            
l_model_cv.fit(X_train_rfe, y_train) 

In [None]:
# Tabulate the lasso grid-search results (one row per candidate alpha).
l_cv_results = pd.DataFrame(l_model_cv.cv_results_)

In [None]:
# Gap between mean train and mean validation R^2 for each alpha.
l_cv_results['test_train_diff'] = l_cv_results['mean_train_score'].sub(
    l_cv_results['mean_test_score']
)

lasso_summary_columns = ['param_alpha', 'mean_test_score', 'mean_train_score', 'test_train_diff']
l_cv_results[lasso_summary_columns]

In [None]:
# Plot mean train and validation R^2 against alpha for the lasso grid search.
# A legend and a title are included so the two curves can be told apart and the figure
# stands on its own, matching the ridge plot further down.
l_cv_results['param_alpha'] = l_cv_results['param_alpha'].astype('float32')
plt.plot(l_cv_results['param_alpha'], l_cv_results['mean_train_score'])
plt.plot(l_cv_results['param_alpha'], l_cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('r2 score')
plt.xscale('log')  # the alpha grid spans several orders of magnitude
plt.title("r2 score and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Alpha of the best lasso estimator found by the grid search.
l_model_cv.best_estimator_.alpha

Running lasso with the best parameter found by the grid search

In [None]:
from time import time

In [None]:
# Refit a lasso at the selected alpha, timing the fit and the test-set prediction and
# reporting R^2 on both splits.
best_lasso_alpha = l_model_cv.best_estimator_.alpha
lm_lasso = Lasso(alpha=best_lasso_alpha, max_iter=1000000)

t0 = time()
lm_lasso.fit(X_train_rfe, y_train)
print("training time:", round(time() - t0, 3), "s")

# R^2 on the training split
y_train_pred = lm_lasso.predict(X_train_rfe)
train_score = metrics.r2_score(y_true=y_train, y_pred=y_train_pred)
print("train accuracy:", round(train_score, 2))

# R^2 on the test split
t1 = time()
y_test_pred = lm_lasso.predict(X_test)
print("predict time:", round(time() - t1, 3), "s")
test_score = metrics.r2_score(y_true=y_test, y_pred=y_test_pred)
print("test accuracy:", round(test_score, 2))

In [None]:
# lasso model parameters
model_parameters = list(lm_lasso.coef_)
model_parameters.insert(0, lm_lasso.intercept_)
model_parameters = [round(x, 3) for x in model_parameters]
cols = X_train_rfe.columns
cols = cols.insert(0, "constant")
final_list = [i for i in list(zip(cols, model_parameters)) if i[1] != 0]

In [None]:
# Number of lasso terms (including the constant) with a non-zero rounded value.
len(final_list)

In [None]:
# Lasso terms ordered from most negative to most positive value.
sorted(final_list, key = lambda x: x[1])

## Ridge

In [None]:
# set up cross validation scheme
folds = KFold(n_splits = 5, shuffle = True, random_state = 105)

# specify range of hyperparameters
params = {'alpha': [0.0001,0.0004,0.0005,0.0008,0.001,0.01, 1.0, 5.0, 10.0,50.0,100.0]}

# grid search
# lasso model
model = Ridge()
model_cv = GridSearchCV(estimator = model, param_grid = params, 
                        scoring= 'r2', 
                        cv = folds, 
                        return_train_score=True, verbose = 1)            
model_cv.fit(X_train_rfe, y_train) 

In [None]:
# Tabulate the ridge grid-search results (one row per candidate alpha).
cv_results = pd.DataFrame(model_cv.cv_results_)

In [None]:
# Gap between mean train and mean validation R^2 for each alpha.
cv_results['test_train_diff'] = cv_results['mean_train_score'].sub(
    cv_results['mean_test_score']
)

ridge_summary_columns = ['param_alpha', 'mean_test_score', 'mean_train_score', 'test_train_diff']
cv_results[ridge_summary_columns]

In [None]:
# Plot mean train and validation R^2 against alpha for the ridge grid search.
# The alphas must stay floating point: casting them to int32 truncates every value
# below 1 (0.0001 ... 0.01) to 0, which stacks those grid points at x = 0 and
# misrepresents the curve.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')

plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('r2 score')
plt.xscale('log')  # the grid spans 1e-4 to 1e2, unreadable on a linear axis
plt.title("r2 score and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Alpha of the best ridge estimator found by the grid search.
model_cv.best_estimator_.alpha

Running ridge with the best parameter found by the grid search.

In [None]:
# Refit a ridge model at the selected alpha and print R^2 on train, then on test.
best_ridge_alpha = model_cv.best_estimator_.alpha
ridge = Ridge(alpha=best_ridge_alpha)
ridge_lm = ridge.fit(X_train_rfe, y_train)

# R^2 on the training split
y_train_pred = ridge_lm.predict(X_train_rfe)
ridge_train_r2 = metrics.r2_score(y_true=y_train, y_pred=y_train_pred)
print(round(ridge_train_r2, 2))

# R^2 on the test split
y_test_pred = ridge_lm.predict(X_test)
ridge_test_r2 = metrics.r2_score(y_true=y_test, y_pred=y_test_pred)
print(round(ridge_test_r2, 2))

# Top census variables influencing COVID-19 in India (in sorted order):
* Ownership_Owned_Households ---------- How many households own the house they live in ---- Negative influence
* Population -------------------------- District population -------------------------------- Positive influence
* Households_with_Internet ------------ Households with internet access -------------------- Positive influence