### Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the datasets attached to this notebook are visible in the cell output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

### Read Data

In [None]:
# Load the sample submission, the test set and the training set (in that order)
# from the competition's input directory.
sub, test, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/covid19-local-us-ca-forecasting-week-1/ca_submission.csv",
        "../input/covid19-local-us-ca-forecasting-week-1/ca_test.csv",
        "../input/covid19-local-us-ca-forecasting-week-1/ca_train.csv",
    )
)

### EDA train

In [None]:
# Report the (rows, columns) size of the raw training frame ...
print(train.shape)
# ... and display its first five rows as the cell output.
train.head(5)

In [None]:
# Keep only the rows that have at least one confirmed case; rows with a
# zero count are dropped before plotting and fitting.
train = train.loc[train["ConfirmedCases"].gt(0)]
print(train.shape)
train.head(5)

### Visualization Id vs Confirmed Cases

In [None]:
# Confirmed cases against the row Id.
# The data vectors are passed by keyword: seaborn 0.12+ no longer accepts
# x and y positionally and raises a TypeError if they are given that way.
sns.lineplot(x=train.Id, y=train.ConfirmedCases)

### Visualization Id vs log(Confirmed Cases)

In [None]:
# Natural log of confirmed cases against the row Id, with a fitted regression line.
# The data vectors are passed by keyword: seaborn 0.12+ no longer accepts
# x and y positionally and raises a TypeError if they are given that way.
sns.regplot(x=train.Id, y=np.log(train.ConfirmedCases))

### Evaluation of Regression fit

In [None]:
# Fit log(ConfirmedCases) as a linear function of Id and report the
# in-sample R-squared of that fit.
model_1 = LinearRegression()
x1 = train[["Id"]].to_numpy()          # single feature as an (n, 1) array
y1 = np.log(train["ConfirmedCases"])
model_1.fit(x1, y1)
print("R-squared score : ", model_1.score(x1, y1))

# The slope is the change in log(cases) per unit of Id, so e**slope is the
# multiplicative growth factor per Id step.
slope = model_1.coef_[0]
gr = np.power(np.e, slope)
print("Growth Factor : ", gr)
growth_pct = round((gr - 1) * 100, 2)
print(f"Growth Rate : {growth_pct}%")

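* R-squared score of 0.9968980433945356 for the log-linear fit shows that confirmed cases are increasing approximately exponentially with the index (date)
* Growth rate for the period is 22.64% per Id step (i.e. per day, if consecutive Ids are consecutive dates)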

### Confirmed Cases vs Fatalities

In [None]:
# Fatalities against confirmed cases, with a fitted regression line.
# The data vectors are passed by keyword: seaborn 0.12+ no longer accepts
# x and y positionally and raises a TypeError if they are given that way.
sns.regplot(x=train.ConfirmedCases, y=train.Fatalities)

### Evaluation of Regression fit

In [None]:
# Fit Fatalities as a linear function of ConfirmedCases and report the
# in-sample R-squared of that fit.
model_2 = LinearRegression()
x2 = train[["ConfirmedCases"]].to_numpy()   # single feature as an (n, 1) array
y2 = train["Fatalities"]
model_2.fit(x2, y2)
print("R-Squared Score= ", model_2.score(x2, y2))

From the train data, the number of fatalities is roughly directly proportional to the number of confirmed cases.
* Further exploration needs to be done to check how this relationship behaves when confirmed cases overwhelm the health infrastructure.
* This can be done using datasets from heavily affected areas with comparable health infrastructure, such as Italy, France, Spain, etc.

### EDA test data

In [None]:
# Display the first five rows of the test frame as the cell output.
test.head(5)

In [None]:
# Give the test frame an "Id" column on the same scale as train.Id so the two
# frames share a key.
# NOTE(review): the offset of 50 between ForecastId and Id is hard-coded;
# presumably it was read off the data - confirm it still holds if the input
# files change.
ID_OFFSET = 50
test["Id"] = test["ForecastId"] + ID_OFFSET
test.head(5)

### Predicting the Confirmed Cases and Fatalities for test data

*Assumptions*
* Rate of increase in confirmed cases for the ensuing period = 22.64% per Id step
* Fatalities change linearly with confirmed cases

In [None]:
# Step 1: predict log(ConfirmedCases) from Id with the log-linear model.
id_feature = test[["Id"]].to_numpy()
test["LogConf"] = model_1.predict(id_feature)

# Step 2: back-transform to counts; floor-dividing by 1 drops the fractional part.
test["ConfirmedCases"] = np.exp(test["LogConf"]).floordiv(1)

# Step 3: predict fatalities from the (floored) predicted case counts,
# floored the same way.
cases_feature = test[["ConfirmedCases"]].to_numpy()
test["Fatalities"] = np.floor_divide(model_2.predict(cases_feature), 1)
test

### What is the point of predicting the past?

In [None]:
#Wherever confirmed cases and fatalities are available in train data, update it into test data
for id in train.Id:
    test.ConfirmedCases[test.Id==id]=train.ConfirmedCases[train.Id==id].sum()
    test.Fatalities[test.Id==id]=train.Fatalities[train.Id==id].sum()
test

In [None]:
### Prepare submission file

In [None]:
# Copy the final predictions into the submission frame and write it to disk.
# Bracket assignment is used on purpose: attribute-style assignment
# (sub.ConfirmedCases = ...) only sets a plain Python attribute when the column
# does not already exist, which would leave the written CSV without predictions.
# Values are aligned on the row index of `sub` and `test`.
sub["ConfirmedCases"] = test["ConfirmedCases"]
sub["Fatalities"] = test["Fatalities"]
sub.to_csv("submission.csv", index=False)