# Covid-19 predictor using Linear Regression and Random Forest Regression

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

### 1. Load the dataset from https://covid.ourworldindata.org/data/owid-covid-data.csv

In [None]:
# Read the downloaded OWID COVID-19 CSV from the local input directory into df1.
df1=pd.read_csv("../input/coviddata/owid-covid-data.csv")

In [None]:
# Preview the first rows of the raw table.
df1.head()

In [None]:
# Print the column list with dtypes and non-null counts for the raw table.
df1.info()

### 2. Subset only those rows that have “India” in the “location” column. This subsetted dataframe will be used for modelling.

In [None]:
# Count how many rows belong to India before subsetting.
india_rows = df1['location'].eq('India').sum()
print("There are", india_rows, "entries of India in the location column in the dataset.")

In [None]:
# Check whether any row is missing its location label.
missing_locations = df1['location'].isna().sum()
print("There are", missing_locations, "null values in the location column.")

In [None]:
# Share of India rows among all rows that have a location, in percent.
india_total = (df1['location'] == 'India').sum()
labelled_total = df1['location'].notnull().sum()
india_share = india_total / labelled_total * 100
print(india_share, "% of rows contain India in the location column.")

In [None]:
# Work on a copy so the raw table in df1 is left untouched by later in-place edits.
df=df1.copy()

In [None]:
# Keep only the India rows: collect the labels of every other row and drop them in place.
non_india_labels = df.index[df['location'] != 'India']
df.drop(index=non_india_labels, inplace=True)

In [None]:
# Preview the first rows of the India-only subset.
df.head()

In [None]:
# Build a pandas-profiling report for the India subset; as the cell's last
# expression, the report object is what the notebook renders.
ProfileReport(df)

### 3. Univariate Analysis:

#### a. Find mean, median and mode of each column

In [None]:
# Names of every column whose dtype is not `object` (treated as numeric below).
non_object_frame = df.select_dtypes(exclude=['object'])
var_num = list(non_object_frame.columns)
var_num

In [None]:
# Summary statistics per numeric column.
#   dict1: column name -> {'mean', 'median', 'mode', 'min', 'max'}
#   list1: numeric columns whose observed values are not all identical
dict1 = {}
list1 = []
for i in var_num:
    observed = df[i].dropna()

    if observed.empty:
        # A column with no observed values has no mode (value_counts() is
        # empty, so index[0] would raise IndexError) and a NaN range, which
        # would wrongly pass the "!= 0" test. Record NaN statistics and do
        # not treat the column as a varying feature.
        dict1[i] = {
            'mean': np.nan,
            'median': np.nan,
            'mode': np.nan,
            'min': np.nan,
            'max': np.nan,
        }
        continue

    lowest = observed.min()
    highest = observed.max()
    dict1[i] = {
        'mean': observed.mean(),
        'median': observed.median(),
        # value_counts() sorts by frequency, so the first label is the mode.
        'mode': observed.value_counts().index[0],
        'min': lowest,
        'max': highest,
    }

    # A zero spread means the column is constant and carries no information.
    # (Named `spread`, not `range`, to avoid shadowing the builtin.)
    spread = highest - lowest
    if spread != 0:
        list1.append(i)

In [None]:
import json  # kept: other cells may still rely on this import

# Build the statistics table straight from the nested dict: one column per
# feature, one row per statistic. The previous JSON round-trip failed with
# TypeError on numpy integer statistics (json.dumps cannot serialise them)
# and passed a literal JSON string to pd.read_json, which is deprecated.
df2 = pd.DataFrame(dict1)

In [None]:
# Show the per-column statistics table (rows are the statistics, columns the features).
df2.head()

In [None]:
# list1 holds the numeric columns whose max-min range was non-zero.
print("The list of numeric features without constant value is :\n",list1)

#### b. Draw histograms of each numerical variable

In [None]:
# One 10-bin histogram per non-constant numeric feature, each in its own figure.
for feature in list1:
    print(feature)
    # The figure is keyed by the feature name.
    plt.figure(feature)
    values = df[feature]
    plt.hist(values, bins=10)
    plt.show()

### 4. Bivariate Analysis:

#### a. Draw scatter plots of each numerical column versus one another

In [None]:
# Scatter plot of every ordered pair of distinct numeric features.
# plt.figure(i) selects the figure labelled with the x feature, so all plots
# that share the same x column are drawn on that one figure.
for i in list1:
    for j in list1:
        if i != j:
            plt.figure(i)
            # Pass the data by keyword: current seaborn releases no longer
            # accept x and y as positional arguments.
            sns.scatterplot(x=df[i], y=df[j])

#### b. Draw line plots of each numerical column versus one another

In [None]:
# Line plot of every ordered pair of distinct numeric features.
# plt.figure(i) selects the figure labelled with the x feature, so all plots
# that share the same x column are drawn on that one figure.
for i in list1:
    for j in list1:
        if i != j:
            plt.figure(i)
            # Pass the data by keyword: current seaborn releases no longer
            # accept x and y as positional arguments.
            sns.lineplot(x=df[i], y=df[j])

### 5. Handle Missing values:

In [None]:
# Heatmap of the boolean null mask of df (row labels and colour bar hidden).
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# Columns that contain at least one missing value.
feature_with_na = [name for name in df.columns if df[name].isnull().any()]
feature_with_na

In [None]:
# Report the percentage of missing values for each affected column.
for i in feature_with_na:
    # Scale to percent first and round last; rounding before multiplying by
    # 100 printed float artifacts such as 12.340000000000002.
    missing_pct = round(df[i].isnull().mean() * 100, 2)
    print(i, ":", missing_pct, "% missing values")

#### a. If there are null values in numerical column, replace the null values by the mean of that column

In [None]:
# Columns with missing values whose dtype is not `object`.
numerical_with_na = [
    name for name in feature_with_na if df[name].dtypes != 'O'
]

In [None]:
# Display the non-object columns that contain missing values.
numerical_with_na

In [None]:
# Replace missing numeric values with the mean of their column.
# NOTE: a column with no observed values has a NaN mean, so it stays NaN.
for column in numerical_with_na:
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [None]:
# Re-plot the null mask after the numeric mean fill to see what is still missing.
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

#### b. If there are null values in categorical column, replace the null values by the mode of that column

In [None]:
# Columns with missing values whose dtype is `object`.
categorical_with_na = [
    name for name in feature_with_na if df[name].dtypes == 'O'
]

In [None]:
# List the object-dtype columns that contain missing values.
print("Categorical features with null values are :\n",categorical_with_na)

In [None]:
# Replace missing categorical values with the most frequent value of the column.
for i in categorical_with_na:
    # value_counts() ignores NaN and sorts by frequency, so the first label
    # is the mode.
    frequencies = df[i].value_counts()
    if frequencies.empty:
        # The column has no observed value at all, so there is no mode to
        # fill with; indexing [0] here would raise IndexError.
        continue
    df[i] = df[i].fillna(frequencies.index[0])

In [None]:
# Re-plot the null mask after the categorical mode fill.
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

### 6. Convert date column to ordinal

In [None]:
import datetime as dt

# Parse the date strings, then store each date as its proleptic Gregorian
# ordinal (an integer) so the column can be used as a numeric feature.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates.map(dt.datetime.toordinal)

In [None]:
# Preview the frame after the date column was converted to ordinals.
df.head()

### 7. Drop all categorical columns

In [None]:
# Names of every column that still has the `object` dtype.
object_frame = df.select_dtypes(include=['object'])
var_cat = list(object_frame.columns)
var_cat

In [None]:
# df3 is df without the object-dtype columns; df itself is not modified.
df3 = df.drop(columns=var_cat)
df3.head()

### 8. Select “total_cases” column as the target variable

In [None]:
# Target vector: the total_cases column as a NumPy array.
y=df3['total_cases'].values

### 9. Select the other columns as the features (the “date” column must be among the features)

In [None]:
# Feature matrix: every remaining column except the target.
df4 = df3.drop(columns='total_cases')
X = df4.values

### 10. Perform train-test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=0)

### 11. Modelling:

#### a. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression with default settings on the training split.
regressor = LinearRegression()  
regressor.fit(X_train, y_train) 

In [None]:
# Linear-regression predictions for the held-out test rows.
y_pred = regressor.predict(X_test)

#### b. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest of 500 trees on the training split; random_state is
# fixed so the fitted forest is reproducible.
model = RandomForestRegressor(n_estimators=500,random_state=0)
model.fit(X_train, y_train) 

In [None]:
# Random-forest predictions for the held-out test rows.
y_pred_rf=model.predict(X_test)

### 12. Evaluate model performance

In [None]:
print("Accuracy for Linear Regression :\n")
from sklearn.metrics import classification_report,accuracy_score
from sklearn import metrics

# Error metrics of the linear model on the test split.
mae_lr = metrics.mean_absolute_error(y_test, y_pred)
mse_lr = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae_lr)
print('Mean Squared Error:', mse_lr)
print('Root Mean Squared Error:', np.sqrt(mse_lr))

# Bar chart comparing actual and predicted values for the first 50 test rows.
df5 = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
dfpred = df5.iloc[:50]
dfpred.plot(kind='bar', figsize=(16, 10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
print("Accuracy for Random Forest Regression :\n")
from sklearn.metrics import classification_report,accuracy_score
from sklearn import metrics

# Report the same error metrics as the linear-regression cell so the two
# models can be compared directly.
mse_rf = metrics.mean_squared_error(y_test, y_pred_rf)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_rf))
print('Mean Squared Error:', mse_rf)
print('Root Mean Squared Error:', np.sqrt(mse_rf))
# The R^2 score was previously printed under the label "Root Mean Squared
# Error"; it is now labelled as what it is.
print('R2 Score:', metrics.r2_score(y_test, y_pred_rf))

# Bar chart comparing actual and predicted values for the first 50 test rows.
df6 = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred_rf.flatten()})
dfpred = df6.head(50)
dfpred.plot(kind='bar',figsize=(16,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()