## A Much Improved Model
    

# 1. Problem Description

In [None]:
# Improve our model to a greater extent by finding the best method,
# using different regression models such as Random Forest Regression / Decision Tree Regression / Linear Regression

# 2. Existing Model

In [None]:
import pandas as pd
from matplotlib import pyplot as plt 
import seaborn as sns
import datetime as dt
import numpy as np
%matplotlib inline

In [None]:
# Read the state-wise COVID-19 table, parsing the 'Date' column as day-first dates.
df = pd.read_csv(
    '../input/covid19-in-india/covid_19_india.csv',
    dayfirst=True,
    parse_dates=['Date'],
)
df.head()

In [None]:
# Keep only the columns used below and give them short lowercase names.
# The rename is an explicit old -> new mapping (labels that are absent are
# ignored by DataFrame.rename) and the selection uses the new names, so
# re-running this cell on the already-renamed frame is a no-op instead of a
# KeyError on the original column names.
column_names = {
    'Date': 'date',
    'State/UnionTerritory': 'state',
    'Cured': 'cured',
    'Deaths': 'deaths',
    'Confirmed': 'confirmed',
}
df = df.rename(columns=column_names)[list(column_names.values())]
df.head()

In [None]:
# Let's look at a single date: every row recorded on 2020-08-09
is_snapshot_day = df['date'] == '2020-08-09'
august = df[is_snapshot_day]
august.head()

In [None]:
# Order the snapshot from the most to the fewest confirmed cases.
max_confirmed_cases = august.sort_values("confirmed", ascending=False)
max_confirmed_cases.head()

In [None]:
# Visualizations

In [None]:
# Top 5 states affected: the first five rows of the descending sort
top_states_confirmed = max_confirmed_cases.iloc[:5]

In [None]:
# Bar plot of confirmed cases for the five selected states
sns.barplot(
    data=top_states_confirmed,
    x="state",
    y="confirmed",
    hue="state",
    palette="coolwarm",
)

In [None]:
# Order the snapshot from the most to the fewest deaths.
max_cases_death = august.sort_values("deaths", ascending=False)
max_cases_death.head()

In [None]:
# Top 5 states by deaths: the first five rows of the descending sort
top_states_death = max_cases_death.iloc[0:5]

In [None]:
# Bar plot of deaths for the five selected states
sns.barplot(
    data=top_states_death,
    x="state",
    y="deaths",
    hue="state",
    palette="coolwarm",
)

In [None]:
# Order the snapshot from the most to the fewest cured cases.
max_cases_cured = august.sort_values("cured", ascending=False)
max_cases_cured.head()

In [None]:
# States with the most cured cases: the first five rows of the descending sort
top_states_cured = max_cases_cured.iloc[:5]

In [None]:
# Bar plot of cured cases for the five selected states
sns.barplot(
    data=top_states_cured,
    x="state",
    y="cured",
    hue="state",
    palette="coolwarm",
)

Checking how good the predictions are

### Let's Predict for Uttar Pradesh

In [None]:
# Work on an independent copy of the Uttar Pradesh rows: `up` is modified
# further down (date conversion, column drop), and an explicit copy keeps
# those edits away from `df` without relying on silenced pandas warnings.
up = df[df.state == 'Uttar Pradesh'].copy()
up.head()

In [None]:
# Confirmed cases in Uttar Pradesh over time (blue line)
sns.lineplot(data=up, x="date", y="confirmed", color="b")

In [None]:
# Deaths in Uttar Pradesh over time (red line)
sns.lineplot(data=up, x="date", y="deaths", color="r")

In [None]:
# Making date a numeric feature: each date is replaced by its proleptic
# Gregorian ordinal (datetime.toordinal) so it can be fed to the regressors.
#
# The dates are taken from `df` (same row labels as `up`), which still holds
# the original values, so re-running this cell does not try to re-parse
# already-converted ordinals. `up` is made an explicit copy first, so there is
# no chained-assignment warning to silence globally.
up = up.copy()
up['date'] = pd.to_datetime(df.loc[up.index, 'date']).map(dt.datetime.toordinal)
up.head()

In [None]:
# Remove the 'state' column (every row of `up` is Uttar Pradesh) by rebinding
# `up` instead of mutating in place; errors='ignore' keeps the cell re-runnable
# once the column is already gone.
up = up.drop(columns='state', errors='ignore')

In [None]:
# Baseline model: 'confirmed' is the dependent variable, every other remaining
# column of `up` is an independent variable.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

y = up['confirmed']
X = up.drop(columns='confirmed')

# 70/30 random split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=101, test_size=0.3
)

lm = LinearRegression()
lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

# Actual vs. predicted confirmed cases on the held-out rows
plt.scatter(y_test, predictions)

In [None]:
# Checking how good the baseline predictions are on the held-out rows
from sklearn import metrics

baseline_mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', baseline_mse)
print('RMSE:', np.sqrt(baseline_mse))

### Let's see if we can improve our model

# 3. Proposed Model 

In [None]:
#To improve our previous model

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Using different models: the candidate regressors to compare
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Every estimator that uses randomness gets a fixed seed so the comparison is
# reproducible across runs; 42 matches the seed used for the
# RandomForestRegressor in the grid-search cells.
RANDOM_STATE = 42

# NOTE: the list holds regressors; the name `classifiers` is kept because the
# cross-validation cell iterates over it.
classifiers = [
    SGDRegressor(max_iter=100000, random_state=RANDOM_STATE),
    LinearSVR(max_iter=100000, random_state=RANDOM_STATE),
    LinearRegression(),
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    RandomForestRegressor(random_state=RANDOM_STATE),
]

In [None]:
# Evaluating each candidate with 10-fold cross-validation on the training split
from sklearn.model_selection import cross_val_score

for clf in classifiers:
    # cross_val_score fits its own clones of `clf`; this call only leaves the
    # estimator object itself fitted on the full training split.
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    scores = cross_val_score(clf, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
    # The scorer returns *negated* MSE per fold; flip the sign and take the
    # square root so the figures below are RMSE (lower is better). The
    # statistics are taken over rmse_scores rather than the raw negative scores.
    rmse_scores = np.sqrt(-scores)
    print("Mean: ", rmse_scores.mean())
    print("Std deviation: ", rmse_scores.std())

print("="*30)

In [None]:
# Linear Regression has the lowest mean error, but let's try Random Forest Regression since it has the lowest standard deviation,
# and see whether changing the number of estimators and other parameters can improve our model.

In [None]:
# We will use Grid Search to find the best setting (number of estimators) for Random Forest Regression
from sklearn.model_selection import GridSearchCV

# Forest sizes to try; param_grid holds the settings the search iterates over
n_estimators_options = [3, 10, 30, 50, 100, 200, 300]
param_grid = [{'n_estimators': n_estimators_options}]

rnd_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rnd_reg,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter setting selected by the grid search above
grid_search.best_params_

## 4. Result Comparison

In [None]:
# Predictions with the best estimator found by the grid search
best_reg = grid_search.best_estimator_
y_pred = best_reg.predict(X_test)

final_mae = mean_absolute_error(y_test, y_pred)
final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)

for label, value in (
    ("Final MAE:", final_mae),
    ("Final MSE:", final_mse),
    ("Final RMSE:", final_rmse),
):
    print(label, value)

In [None]:
# Our initial predictions (baseline linear regression), for side-by-side comparison
initial_mse = metrics.mean_squared_error(y_test, predictions)
print('Initial MAE:', metrics.mean_absolute_error(y_test, predictions))
print('Initial MSE:', initial_mse)
print('Initial RMSE:', np.sqrt(initial_mse))

In [None]:
# Our new model performs much better: baseline error minus tuned-model error
baseline_mae = metrics.mean_absolute_error(y_test, predictions)
baseline_mse = metrics.mean_squared_error(y_test, predictions)
print('Reduced MAE:', baseline_mae - final_mae)
print('Reduced MSE:', baseline_mse - final_mse)
print('Reduced RMSE:', np.sqrt(baseline_mse) - final_rmse)

In [None]:
# Compare the mean of the true test targets with the mean prediction of each
# model on the same rows: `predictions` comes from the baseline linear
# regression, `y_pred` from the tuned random forest.
print("Actual mean (test set): ", y_test.mean())
print("Our original model mean: ", predictions.mean())
print("Our proposed model mean: ", y_pred.mean())

In [None]:
# It seems pretty good!

In [None]:
# Scatter plot for the new model: actual values on x, predicted values on y
plt.scatter(x=y_test, y=y_pred)

In [None]:
# Distribution of the residuals (actual - predicted) of the tuned model.
# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay draws the equivalent figure.
plt.figure(figsize=(20,8))
residuals = y_test - y_pred
sns.histplot(residuals, bins=200, stat="density", kde=True);

 # 5. Conclusion

In [None]:
# After running the cross-validation loop below

# from sklearn.model_selection import cross_val_score
# for clf in classifiers:
#     clf.fit(X_train, y_train)
#     name = clf.__class__.__name__
#     print("="*30)
#     print(name)
#     scores = cross_val_score(clf, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
#     rmse_scores = np.sqrt(-scores)
#     print("Mean: ", scores.mean())
#     print("Std deviation: ", scores.std())
    
# print("="*30)

# Linear Regression seems to perform better than Random Forest Regression,
# but after adjusting some parameters Random Forest Regression performs much better than that!

# 6. Future Direction

In [None]:
#Tweak some more settings in Random Forest Regression like finding out the best estimator using the code below

In [None]:
from sklearn.model_selection import GridSearchCV

# Finer search: forest sizes 100, 110, ..., 240
n_estimators_options = list(range(100, 250, 10))
param_grid = [{'n_estimators': n_estimators_options}]

rnd_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rnd_reg,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter setting selected by the finer grid search above
grid_search.best_params_

In [None]:
# It seems we found a better estimator; previously it was 200.
# Likewise, we can add more settings such as max_features to improve our model.