In [None]:
#libraries importing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O 
import matplotlib.pyplot as plt #Visualization
import types
from botocore.client import Config

#---------------------Data importing

In [None]:
#Data set import
# Read the COVID-19 observations CSV into `data` and preview the first rows.
csv_path = "../input/novel-corona-virus-2019-dataset/covid_19_data.csv"
data = pd.read_csv(csv_path)
data.head()


In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
#dataset summary: column names, non-null counts and dtypes
data.info()
#3  float64 columns and 4 categorical columns

In [None]:
#dataset summary: descriptive statistics as reported by describe()
data.describe()

#---------------------Data Exploration-cleansing-converting-Correcting Completing

In [None]:
#convert the Confirmed, Deaths and Recovered columns to int values
for column in ('Confirmed', 'Deaths', 'Recovered'):
    data[column] = data[column].astype(int)

#Clean Data: remove serial number column
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because 'SNo' has already been dropped.
data.drop(columns=['SNo'], inplace=True, errors='ignore')
#Replace Mainland China with China
# NOTE(review): every other cell refers to the column as 'Country/Region', so
# with the key 'Country' this replace presumably matches nothing. It is left
# unchanged on purpose: later cells still filter on 'Mainland China' and would
# return no rows if the replacement took effect — confirm the intended name
# and update those filters together with this line.
data.replace({'Country': 'Mainland China'}, 'China', inplace=True)
data.info()

In [None]:
#check whether any column contains missing values (one boolean per column)
data.isna().any()


In [None]:
#Find rows with at least one missing value
has_missing = data.isna().any(axis=1)
data[has_missing]


In [None]:
#Most affected countries: sum of the 'Confirmed' column per country/region, largest first
print('Most affected countries are:',data.groupby('Country/Region')['Confirmed'].sum().sort_values(ascending=False))
#Most affected Province/State (the label previously said "countries")
print('Most affected provinces/states are:',data.groupby('Province/State')['Confirmed'].sum().sort_values(ascending=False))
# Totals for the 'Mainland China' rows. Only the three count columns are summed;
# a bare .sum() over the whole frame would also try to add up the text columns.
print('Most affected country is China with:',data.loc[data['Country/Region'] == 'Mainland China', ['Confirmed', 'Deaths', 'Recovered']].sum())

In [None]:
#Countries/regions affected by the virus: the distinct values of the column (not their count)
data['Country/Region'].unique()

#---------------------Dataset Visualisation

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Default size for every later figure, then open one figure of that size.
plt.rcParams.update({"figure.figsize": (16, 9)})
plt.figure(figsize=plt.rcParams["figure.figsize"]);


In [None]:
# Bar chart of the column totals of the three count columns.
count_totals = data[['Confirmed', 'Deaths', 'Recovered']].sum()
count_totals.plot.bar()

In [None]:
# Number of rows recorded per country/region, with slanted tick labels.
plt.figure(figsize=(12, 7))
chart = sns.countplot(x='Country/Region', data=data, palette='Set1')
tick_style = dict(rotation=45, horizontalalignment='right', fontweight='light')
chart.set_xticklabels(chart.get_xticklabels(), **tick_style);

In [None]:
#data[data['Country/Region'] != 'Mainland China'].groupby('ObservationDate').max().plot(kind='line') #except China

In [None]:
#The number of persons with confirmed virus is increasing. Good news: the number of deaths is stable.

In [None]:
# Line chart of the per-date maximum of each column over the 'Mainland China' rows.
is_mainland_china = data['Country/Region'] == 'Mainland China'
data[is_mainland_china].groupby('ObservationDate').max().plot.line() #In China

#---------------------Prediction --

In [None]:
# 80/20 split sizes over the number of distinct observation dates.
n_dates = data.groupby('ObservationDate')['Confirmed'].sum().reset_index().shape[0]
train = int(n_dates * 0.8)
test = n_dates - train

In [None]:
confirmed_training_dataset = pd.DataFrame(data.groupby('ObservationDate')['Confirmed'].sum().reset_index()).rename(columns={'Date': 'x', 'Confirmed': 'y'})
X_confirmed = np.array(confirmed_training_dataset.y)
#X_confirmed=X_confirmed[:, None]
X_confirmed_train=X_confirmed[:train, None]
X_confirmed_test=X_confirmed[test:, None]

In [None]:
death_training_dataset = pd.DataFrame(data.groupby('ObservationDate')['Deaths'].sum().reset_index()).rename(columns={'Date': 'x', 'Deaths': 'y'})
X_death = np.array(death_training_dataset.y)
#X_death=X_death[:, None]
X_death_train=X_death[:train, None]
X_death_test=X_death[test:, None]

In [None]:
recovered_training_dataset = pd.DataFrame(data.groupby('ObservationDate')['Recovered'].sum().reset_index()).rename(columns={'Date': 'x', 'Recovered': 'y'})
X_recovered = np.array(recovered_training_dataset.y)
#X_recovered=X_recovered[:, None]
X_recovered_train=X_recovered[:train, None]
X_recovered_test=X_recovered[test:, None]


In [None]:
#Visualization of cloud points Confirmed Vs Death
# One point per observation date: confirmed total on x, death total on y.
ax = plt.gca()
ax.scatter(X_confirmed, X_death)
ax.set_title('Cloud points: Confirmed Vs Death ')
ax.set_xlabel('Confirmed')
ax.set_ylabel('Death')
plt.show()

In [None]:
#Visualization of cloud points Confirmed Vs Recovered
# One point per observation date: confirmed total on x, recovered total on y.
ax = plt.gca()
ax.scatter(X_confirmed, X_recovered)
ax.set_title('Cloud points: Confirmed Vs recovered ')
ax.set_xlabel('Confirmed')
ax.set_ylabel('Recovered')
plt.show()

In [None]:
# Fit linear model
# Two ordinary least-squares models with the confirmed counts as the single
# feature: one targets deaths, the other recoveries.
from sklearn.linear_model import LinearRegression
reg_death1 = LinearRegression()
reg_recovered1 = LinearRegression()
reg_death1.fit(X_confirmed_train, X_death_train)
reg_recovered1.fit(X_confirmed_train, X_recovered_train);

In [None]:
# Fit Ridge Regression model
# Same feature/target pairs as the linear models, with alpha=0.5.
from sklearn import linear_model
reg_death2 = linear_model.Ridge(alpha=0.5)
reg_recovered2 = linear_model.Ridge(alpha=0.5)
reg_death2.fit(X_confirmed_train, X_death_train)
reg_recovered2.fit(X_confirmed_train, X_recovered_train);

In [None]:
# Fit LARS Lasso model
# Same feature/target pairs as the linear models, with alpha=0.1.
from sklearn import linear_model
reg_death3 = linear_model.LassoLars(alpha=0.1)
reg_recovered3 = linear_model.LassoLars(alpha=0.1)
reg_death3.fit(X_confirmed_train, X_death_train)
reg_recovered3.fit(X_confirmed_train, X_recovered_train);

In [None]:
#Prediction
# reg_death1 was fitted with confirmed counts as its feature, so the held-out
# confirmed counts (not the death counts themselves) are the input here.
reg_death1.predict(X_confirmed_test)

In [None]:
# reg_recovered1 was fitted with confirmed counts as its feature, so the
# held-out confirmed counts (not the recovered counts) are the input here.
reg_recovered1.predict(X_confirmed_test)

In [None]:
## Plot outputs
# Held-out observations (black points) against the linear model's prediction
# (green line). Both are drawn over the held-out confirmed counts; previously
# the recovered counts were plotted against themselves and the model was unused.
plt.title('Prediction: Confirmed Vs Recovered ')
plt.scatter(X_confirmed_test, X_recovered_test,  color='black')
plt.plot(X_confirmed_test, reg_recovered1.predict(X_confirmed_test), color='green', linewidth=3)
plt.xlabel('Confirmed')
plt.ylabel('Recovered')

# Tick marks stay hidden, as in the original cell.
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
## Plot outputs
# Held-out observations (black points) against the linear model's prediction
# (red line). The title now names the plotted quantity (deaths), and the line
# shows the model output instead of re-connecting the observed points.
plt.title('Prediction: Confirmed Vs Death ')
plt.scatter(X_confirmed_test, X_death_test,  color='black')
plt.plot(X_confirmed_test, reg_death1.predict(X_confirmed_test), color='red', linewidth=3)
plt.xlabel('Confirmed')
plt.ylabel('Death')

#plt.xticks(())
#plt.yticks(())

plt.show()

In [None]:
# `reg_death` is never defined in this notebook; the fitted linear model is reg_death1.
reg_death1.predict([[1000000],])#predict the number of deaths when No. of persons with confirmed coronavirus is 1M

In [None]:
# `reg_recovered` is never defined in this notebook; the fitted linear model is reg_recovered1.
reg_recovered1.predict([[1000000],])#predict the number of recoveries when No. of persons with confirmed coronavirus is 1M

#Good predicted news!!! 