In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the SARS 2003 CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = "/kaggle/input/sars-outbreak-2003-complete-dataset/sars_2003_complete_dataset_clean.csv"
dataset = pd.read_csv(DATA_PATH)
dataset.head()

In [None]:
# Summarise the freshly loaded frame: column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Replace the verbose source headers with short, identifier-friendly column names.
column_renames = {
    'Cumulative number of case(s)': 'Cumulative_Cases',
    'Number of deaths': 'Death_Count',
    'Number recovered': 'Recovered_Count',
}
dataset = dataset.rename(columns=column_renames)
dataset.head()

In [None]:
# Convert the Date column to datetime64, printing a sample before and after
# so the dtype change is visible in the cell output.
print(dataset.Date[:5])
print("----------------------------------------")
# Parse the whole column in one vectorized call. `Series.apply(pd.to_datetime)`
# invokes the parser once per row, which is much slower and lets each value be
# format-inferred independently of its neighbours.
dataset['Date'] = pd.to_datetime(dataset['Date'])
print(dataset.Date[:5])

In [None]:
# Scatter plot of cumulative case count (x) against death count (y),
# drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Distribution of Confirmed Cases vs Death for SARS 2003')
sns.scatterplot(data=dataset, x='Cumulative_Cases', y='Death_Count', ax=ax)
ax.set_xlabel('Cumulative number of cases')
ax.set_ylabel("Deaths occured")

In [None]:
# Show a sample of the frame, then the pairwise Pearson correlations.
print(dataset.head())
# Restrict to numeric columns before correlating: since pandas 2.0
# `DataFrame.corr()` no longer drops non-numeric columns silently and raises
# when it meets them (the frame still holds non-numeric columns such as
# Country and the parsed Date at this point).
dataset.select_dtypes(include="number").corr()

In [None]:
# Same cases-vs-deaths scatter, with each point coloured by its recovery count.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Distribution of Confirmed Cases vs Death for SARS 2003 with hue based on Number of Recoveries')
sns.scatterplot(
    data=dataset,
    x='Cumulative_Cases',
    y='Death_Count',
    hue='Recovered_Count',
    ax=ax,
)
ax.set_xlabel('Cumulative number of cases')
ax.set_ylabel("Deaths occured")

In [None]:
# Scatter with a fitted regression line: cumulative cases (x) vs deaths (y).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Regression Line Distribution of Confirmed Cases vs Death for SARS 2003')
sns.regplot(data=dataset, x='Cumulative_Cases', y='Death_Count', ax=ax)
ax.set_xlabel('Cumulative number of cases')
ax.set_ylabel("Deaths occured")

In [None]:
# Scatter with a fitted regression line: cumulative cases (x) vs recoveries (y).
plt.figure(figsize = [10,5])
# The title previously read "... vs Death ...", copied from the deaths plot,
# although this figure plots Recovered_Count on the y-axis.
plt.title('Regression Line Distribution of Confirmed Cases vs Recoveries for SARS 2003')
sns.regplot(x = dataset['Cumulative_Cases'], y=dataset['Recovered_Count'])
plt.xlabel('Cumulative number of cases')
plt.ylabel("Number of Recoveries")

In [None]:
# Derive integer calendar features from the parsed Date column, then drop the
# raw timestamp column.
#
# The earlier strftime codes did not match the column names: "%d" is the day
# of the *month* and "%w" is the *weekday* number (0-6), so Day_of_the_year
# and Week_of_the_year held the wrong quantities. The datetime accessors
# below produce the values the column names promise.
report_dates = dataset['Date']
dataset['Day_of_the_year'] = report_dates.dt.dayofyear.astype(int)
# isocalendar() returns the ISO-8601 week number as a nullable unsigned
# integer; cast it to a plain int so it matches the other derived columns.
dataset['Week_of_the_year'] = report_dates.dt.isocalendar().week.astype(int)
dataset['Month_of_the_year'] = report_dates.dt.month.astype(int)
# Reassign rather than mutate in place, keeping the data lineage explicit.
dataset = dataset.drop(columns=['Date'])
dataset.head()

In [None]:
# Re-check column dtypes and non-null counts now that Date has been dropped
# and the three derived calendar columns have been added.
dataset.info()

In [None]:
# Identify the remaining object-dtype columns (these need encoding before
# modelling) and print the Country column.
encoded_labels = dataset.select_dtypes(include=["object"]).columns
print(encoded_labels)
print(dataset["Country"])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each country name with an integer code so the column can be fed to
# the regressor.
# NOTE(review): label encoding gives the countries an arbitrary numeric order,
# which a linear model will treat as meaningful - consider one-hot encoding.
LE = LabelEncoder()
country_codes = LE.fit_transform(dataset['Country'])
dataset['Country'] = country_codes
dataset['Country']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Separate the target (Death_Count) from the predictors and hold out 35% of
# the rows for evaluation, with a fixed seed so the split is reproducible.
target_column = 'Death_Count'
x = dataset.drop(columns=[target_column])
y = dataset[target_column]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.35,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split, then predict the target for
# the held-out rows.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Score the held-out predictions: MAE, MSE, RMSE (square root of the MSE) and R2.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

metric_report = [
    ("Mean Absolute Error (MAE) - Test data : ", mae),
    ("Mean Squared Error (MSE) - Test data : ", mse),
    ("Root Mean Squared Error (RMSE) - Test data : ", rmse),
    ("Co-efficient of determination (R2 Score): ", r2),
]
for label, value in metric_report:
    print(label, value)

In [None]:
# Display the first five predicted values for the held-out rows.
y_pred[:5]