# IMPORTING NECESSARY PACKAGES

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [22]:
import pickle
import json

# READING CSV FILE

In [23]:
# Load the dataset from data.csv in the working directory and preview the first five rows.
df = pd.read_csv("data.csv")
df.head(n=5)

Unnamed: 0,location,date,total_cases_per_million,total_deaths_per_million
0,Afghanistan,31-12-2019,,
1,Afghanistan,01-01-2020,,
2,Afghanistan,02-01-2020,,
3,Afghanistan,03-01-2020,,
4,Afghanistan,04-01-2020,,


# DATA PREPROCESSING

In [24]:
# Split the "DD-MM-YYYY" date strings into numeric year / month / day features.
# Parsing with an explicit format (rather than slicing characters) raises on a
# malformed date instead of silently producing garbage, and the .dt accessors
# return integers, so the model is never handed string-typed feature columns.
parsed_date = pd.to_datetime(df['date'], format='%d-%m-%Y')
df['year'] = parsed_date.dt.year
df['month'] = parsed_date.dt.month
df['day'] = parsed_date.dt.day
df.head()

Unnamed: 0,location,date,total_cases_per_million,total_deaths_per_million,year,month,day
0,Afghanistan,31-12-2019,,,2019,12,31
1,Afghanistan,01-01-2020,,,2020,1,1
2,Afghanistan,02-01-2020,,,2020,1,2
3,Afghanistan,03-01-2020,,,2020,1,3
4,Afghanistan,04-01-2020,,,2020,1,4


In [25]:
# Count the missing values in each column.
df.isna().sum()

location                        0
date                            0
total_cases_per_million      3615
total_deaths_per_million    13205
year                            0
month                           0
day                             0
dtype: int64

In [26]:
# Replace every missing value with 0.
# NOTE(review): the null counts above show only the two target columns have gaps,
# so this treats unreported cases/deaths as zero -- confirm that is intended.
df = df.fillna(value=0)

In [27]:
# One-hot encode the location column and append the indicator columns to df.
location_dummies = pd.get_dummies(df['location'])
df = pd.concat([df, location_dummies], axis=1)
df.head()

Unnamed: 0,location,date,total_cases_per_million,total_deaths_per_million,year,month,day,Afghanistan,Albania,Algeria,...,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Wallis and Futuna,Western Sahara,Yemen,Zambia,Zimbabwe
0,Afghanistan,31-12-2019,0.0,0.0,2019,12,31,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Afghanistan,01-01-2020,0.0,0.0,2020,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Afghanistan,02-01-2020,0.0,0.0,2020,1,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Afghanistan,03-01-2020,0.0,0.0,2020,1,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Afghanistan,04-01-2020,0.0,0.0,2020,1,4,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# The raw date and location columns are now represented by year/month/day and
# the one-hot indicator columns, so drop the originals.
df = df.drop(columns=['date', 'location'])
df.head()

Unnamed: 0,total_cases_per_million,total_deaths_per_million,year,month,day,Afghanistan,Albania,Algeria,Andorra,Angola,...,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Wallis and Futuna,Western Sahara,Yemen,Zambia,Zimbabwe
0,0.0,0.0,2019,12,31,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,2020,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,2020,1,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,2020,1,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,2020,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Feature matrix: every column except the two target columns.
X = df.drop(columns=['total_cases_per_million', 'total_deaths_per_million'])
X.head()

Unnamed: 0,year,month,day,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua and Barbuda,...,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Wallis and Futuna,Western Sahara,Yemen,Zambia,Zimbabwe
0,2019,12,31,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020,1,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020,1,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Write the feature column names of X, in order, to data.json.
data = {'columns': X.columns.tolist()}
with open('data.json', 'w') as json_file:
    json.dump(data, json_file)

# TOTAL CASES MODEL

In [31]:
# Target for the first model: total cases per million.
y = df['total_cases_per_million']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: total_cases_per_million, dtype: float64

In [32]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

In [33]:
# Fit a linear regression on the training split, then report its score
# (R^2 for scikit-learn regressors) on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.5626370765338968

In [34]:
# Serialize the model fitted on total_cases_per_million to disk.
with open('total_cases_model.pickle', 'wb') as model_file:
    pickle.dump(lr, model_file)

# TOTAL DEATHS MODEL

In [35]:
# Target for the second model: total deaths per million (rebinds y).
y = df['total_deaths_per_million']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: total_deaths_per_million, dtype: float64

In [36]:
# Hold out 40% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the total-cases model uses test_size=0.2 -- confirm the larger
# hold-out here is deliberate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=9,
)

In [37]:
# Fit a fresh linear regression for the deaths target (rebinds lr), then report
# its score (R^2 for scikit-learn regressors) on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.6802235030859808

In [38]:
# Serialize the model fitted on total_deaths_per_million to disk.
with open('total_deaths_model.pickle', 'wb') as model_file:
    pickle.dump(lr, model_file)