In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the case records from the CSV file into a DataFrame.
# The path is relative, so "cases.csv" must be in the notebook's working directory.
data = pd.read_csv("cases.csv")

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
# NOTE(review): the recorded preview shows AGEGROUP values such as "19-Oct", which looks
# like a spreadsheet-mangled "10-19" age band — confirm against the source file.
data.head()

Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,SEX,CASES
0,3/1/2020,Brussels,Brussels,19-Oct,M,1
1,3/1/2020,Brussels,Brussels,19-Oct,F,1
2,3/1/2020,Brussels,Brussels,20-29,M,1
3,3/1/2020,Brussels,Brussels,30-39,F,1
4,3/1/2020,Brussels,Brussels,40-49,F,1


In [4]:
# Count the rows (observations) in the dataset.
# shape[0] is the number of rows; shape[1] would be the number of columns.
total_rows = data.shape[0]
print("Total rows in the dataset are: {}".format(total_rows))

Total columns in the dataset are: 8744


In [5]:
# Missing values per column: flag every null cell, then add the flags up column by column.
total_missing = data.isna().sum()
print(total_missing)

DATE         45
PROVINCE    513
REGION      513
AGEGROUP     95
SEX         105
CASES         0
dtype: int64


In [6]:
# Keep only fully populated rows: a row is discarded as soon as any one of its fields is missing.
data = data.dropna(axis=0, how="any")

In [7]:
# Sanity check: count the null cells left in the whole frame (expected to be zero after the drop).
remaining_nulls = int(data.isnull().sum().sum())
print("The number of missing values in the dataset are: {}".format(remaining_nulls))

The number of missing values in the dataset are: 0


In [8]:
# Look for rows that exactly repeat an earlier row (the first occurrence is not flagged).
is_repeat = data.duplicated()
duplicate_rows = data.loc[is_repeat]
duplicate_count = len(duplicate_rows)
print("The total number of duplicate rows in the reviews dataset are: {}".format(duplicate_count))
duplicate_rows.head()

The total number of duplicate rows in the reviews dataset are: 0


Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,SEX,CASES


In [9]:
# Dimensions (rows, columns) of the frame after the cleaning steps above.
data.shape

(8063, 6)

In [10]:
# One-hot encode the categorical predictors. drop_first=True removes the first level of each
# variable so it acts as the baseline (the recorded output shows 'F' and 'Brussels' as the
# all-zero rows). dtype=int pins the indicators to 0/1 integers: newer pandas versions
# default to boolean columns, which the later correlation/regression cells do not expect.
gender_dummy = pd.get_dummies(data['SEX'], prefix='gender', drop_first=True, dtype=int)
region_dummy = pd.get_dummies(data['REGION'], prefix='region', drop_first=True, dtype=int)

# Drop PROVINCE and attach the indicator columns. Columns are removed with
# errors='ignore' (including any indicators left by a previous run), so re-executing this
# cell neither raises a KeyError nor duplicates the dummy columns.
stale_columns = ['PROVINCE'] + list(gender_dummy.columns) + list(region_dummy.columns)
data = pd.concat(
    [data.drop(columns=stale_columns, errors='ignore'), gender_dummy, region_dummy],
    axis=1,
)
data.head()

Unnamed: 0,DATE,REGION,AGEGROUP,SEX,CASES,gender_M,region_Flanders,region_Wallonia
0,3/1/2020,Brussels,19-Oct,M,1,1,0,0
1,3/1/2020,Brussels,19-Oct,F,1,0,0,0
2,3/1/2020,Brussels,20-29,M,1,1,0,0
3,3/1/2020,Brussels,30-39,F,1,0,0,0
4,3/1/2020,Brussels,40-49,F,1,0,0,0


In [11]:
# Pairwise correlation of the numeric/indicator columns only.
# The text columns (DATE, REGION, AGEGROUP, SEX) are excluded explicitly: older pandas
# dropped them silently inside corr(), newer versions raise on them instead.
data.select_dtypes(include=["number", "bool"]).corr()

Unnamed: 0,CASES,gender_M,region_Flanders,region_Wallonia
CASES,1.0,-0.201233,0.102704,-0.106079
gender_M,-0.201233,1.0,0.011645,-0.017906
region_Flanders,0.102704,0.011645,1.0,-0.809902
region_Wallonia,-0.106079,-0.017906,-0.809902,1.0


In [12]:
# Correlation matrix of the numeric/indicator columns, rendered as a shaded table.
# Text columns are excluded explicitly because newer pandas versions no longer drop
# them silently inside corr().
corr = data.select_dtypes(include=["number", "bool"]).corr()
corr.style.background_gradient(cmap='PuBu')

Unnamed: 0,CASES,gender_M,region_Flanders,region_Wallonia
CASES,1.0,-0.201233,0.102704,-0.106079
gender_M,-0.201233,1.0,0.011645,-0.017906
region_Flanders,0.102704,0.011645,1.0,-0.809902
region_Wallonia,-0.106079,-0.017906,-0.809902,1.0


In [13]:
import statsmodels.api as sm

# Ordinary least squares regression of CASES on the gender and region indicators.
# The indicators are cast to float so the design matrix has one numeric dtype; boolean
# dummies combined with the float constant would otherwise reach statsmodels as an
# object array and be rejected.
X = data[['gender_M', 'region_Flanders', 'region_Wallonia']].astype(float)
X_constant = sm.add_constant(X)  # adds the intercept column ('const')
y = data['CASES']
linreg2 = sm.OLS(y, X_constant).fit()
# NOTE(review): CASES is a count, and the recorded summary reports strongly skewed,
# heavy-tailed residuals — a count model (e.g. a Poisson GLM) may be a better fit; confirm.
linreg2.summary()

0,1,2,3
Dep. Variable:,CASES,R-squared:,0.053
Model:,OLS,Adj. R-squared:,0.053
Method:,Least Squares,F-statistic:,151.2
Date:,"Thu, 07 May 2020",Prob (F-statistic):,2.4099999999999998e-95
Time:,15:38:18,Log-Likelihood:,-26729.0
No. Observations:,8063,AIC:,53470.0
Df Residuals:,8059,BIC:,53490.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.2932,0.242,30.152,0.000,6.819,7.767
gender_M,-2.7808,0.148,-18.732,0.000,-3.072,-2.490
region_Flanders,0.6449,0.253,2.550,0.011,0.149,1.141
region_Wallonia,-1.0021,0.259,-3.872,0.000,-1.509,-0.495

0,1,2,3
Omnibus:,6903.849,Durbin-Watson:,1.233
Prob(Omnibus):,0.0,Jarque-Bera (JB):,340021.674
Skew:,3.861,Prob(JB):,0.0
Kurtosis:,33.862,Cond. No.,7.24
