In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv(r'DOH COVID Data Drop 20201022 - 04 Case Information.csv')

In [3]:
data.describe()

Unnamed: 0,Age
count,361693.0
mean,38.080931
std,16.583849
min,0.0
25%,26.0
50%,35.0
75%,49.0
max,107.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363888 entries, 0 to 363887
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CaseCode           363888 non-null  object 
 1   Age                361693 non-null  float64
 2   AgeGroup           361693 non-null  object 
 3   Sex                363888 non-null  object 
 4   DateSpecimen       308027 non-null  object 
 5   DateResultRelease  309246 non-null  object 
 6   DateRepConf        363888 non-null  object 
 7   DateDied           6753 non-null    object 
 8   DateRecover        83813 non-null   object 
 9   RemovalType        319116 non-null  object 
 10  Admitted           363737 non-null  object 
 11  RegionRes          360182 non-null  object 
 12  ProvRes            338949 non-null  object 
 13  CityMunRes         328393 non-null  object 
 14  CityMuniPSGC       328393 non-null  object 
 15  BarangayRes        135168 non-null  object 
 16  Ba

In [5]:
data = data.sample(50000)

In [6]:
#TODO: DATA CLEANING - DATE SPECIMEN DROPPING = 1 MONTH (using DateSpecimen col)

df = data.drop(data.loc[:, 'CaseCode':'Age'].columns, axis=1)
df = df.drop('Sex', axis=1)
df = df.drop(df.loc[:, 'DateResultRelease':'Admitted'].columns, axis=1)
df = df.drop('ProvRes', axis=1)
df = df.drop(df.loc[:, 'CityMuniPSGC':'BarangayPSGC'].columns, axis=1)
df = df.drop(df.loc[:, 'DateOnset':'ValidationStatus'].columns, axis=1)
df

Unnamed: 0,AgeGroup,DateSpecimen,RegionRes,CityMunRes,HealthStatus,Quarantined
301525,15 to 19,2020-10-06,Region IV-A: CALABARZON,LEMERY,MILD,NO
269726,30 to 34,2020-07-08,NCR,,RECOVERED,NO
292443,25 to 29,2020-03-09,ROF,,RECOVERED,YES
338449,35 to 39,2020-08-27,NCR,CITY OF PASIG,RECOVERED,YES
102913,65 to 69,,NCR,CITY OF MARIKINA,RECOVERED,NO
...,...,...,...,...,...,...
84864,25 to 29,2020-07-18,NCR,CITY OF MAKATI,RECOVERED,NO
75489,25 to 29,2020-06-28,NCR,CITY OF MALABON,RECOVERED,NO
72607,30 to 34,2020-07-13,NCR,CALOOCAN CITY,RECOVERED,YES
361592,20 to 24,2020-08-31,CARAGA,BUENAVISTA,RECOVERED,YES


In [7]:
start_date = '2020-09-10'
end_date = '2020-10-10'

df = df.loc[df['RegionRes'] == 'NCR']
df = df.loc[(df['CityMunRes'] == 'CITY OF MANILA') | (df['CityMunRes'] == 'QUEZON CITY')]
df = df.loc[(df['DateSpecimen'] > start_date) & ((df['DateSpecimen'] <= end_date))]
df = df.reset_index(drop=True)
df

Unnamed: 0,AgeGroup,DateSpecimen,RegionRes,CityMunRes,HealthStatus,Quarantined
0,30 to 34,2020-10-06,NCR,CITY OF MANILA,MILD,NO
1,25 to 29,2020-09-23,NCR,CITY OF MANILA,RECOVERED,NO
2,20 to 24,2020-10-01,NCR,CITY OF MANILA,MILD,NO
3,20 to 24,2020-09-18,NCR,CITY OF MANILA,RECOVERED,NO
4,30 to 34,2020-10-02,NCR,CITY OF MANILA,MILD,NO
...,...,...,...,...,...,...
935,20 to 24,2020-10-05,NCR,CITY OF MANILA,MILD,NO
936,70 to 74,2020-10-06,NCR,CITY OF MANILA,MILD,NO
937,30 to 34,2020-09-28,NCR,QUEZON CITY,MILD,NO
938,35 to 39,2020-10-08,NCR,CITY OF MANILA,MILD,NO


In [8]:
df = df.dropna()
df = df.reset_index(drop=True)
df

Unnamed: 0,AgeGroup,DateSpecimen,RegionRes,CityMunRes,HealthStatus,Quarantined
0,30 to 34,2020-10-06,NCR,CITY OF MANILA,MILD,NO
1,25 to 29,2020-09-23,NCR,CITY OF MANILA,RECOVERED,NO
2,20 to 24,2020-10-01,NCR,CITY OF MANILA,MILD,NO
3,20 to 24,2020-09-18,NCR,CITY OF MANILA,RECOVERED,NO
4,30 to 34,2020-10-02,NCR,CITY OF MANILA,MILD,NO
...,...,...,...,...,...,...
935,20 to 24,2020-10-05,NCR,CITY OF MANILA,MILD,NO
936,70 to 74,2020-10-06,NCR,CITY OF MANILA,MILD,NO
937,30 to 34,2020-09-28,NCR,QUEZON CITY,MILD,NO
938,35 to 39,2020-10-08,NCR,CITY OF MANILA,MILD,NO


In [9]:
df['HealthStatus'].value_counts()

RECOVERED       476
MILD            432
DIED             15
ASYMPTOMATIC      8
SEVERE            7
CRITICAL          2
Name: HealthStatus, dtype: int64

In [10]:
unique = df['CityMunRes'].unique().tolist()
uniAgeG = df['AgeGroup'].unique().tolist()
uniHealth = df['HealthStatus'].unique().tolist()
uniRegion = df['RegionRes'].unique().tolist()

In [11]:
for i in range(len(df)):
    for j in unique:
        if df['CityMunRes'][i] == j:
            df['CityMunRes'][i] = unique.index(j)
    
    for k in uniAgeG:
        if df['AgeGroup'][i] == k:
            df['AgeGroup'][i] = uniAgeG.index(k)

    for l in uniHealth:
        if df['HealthStatus'][i] == l:
            df['HealthStatus'][i] = uniHealth.index(l)
            
    for m in uniRegion:
        if df['RegionRes'][i] == m:
            df['RegionRes'][i] = uniHealth.index(m)

ValueError: 'NCR' is not in list

In [None]:
column = df.loc[:,'AgeGroup':'CityMunRes'].columns.to_list() # SUBJECT TO CHANGE
x = df[column].astype('int')
y = df['HealthStatus'].astype('int')

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [None]:
logreg = LogisticRegression()

In [None]:
logreg.fit(x_train, y_train)

In [None]:
y_pred = logreg.predict(x_test)

In [None]:
matrix = metrics.confusion_matrix(y_test, y_pred)
matrix

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
import seaborn as sns

In [None]:
uniHealth

In [None]:
plt.figure(figsize=(9,9))
ax = sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="coolwarm",fmt='g')
ax.xaxis.set_label_position("top")
ax.set_xticklabels([uniHealth[0],uniHealth[1],uniHealth[2],uniHealth[3],uniHealth[4],uniHealth[5]])
ax.set_yticklabels([uniHealth[0],uniHealth[1],uniHealth[2],uniHealth[3],uniHealth[4],uniHealth[5]])
plt.title('Matrix')
plt.show()