In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
covid_full_dataset = pd.read_csv(r"D:\UrusanKuliah\Perkuliahan\Semester_4\MachineLearning\tugas\Praktikum\minggu5\Covid-Data\covid-data.csv")

covid_full_dataset.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


In [3]:
print(covid_full_dataset.columns)

len(covid_full_dataset.columns)

Index(['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'INTUBED',
       'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
       'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')


21

### Feature selection

In [4]:
unuseful_features = ["MEDICAL_UNIT", "USMER", "PATIENT_TYPE", "DATE_DIED", "OTHER_DISEASE"]

covid_with_relevan_features = covid_full_dataset.drop(axis=1, columns=unuseful_features)

covid_with_relevan_features.columns

Index(['SEX', 'INTUBED', 'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD',
       'ASTHMA', 'INMSUPR', 'HIPERTENSION', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')

In [5]:
X = covid_with_relevan_features.drop(axis=1, columns=["CLASIFFICATION_FINAL"])
y = covid_with_relevan_features["CLASIFFICATION_FINAL"]

In [6]:
bestfeatures = SelectKBest(score_func=f_classif, k=21)
fit = bestfeatures.fit(X,y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# menggabungkan 2 dataframe
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

featureScores.sort_values(by="Score", ascending=False)



Unnamed: 0,Specs,Score
14,ICU,7760.218121
1,INTUBED,7756.203909
2,PNEUMONIA,5971.657771
3,AGE,4712.84811
4,PREGNANT,633.867448
0,SEX,629.999687
7,ASTHMA,87.941641
10,CARDIOVASCULAR,87.068478
6,COPD,85.026254
13,TOBACCO,79.871281


In [7]:
covid_selected_features_dataset = covid_with_relevan_features.loc[:, featureScores.nlargest(6,'Score')["Specs"].to_numpy()]
covid_selected_features_dataset["CLASIFFICATION_FINAL"] = covid_with_relevan_features["CLASIFFICATION_FINAL"]

covid_selected_features_dataset.head()

Unnamed: 0,ICU,INTUBED,PNEUMONIA,AGE,PREGNANT,SEX,CLASIFFICATION_FINAL
0,97,97,1,65,2,1,3
1,97,97,1,72,97,2,5
2,2,1,2,55,97,2,3
3,97,97,2,53,2,1,7
4,97,97,2,68,97,2,3


### Training model

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_age = pd.DataFrame(scaler.fit_transform(pd.DataFrame(covid_selected_features_dataset["AGE"])), columns=["AGE"])

covid_selected_features_dataset["AGE"] = scaled_age

covid_selected_features_dataset

Unnamed: 0,ICU,INTUBED,PNEUMONIA,AGE,PREGNANT,SEX,CLASIFFICATION_FINAL
0,97,97,1,1.372531,2,1,3
1,97,97,1,1.786551,97,2,5
2,2,1,2,0.781073,97,2,3
3,97,97,2,0.662781,2,1,7
4,97,97,2,1.549968,97,2,3
...,...,...,...,...,...,...,...
1048570,97,97,2,-0.106114,97,2,7
1048571,2,2,2,0.544490,97,2,7
1048572,97,97,2,0.781073,97,2,7
1048573,97,97,2,-0.815863,97,2,7


In [13]:
X_train, X_test, y_train, y_test = train_test_split(covid_selected_features_dataset, y, test_size=0.01, random_state=42)

In [14]:
print(len(X_test))
print(len(X_train))

10486
1038089


In [15]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)


### Model evaluation

In [16]:
from sklearn.metrics import confusion_matrix


cm = confusion_matrix(y_test, y_pred)
print(cm)

[[  96    0    0    0    0    0    0]
 [   0   22    1    0    0    0    0]
 [   0    0 3760    0    0    0    0]
 [   0    0    1   29    0    0    0]
 [   0    0    0    0  304    0    0]
 [   0    0    0    0    0 1221    0]
 [   0    0    0    0    0    0 5052]]


In [17]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_pred))

0.9998092695021934
