# 1. Business Understanding

##### Pandemi global yang sedang terjadi saat ini, yaitu COVID-19 yang disebabkan oleh virus SARS-CoV-2, telah memakan banyak korban jiwa. Pemerintah secara khusus memberlakukan langkah-langkah pencegahan terhadap gejala COVID-19 yang muncul. Berdasarkan hasil wawancara, jurnal penelitian, dan observasi pada Puskesmas Pakuhaji, yang merupakan Unit Pelaksana Teknis Dinas (UPTD) kesehatan kabupaten/kota yang bertanggung jawab untuk memutus mata rantai penyebaran COVID-19 khususnya di daerah Pakuhaji dan sekitarnya, terdapat banyak kasus tanpa gejala, dan kurangnya diagnosis mengakibatkan keterlambatan atau bahkan diagnosis yang terlewatkan sehingga pasien terpapar COVID-19. Oleh karena itu, penulis mencoba menggunakan pendekatan non-klinis, yaitu pembelajaran mesin, penambangan data, dan kecerdasan buatan, untuk mendiagnosis gejala awal virus serta menekan bertambahnya kasus COVID-19.

# 2. Data Understanding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.metrics import roc_curve, auc

%matplotlib inline

In [2]:
# Load the dataset from the CSV file (path is relative to the working directory).
dataset_path = 'Covid Dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Display the DataFrame as the cell output.
# NOTE(review): the original note said "all rows"; pandas may abbreviate long
# frames depending on its display options — confirm if every row is needed.
df

In [None]:
# Report the dimensions of the data (the value of df.shape).
data_dimensions = df.shape
print("Dimensi data:", data_dimensions)

In [None]:
# Preview the first rows of the data under a heading line.
print("Baris pertama data:", df.head(), sep="\n")
# Disabled preview of the last rows:
# print("Baris terakhir data:")
# print(df.tail())

In [None]:
# Check the data type of every column.
column_types = df.dtypes
print("Tipe data setiap kolom:")
print(column_types)

In [None]:
# Descriptive statistics as computed by DataFrame.describe() with its defaults.
print("Statistik deskriptif untuk kolom numerik:", df.describe(), sep="\n")

In [None]:
# Menghitung statistik deskriptif untuk kolom kategorikal
# print("Statistik deskriptif untuk kolom kategorikal:")
# print(df.describe(include=[object]))

In [14]:
# Mengidentifikasi outliers pada kolom numerik
# Menggunakan boxplot untuk visualisasi outliers
# for column in df.select_dtypes(include=[np.number]).columns:
#     plt.figure(figsize=(10, 5))
#     sns.boxplot(x=df[column])
#     plt.title(f'Boxplot of {column}')
#     plt.show()


In [None]:
# Membuat beberapa visualisasi dasar untuk memahami distribusi data
# Histogram untuk kolom numerik
# df.hist(bins=30, figsize=(20, 15))
# plt.show()


# 3. Data Preparation

In [None]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print("Nilai yang hilang di setiap kolom:")
print(missing_per_column)

In [None]:
# # Langkah 1: Menangani Missing Values
# # Mengisi nilai yang hilang dengan mean untuk kolom numerik dan mode untuk kolom kategorikal
# for column in df.select_dtypes(include=[np.number]).columns:
#     df[column].fillna(df[column].mean(), inplace=True)

# for column in df.select_dtypes(include=[object]).columns:
#     df[column].fillna(df[column].mode()[0], inplace=True)

In [12]:
# Step 2: Feature selection — remove the listed columns from the DataFrame.
# NOTE(review): presumably these two columns carry no useful signal; the reason
# is not recorded here — confirm against the data.
columns_to_drop = ['Wearing Masks', 'Sanitization from Market']
df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Display the current DataFrame as the cell output.
df

In [None]:
# Step 3: Data visualization
# Basic plots to understand how the values of each column are distributed.
def plot_bar_chart(data, column):
    """Show a count plot (one bar per distinct value) for a single column.

    Args:
        data: Dataset handed to seaborn as ``data``.
        column: Name of the column to count. It is also used as the hue, so
            each category is coloured from the "deep" palette; the legend is
            switched off.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=data, x=column, hue=column, palette="deep", legend=False, ax=ax)
    ax.set_title(f'Distribusi {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Jumlah')
    # Tilt the category labels on the x-axis by 45 degrees.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Draw one count plot per column of the DataFrame.
for column_name in df.columns:
    plot_bar_chart(data=df, column=column_name)


In [None]:
# Count plot of a column broken down by COVID-19 status.
def plot_stacked_bar_chart(data, column):
    """Show the counts of one column, split by the 'COVID-19' column.

    The bars are produced by ``sns.countplot`` with ``hue='COVID-19'``; the
    "stacked" in the function name is historical.

    Args:
        data: Dataset handed to seaborn as ``data``; must contain a
            'COVID-19' column.
        column: Name of the column placed on the x-axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=data, x=column, hue='COVID-19', ax=ax)
    ax.set_title(f'Distribusi {column} berdasarkan Status COVID-19')
    ax.set_xlabel(column)
    ax.set_ylabel('Jumlah')
    # Tilt the category labels on the x-axis by 45 degrees.
    plt.xticks(rotation=45)
    ax.legend(title='COVID-19', loc='upper right')
    plt.tight_layout()
    plt.show()

# Draw the per-status count plot for every column except the target 'COVID-19'.
feature_columns = [name for name in df.columns if name != 'COVID-19']
for feature_name in feature_columns:
    plot_stacked_bar_chart(df, feature_name)

In [None]:
# # Menampilkan distribusi variabel target (COVID-19)
# plt.figure(figsize=(8, 6))
# df['COVID-19'].value_counts().plot(kind='bar', color=['steelblue', 'goldenrod'])
# plt.title('Distribusi Kasus COVID-19')
# plt.xlabel('COVID-19')
# plt.ylabel('Jumlah')
# plt.show()

In [None]:
# Distribution of a few key features, split by COVID-19 status, on a 2x2 grid.
fitur_kunci = ['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat']
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# axes.flat walks the grid row by row, so features fill left-to-right, top-to-bottom.
for ax, fitur in zip(axes.flat, fitur_kunci):
    sns.countplot(x=fitur, hue='COVID-19', data=df, ax=ax)
    ax.set_title(f'Distribusi {fitur}')
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of the class counts in the 'COVID-19' column.
# One explode offset and one colour are supplied per wedge; both sequences have
# length 2 (assumes the column holds exactly two classes — confirm).
explode = (0.2, 0)
class_counts = df['COVID-19'].value_counts()
plt.figure(figsize=(8, 6))
class_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=0,
    colors=['#007BFF', '#FFA500'],
    shadow=True,
    explode=explode,
)
plt.title('Distribusi Kasus COVID-19')
plt.ylabel('COVID-19')
plt.show()

# plt.figure(figsize=(8, 6))
# plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, explode=explode)
# plt.title('Pie Chart of the Number of Patients by Covid-19 PCR Result')
# plt.ylabel('COVID-19')  # Menambahkan label pada sumbu Y
# plt.show()

In [21]:
# Encode categorical variables.
# Every object-dtype column is replaced by integer codes. Each column gets its
# OWN fitted LabelEncoder, stored in `label_encoders` under the column name, so
# the text -> integer mapping of every column stays available afterwards (e.g.
# to encode new raw records the same way or to invert the codes). Refitting a
# single shared encoder would keep only the mapping of the last column.
label_encoder = LabelEncoder()  # stays defined even if no object column exists
label_encoders = {}
for column in df.select_dtypes(include=[object]).columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder


In [None]:
# Display the current DataFrame as the cell output.
df

In [None]:
# Correlation heat map over the numeric columns.
# np.number selects every numeric dtype, so integer-encoded columns are included
# whatever their exact width; a hard-coded ['int64', 'float64'] list would
# silently leave out columns stored as e.g. int32.
kolom_numerik = df.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(18, 10))
sns.heatmap(df[kolom_numerik].corr(), annot=True, cmap='coolwarm')
plt.title('Peta Panas Korelasi')
plt.show()

In [24]:
# Step 5: Separate the features from the target.
# The target column is assumed to be named 'COVID-19'.
target_column = 'COVID-19'
y = df[target_column]
X = df.drop(columns=[target_column])

In [25]:
# Langkah 6: Data Balancing Menggunakan SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)


In [26]:
# Langkah 7: Membagi Data Menjadi Set Pelatihan dan Pengujian dengan rasio 70%-30%
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)


In [None]:
# Summary of the train/test partitions: shapes, class balance, feature names.
named_parts = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for part_name, part in named_parts:
    print(f"Dimensi {part_name}:", part.shape)

# Class counts of the training labels.
print("Distribusi kelas pada y_train:", y_train.value_counts(), sep="\n")

print("Fitur yang digunakan:", X_train.columns)

# 4. Modelling

In [None]:
# # Langkah 6: Data Balancing Menggunakan SMOTE
# smote = SMOTE(random_state=42)
# X_balanced, y_balanced = smote.fit_resample(X, y)


In [None]:
# # Langkah 4: Membagi Data Menjadi Set Pelatihan dan Pengujian
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# # Menampilkan ringkasan dari pembagian data
# print("Dimensi X_train:", X_train.shape)
# print("Dimensi X_test:", X_test.shape)
# print("Dimensi y_train:", y_train.shape)
# print("Dimensi y_test:", y_test.shape)


# print("Fitur yang digunakan:", X_train.columns)

In [28]:
# Decision Tree: fit on the training data, then predict the test set.
# fit() returns the estimator itself, so construction and fitting are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

In [29]:
# Random Forest (100 trees): fit on the training data, then predict the test set.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [30]:
# Gaussian Naive Bayes: fit on the training data, then predict the test set.
# fit() returns the estimator itself, so construction and fitting are chained.
nb_model = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [31]:
# Support Vector Machine with a linear kernel: fit, then predict the test set.
# fit() returns the estimator itself, so construction and fitting are chained.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

In [32]:
# K-Nearest Neighbors (k = 5): fit on the training data, then predict the test set.
# fit() returns the estimator itself, so construction and fitting are chained.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# 5. Evaluation

In [33]:
from sklearn.metrics import roc_curve, auc

# Helper that renders a confusion matrix as an annotated heat map.
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Plot the confusion matrix of a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Text appended to the figure title.
    """
    matrix = confusion_matrix(y_true, y_pred)
    _, ax = plt.subplots(figsize=(8, 6))
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()


In [35]:
# Helper that draws a ROC curve together with its AUC.
def plot_roc_curve(y_true, y_pred_proba, model_name):
    """Plot the ROC curve for one model and report the area under it.

    Args:
        y_true: Ground-truth labels.
        y_pred_proba: Per-sample scores handed to ``roc_curve``.
        model_name: Text appended to the figure title.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    area = auc(false_pos_rate, true_pos_rate)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label=f'ROC curve (area = {area:.2f})')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic - {model_name}')
    ax.legend(loc="lower right")
    plt.show()


In [36]:

# Registry of the fitted models to evaluate and visualise, keyed by display name.
model_labels = ('Decision Tree', 'Random Forest', 'Naive Bayes', 'SVM', 'KNN')
fitted_models = (dt_model, rf_model, nb_model, svm_model, knn_model)
models = dict(zip(model_labels, fitted_models))


In [None]:

# Evaluate every registered model on the test set: print the metrics, plot the
# confusion matrix and, when the model can produce scores, plot the ROC curve.
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    # Computed once and reused for printing.
    cm = confusion_matrix(y_test, y_pred)

    print(f"\nModel: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", cm)

    # Plot confusion matrix
    plot_confusion_matrix(y_test, y_pred, model_name)

    # Pick the ROC scores by capability instead of by display name, so a
    # renamed or newly added margin-based model still gets its ROC curve.
    if hasattr(model, "predict_proba"):
        # Column 1 is used as the score of the second class.
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # No probabilities available (the SVM here): use the decision scores.
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = None

    if y_pred_proba is not None:
        plot_roc_curve(y_test, y_pred_proba, model_name)

In [None]:
# # Evaluate Decision Tree
# accuracy_dt = accuracy_score(y_test, y_pred_dt)
# classification_report_dt = classification_report(y_test, y_pred_dt)
# confusion_matrix_dt = confusion_matrix(y_test, y_pred_dt)

# print("Decision Tree Accuracy:", accuracy_dt)
# print("Decision Tree Classification Report:\n", classification_report_dt)
# print("Decision Tree Confusion Matrix:\n", confusion_matrix_dt)


In [None]:
# # Evaluate Random Forest
# accuracy_rf = accuracy_score(y_test, y_pred_rf)
# classification_report_rf = classification_report(y_test, y_pred_rf)
# confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)

# print("Random Forest Accuracy:", accuracy_rf)
# print("Random Forest Classification Report:\n", classification_report_rf)
# print("Random Forest Confusion Matrix:\n", confusion_matrix_rf)

In [None]:
# # Evaluate Naive Bayes
# accuracy_nb = accuracy_score(y_test, y_pred_nb)
# classification_report_nb = classification_report(y_test, y_pred_nb)
# confusion_matrix_nb = confusion_matrix(y_test, y_pred_nb)

# print("Naive Bayes Accuracy:", accuracy_nb)
# print("Naive Bayes Classification Report:\n", classification_report_nb)
# print("Naive Bayes Confusion Matrix:\n", confusion_matrix_nb)


In [None]:
# # Evaluate SVM
# accuracy_svm = accuracy_score(y_test, y_pred_svm)
# classification_report_svm = classification_report(y_test, y_pred_svm)
# confusion_matrix_svm = confusion_matrix(y_test, y_pred_svm)

# print("SVM Accuracy:", accuracy_svm)
# print("SVM Classification Report:\n", classification_report_svm)
# print("SVM Confusion Matrix:\n", confusion_matrix_svm)

In [None]:
# # Evaluate KNN
# accuracy_knn = accuracy_score(y_test, y_pred_knn)
# classification_report_knn = classification_report(y_test, y_pred_knn)
# confusion_matrix_knn = confusion_matrix(y_test, y_pred_knn)

# print("KNN Accuracy:", accuracy_knn)
# print("KNN Classification Report:\n", classification_report_knn)
# print("KNN Confusion Matrix:\n", confusion_matrix_knn)

In [43]:
# # Compile all results into a dictionary for easy comparison
# model_performance = {
#     'Decision Tree': {
#         'Accuracy': accuracy_dt,
#         'Classification Report': classification_report_dt,
#         'Confusion Matrix': confusion_matrix_dt
#     },
#     'Random Forest': {
#         'Accuracy': accuracy_rf,
#         'Classification Report': classification_report_rf,
#         'Confusion Matrix': confusion_matrix_rf
#     },
#     'Naive Bayes': {
#         'Accuracy': accuracy_nb,
#         'Classification Report': classification_report_nb,
#         'Confusion Matrix': confusion_matrix_nb
#     },
#     'SVM': {
#         'Accuracy': accuracy_svm,
#         'Classification Report': classification_report_svm,
#         'Confusion Matrix': confusion_matrix_svm
#     },
#     'KNN': {
#         'Accuracy': accuracy_knn,
#         'Classification Report': classification_report_knn,
#         'Confusion Matrix': confusion_matrix_knn
#     }
# }

# # Display model performance
# for model_name, performance in model_performance.items():
#     print(f"\nModel: {model_name}")
#     print(f"Accuracy: {performance['Accuracy']}")
#     print("Classification Report:\n", performance['Classification Report'])
#     print("Confusion Matrix:\n", performance['Confusion Matrix'])

# 6. Deployment

In [None]:
import joblib

# Save each trained model to its own .pkl file in the working directory.
models_to_save = (
    (dt_model, 'decision_tree_model.pkl'),
    (rf_model, 'random_forest_model.pkl'),
    (nb_model, 'naive_bayes_model.pkl'),
    (svm_model, 'svm_model.pkl'),
    (knn_model, 'knn_model.pkl'),
)
for trained_model, pkl_path in models_to_save:
    joblib.dump(trained_model, pkl_path)

In [47]:
# Load the saved models back from their .pkl files.
# NOTE(review): joblib files are pickle-based — only load files you trust.
(
    loaded_dt_model,
    loaded_rf_model,
    loaded_nb_model,
    loaded_svm_model,
    loaded_knn_model,
) = (
    joblib.load(pkl_path)
    for pkl_path in (
        'decision_tree_model.pkl',
        'random_forest_model.pkl',
        'naive_bayes_model.pkl',
        'svm_model.pkl',
        'knn_model.pkl',
    )
)

In [None]:
# New data to predict on (adapt this to your own data).
# iloc[[0]] takes the first test row as a one-row DataFrame directly, keeping
# the column names and each column's dtype; going through a Series and
# transposing back can coerce mixed columns to a common dtype.
new_data = X_test.iloc[[0]]

# Predict with each of the reloaded models.
prediction_dt = loaded_dt_model.predict(new_data)
prediction_rf = loaded_rf_model.predict(new_data)
prediction_nb = loaded_nb_model.predict(new_data)
prediction_svm = loaded_svm_model.predict(new_data)
prediction_knn = loaded_knn_model.predict(new_data)

# Show the predictions. A predicted value of 1 is reported as 'Positif' and
# anything else as 'Negatif' (assumes the encoded positive class is 1 — confirm).
for model_label, prediction in (
    ("Decision Tree", prediction_dt),
    ("Random Forest", prediction_rf),
    ("Naive Bayes", prediction_nb),
    ("SVM", prediction_svm),
    ("KNN", prediction_knn),
):
    print(f"{model_label} Prediction: {'Positif' if prediction[0] == 1 else 'Negatif'}")