# 1. Business Understanding

# 2. Data Understanding

In [2]:
import pandas as pd
import seaborn as sns
import sweetviz as sv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the COVID-19 symptom survey from the CSV next to this notebook
df = pd.read_csv(filepath_or_buffer="Covid Dataset.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,...,Yes,Yes,No,Yes,No,Yes,Yes,No,No,Yes
1,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,No,...,Yes,No,No,No,Yes,Yes,No,No,No,Yes
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,...,Yes,Yes,Yes,No,No,No,No,No,No,Yes
3,Yes,Yes,Yes,No,No,Yes,No,No,Yes,Yes,...,No,No,Yes,No,Yes,Yes,No,No,No,Yes
4,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,...,No,Yes,No,Yes,No,Yes,No,No,No,Yes


In [4]:
# Build a Sweetviz EDA report for the raw DataFrame
report = sv.analyze(df)
# Write the report to 'eda_report.html' (the captured output confirms the file is generated)
report.show_html('eda_report.html')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:02 -> (00:00 left)   


Report eda_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [None]:
# Print a concise summary of df: columns, non-null counts and dtypes
df.info()

In [None]:
# Count missing entries per column (isna is the same check as isnull)
missing_values = df.isna().sum()

# Show the per-column counts as the cell output
missing_values

In [None]:

# Summary statistics for every column; include='all' also covers non-numeric columns
data_summary = df.describe(include='all')

# Show the summary table as the cell output
data_summary

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each "COVID-19" class
covid_target_distribution = df['COVID-19'].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
# rot=0 keeps the class labels horizontal
covid_target_distribution.plot.bar(ax=ax, rot=0)
ax.set_title('Distribution of COVID-19 Cases')
ax.set_xlabel('COVID-19 Status')
ax.set_ylabel('Count')
plt.show()


In [None]:
# For a hand-picked set of features, show the share of each COVID-19 outcome
# within every answer category (one stacked bar per category)
features_to_plot = ['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat', 'Contact with COVID Patient']

fig = plt.figure(figsize=(10, 6))
for position, feature in enumerate(features_to_plot, start=1):
    # 2x3 grid; only the first five slots are filled
    ax = fig.add_subplot(2, 3, position)
    outcome_share = df.groupby(feature)['COVID-19'].value_counts(normalize=True).unstack()
    outcome_share.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(feature)
    ax.set_ylabel('Proportion')
    plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Drop two columns judged non-informative (both read 'No' in every previewed row);
# the raw df is left untouched and the result gets its own name
columns_to_remove = ['Wearing Masks', 'Sanitization from Market']
covid_data_cleaned = df.drop(labels=columns_to_remove, axis='columns')


In [None]:
# Replace every column's categories with integer codes.
# A single LabelEncoder is re-fitted per column, so after the loop it only
# remembers the classes of the last column processed.
label_encoder = LabelEncoder()
for col_name in list(covid_data_cleaned.columns):
    encoded_values = label_encoder.fit_transform(covid_data_cleaned[col_name])
    covid_data_cleaned[col_name] = encoded_values


In [None]:
# Preview the first rows of the encoded data to confirm the columns now hold integer codes
covid_data_cleaned.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Separate the label column (y) from the predictor columns (X)
y = covid_data_cleaned['COVID-19']
X = covid_data_cleaned.drop('COVID-19', axis=1)


In [None]:

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class balance, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:

# Show the train/test feature-matrix shapes and the class counts in each target split
X_train.shape, X_test.shape, y_train.value_counts(), y_test.value_counts()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression: fixed seed, iteration cap set to 1000
logistic_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
logistic_model.fit(X=X_train, y=y_train)


In [None]:
# Predict COVID-19 labels for the held-out test rows with the fitted Logistic Regression model
y_pred_logistic = logistic_model.predict(X_test)


In [None]:
# Score the Logistic Regression predictions: accuracy, per-class report, confusion matrix
accuracy_logistic, classification_report_logistic, confusion_matrix_logistic = (
    metric(y_test, y_pred_logistic)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

# Show all three results as the cell output
(accuracy_logistic, classification_report_logistic, confusion_matrix_logistic)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: 100 trees, fixed seed
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
random_forest_model.fit(X=X_train, y=y_train)


In [None]:
# Predict COVID-19 labels for the held-out test rows with the fitted Random Forest model
y_pred_rf = random_forest_model.predict(X_test)



In [None]:
# Score the Random Forest predictions: accuracy, per-class report, confusion matrix
accuracy_rf, classification_report_rf, confusion_matrix_rf = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

# Show all three results as the cell output
(accuracy_rf, classification_report_rf, confusion_matrix_rf)

In [None]:
from sklearn.svm import SVC

# Support Vector Machine with a linear kernel and fixed seed
svm_model = SVC(
    kernel='linear',
    random_state=42,
)
svm_model.fit(X=X_train, y=y_train)

In [None]:
# Predict COVID-19 labels for the held-out test rows with the fitted SVM model
y_pred_svm = svm_model.predict(X_test)

In [None]:
# Score the SVM predictions: accuracy, per-class report, confusion matrix
accuracy_svm, classification_report_svm, confusion_matrix_svm = (
    metric(y_test, y_pred_svm)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

# Show all three results as the cell output
(accuracy_svm, classification_report_svm, confusion_matrix_svm)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors (k=5): fit on the training split, then predict the test split
knn_model = KNeighborsClassifier(n_neighbors=5)
y_pred_knn = knn_model.fit(X_train, y_train).predict(X_test)

# Score the KNN predictions; results are stored, not displayed, in this cell
accuracy_knn, classification_report_knn, confusion_matrix_knn = (
    metric(y_test, y_pred_knn)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)



In [None]:
from sklearn.naive_bayes import GaussianNB


# Gaussian Naive Bayes: fit on the training split, then predict the test split
nb_model = GaussianNB()
y_pred_nb = nb_model.fit(X_train, y_train).predict(X_test)

# Score the Naive Bayes predictions; results are stored, not displayed, in this cell
accuracy_nb, classification_report_nb, confusion_matrix_nb = (
    metric(y_test, y_pred_nb)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)


In [None]:
from sklearn.tree import DecisionTreeClassifier


# Decision Tree with a fixed seed: fit on the training split, then predict the test split
dt_model = DecisionTreeClassifier(random_state=42)
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)

# Score the Decision Tree predictions; results are stored, not displayed, in this cell
accuracy_dt, classification_report_dt, confusion_matrix_dt = (
    metric(y_test, y_pred_dt)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

In [None]:

# Compile the results of every trained model into one dictionary for comparison.
# Logistic Regression, Random Forest and SVM are evaluated in earlier cells but
# were missing here, so the "all results" comparison covered only half the models.
# Each entry keeps the same three keys the original dictionary used.
model_performance = {
    model_name: {
        'Accuracy': accuracy,
        'Classification Report': report,
        'Confusion Matrix': matrix,
    }
    for model_name, accuracy, report, matrix in [
        ('Logistic Regression', accuracy_logistic, classification_report_logistic, confusion_matrix_logistic),
        ('Random Forest', accuracy_rf, classification_report_rf, confusion_matrix_rf),
        ('SVM', accuracy_svm, classification_report_svm, confusion_matrix_svm),
        ('KNN', accuracy_knn, classification_report_knn, confusion_matrix_knn),
        ('Naive Bayes', accuracy_nb, classification_report_nb, confusion_matrix_nb),
        ('Decision Tree', accuracy_dt, classification_report_dt, confusion_matrix_dt),
    ]
}

# Show the compiled results as the cell output
model_performance

In [None]:
# Print a concise summary of df (columns, non-null counts, dtypes).
# NOTE(review): this repeats an earlier df.info() cell on the same, unchanged df.
df.info()

In [None]:
from autoviz import AutoViz_Class
%matplotlib inline


In [None]:
# Run AutoViz on the in-memory DataFrame: the data is supplied through `dfte`,
# so the filename argument is left as an empty string
AV = AutoViz_Class()

target_variable = "COVID-19"
filename = df

dft = AV.AutoViz(
    filename="",
    dfte=df,
    depVar=target_variable,
    sep=",",
    header=0,
    verbose=2,
    lowess=False,
    chart_format="svg",
    max_rows_analyzed=500,
    max_cols_analyzed=20,
    save_plot_dir=None,
)

In [None]:
# Step 3: Build an EDA report using ydata-profiling
# profile = ProfileReport(df, title="Laporan EDA Covid Dataset", explorative=True)


In [None]:
# Step 5: Display the report in the notebook (optional)
# profile.to_notebook_iframe()

In [None]:
# AV = AutoViz_Class()

# Step 4: Generate automatic visualizations
# The first parameter is the file path, the second parameter is the DataFrame
# 'df' is the DataFrame that has already been loaded
# visual = AV.AutoViz(filename="", dfte=df, depVar="", verbose=2, lowess=False)


In [None]:
# Data Understanding: collect dtypes, missing-value counts and summary statistics
data_types = df.dtypes
missing_values = df.isna().sum()
stat_summary = df.describe(include='all')

# Show the three results together as the cell output
{
    "Missing Values": missing_values,
    "Data Types": data_types,
    "Statistical Summary": stat_summary,
}


In [None]:
# Step 4: Drop specific columns if they are present in the dataset.
# - Only columns that actually exist are dropped, so re-running this cell (df is
#   rebound here) no longer raises KeyError on already-removed columns.
# - Names are compared after stripping surrounding whitespace because this
#   notebook spells one header both as 'Gastrointestinal' and 'Gastrointestinal '.
df_drop_names = {
    'Chronic Lung Disease',
    'Gastrointestinal',
    'Wearing Masks',
    'Sanitization from Market',
}
df = df.drop(columns=[col for col in df.columns if str(col).strip() in df_drop_names])


In [None]:
# Columns to exclude from the feature matrix
columns_to_drop = [
    "Chronic Lung Disease", 
    "Gastrointestinal ", 
    "Wearing Masks", 
    "Sanitization from Market"
]

# Remove the listed columns from X.
# X is derived from covid_data_cleaned, which already lacks 'Wearing Masks' and
# 'Sanitization from Market', so a strict X.drop(columns=columns_to_drop) raises
# KeyError. Drop only the columns that are present, comparing names with
# surrounding whitespace stripped so 'Gastrointestinal' matches with or without
# its trailing space.
names_to_drop = {name.strip() for name in columns_to_drop}
X_selected = X.drop(columns=[col for col in X.columns if str(col).strip() in names_to_drop])


In [None]:
# Print a concise summary of df after the column drops (columns, non-null counts, dtypes)
df.info()

In [None]:
# Print a concise summary of df (columns, non-null counts, dtypes).
# NOTE(review): identical to the df.info() cell directly before it.
df.info()

In [None]:
# Step 4: Save the ydata-profiling report to an HTML file.
# `profile` is only created by the ProfileReport cell, which is commented out in
# this notebook, so an unguarded call fails with NameError on a fresh kernel.
# NOTE(review): this writes to the same 'eda_report.html' that the Sweetviz cell
# generates, so running both overwrites the earlier report - confirm intent.
if 'profile' in globals():
    profile.to_file("eda_report.html")
else:
    print("Skipping ydata-profiling export: `profile` is not defined "
          "(the ProfileReport cell is commented out).")

# 3. Data Preparation

# 4. Modelling

# 5. Evaluation

# 6. Deployment

In [None]:
# Helper that draws a count plot for a single column
def plot_bar_chart(data, column):
    """Draw and show a count plot of ``column`` taken from ``data``.

    Args:
        data: Table handed to ``sns.countplot`` as its ``data`` argument.
        column: Name of the column placed on the x-axis.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=column, data=data, ax=ax)
    ax.set_title(f'Distribusi {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Jumlah')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

# One count plot per attribute
for col_name in df.columns:
    plot_bar_chart(df, col_name)

# Helper that draws a count plot split by COVID-19 status
def plot_stacked_bar_chart(data, column):
    """Draw and show a count plot of ``column`` with one bar per COVID-19 status.

    The bars are separated by ``hue='COVID-19'``, so ``data`` must contain a
    'COVID-19' column.

    Args:
        data: Table handed to ``sns.countplot`` as its ``data`` argument.
        column: Name of the column placed on the x-axis.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=column, hue='COVID-19', data=data, ax=ax)
    ax.set_title(f'Distribusi {column} berdasarkan Status COVID-19')
    ax.set_xlabel(column)
    ax.set_ylabel('Jumlah')
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.legend(title='COVID-19', loc='upper right')
    fig.tight_layout()
    plt.show()

# One status-split count plot per attribute, skipping the target column itself
for col_name in (name for name in df.columns if name != 'COVID-19'):
    plot_stacked_bar_chart(df, col_name)

In [None]:
# Display the distribution of the target variable (COVID-19)
plt.figure(figsize=(8, 6))
df['COVID-19'].value_counts().plot(kind='bar')
plt.title('Distribution of COVID-19 Cases')
plt.xlabel('COVID-19')
plt.ylabel('Count')
plt.show()

# Correlation heatmap.
# The rows previewed by df.head() hold 'Yes'/'No' strings, so selecting only
# int64/float64 columns can leave nothing to correlate, and sns.heatmap then
# fails on the empty matrix. Map Yes/No answers to 1/0 first; columns that are
# already numeric pass through unchanged.
encoded_for_corr = df.apply(
    lambda col: col if pd.api.types.is_numeric_dtype(col) else col.map({'Yes': 1, 'No': 0})
)
numerical_cols = encoded_for_corr.select_dtypes(include='number').columns
# Constant or all-missing columns have an undefined correlation; leave them out
numerical_cols = [name for name in numerical_cols if encoded_for_corr[name].nunique() > 1]
if len(numerical_cols) >= 2:
    plt.figure(figsize=(12, 10))
    sns.heatmap(encoded_for_corr[numerical_cols].corr(), annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()
else:
    print('Correlation heatmap skipped: fewer than two non-constant numeric columns.')

# Display the distribution of a few key features, split by COVID-19 status
key_features = ['Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat']
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# axes.flat walks the 2x2 grid row by row, matching the order of key_features
for ax, feature in zip(axes.flat, key_features):
    sns.countplot(x=feature, hue='COVID-19', data=df, ax=ax)
    ax.set_title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()