# Imports

In [None]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.inspection import permutation_importance

import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn.metrics import classification_report

# Load Dataset

In [None]:
# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Load the dataset; the relative path is resolved against the working directory.
df = pd.read_csv('fetal_health.csv')

In [None]:
# Narrow the frame to the listed columns, with `fetal_health` kept last
# because the following cell takes the last column as the label.
df = df[[
    'baseline value',
    'accelerations',
    'fetal_movement',
    'uterine_contractions',
    'light_decelerations',
    'severe_decelerations',
    'prolongued_decelerations',
    'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability',
    'fetal_health',
]]

In [None]:
# Features: every column except the last. Label: the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Count the rows for each `fetal_health` code (1, 2 and 3) and print them,
# which makes the class balance visible before any model is trained.
normal_count = (df['fetal_health'] == 1).sum()
suspect_count = (df['fetal_health'] == 2).sum()
pathological_count = (df['fetal_health'] == 3).sum()
# Label typo fixed: "Suspec" -> "Suspect".
print("Normal Count: ", normal_count, " Suspect Count: ", suspect_count, " Pathological Count: ", pathological_count)

# Functions

In [None]:
def PrintResults(X, y, model, labels=(1, 2, 3), random_state=None):
    """Fit ``model`` on a random train split and report metrics on the test split.

    Prints the classification report and the balanced accuracy, then plots
    the confusion matrix.

    Args:
        X: Feature table passed to ``train_test_split``.
        y: Labels aligned with ``X``.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        labels: Class codes, in the order used for the confusion-matrix rows
            and columns. Defaults to the codes 1, 2, 3.
        random_state: Forwarded to ``train_test_split``; pass an int to make
            the split (and therefore the printed metrics) reproducible.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))
    print("Balanced Accuracy: ", balanced_accuracy_score(y_test, y_pred))

    class_labels = list(labels)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    # Without display_labels the plot ticks are 0..n-1, which does not match
    # the class codes used to build the matrix.
    ConfusionMatrixDisplay(cm, display_labels=class_labels).plot()
    

In [None]:
def ShowFeatureImportance(X, y, model):
    """Refit ``model`` on a new random train split and plot its feature importances.

    The importances are read from ``model.feature_importances_`` after
    fitting, so the estimator must expose that attribute. Bars are ordered
    from most to least important.

    Args:
        X: Feature table; its column names label the bars.
        y: Labels aligned with ``X``.
        model: Estimator to fit; it is fitted in place.
    """
    # Only the training part of the split is used here.
    X_train, _, y_train, _ = train_test_split(X, y)
    fitted = model.fit(X_train, y_train)

    importance_table = pd.DataFrame(
        {'Feature': X.columns, 'Feature importance': fitted.feature_importances_}
    ).sort_values(by='Feature importance', ascending=False)

    _, ax = plt.subplots(figsize=(20, 10))
    sns.barplot(data=importance_table, x="Feature", y="Feature importance", ax=ax)
    ax.set_title("Feature Importance", fontsize=16)
    ax.set_xlabel("Feature", fontsize=14)
    ax.set_ylabel("Feature Importance", fontsize=14)
    # Rotate the tick labels so the long column names stay readable.
    ax.tick_params(axis="x", labelrotation=45, labelsize=12)
    plt.tight_layout()
    plt.show()

In [None]:
# Keep the 8 features that score highest under `f_classif`.
# NOTE(review): the selector is fitted on the full data set, before any
# train/test split is made, so the later test splits are not independent of
# this selection step.
selector = SelectKBest(score_func=f_classif, k=8)
# Only the fitted support mask is needed, so fit without transforming.
selector.fit(X, y)

cols_idxs = selector.get_support(indices=True)
# The indices are positions within X (what the selector was fitted on), so
# they are applied to X rather than df. Indexing df only lined up on the first
# run; re-running the cell would silently pick the first 8 columns of df.
X = X.iloc[:, cols_idxs]

# DecisionTreeClassifier

In [None]:
# Decision tree built with the constructor's default settings, then
# fitted and scored on a random split by PrintResults.
model_dt = DecisionTreeClassifier()
PrintResults(X=X, y=y, model=model_dt)

In [None]:
# Refits the tree on a new random split and plots its feature importances.
ShowFeatureImportance(X=X, y=y, model=model_dt)

# Naive Bayes

In [None]:
# Gaussian Naive Bayes with the constructor's default settings, then
# fitted and scored on a random split by PrintResults.
model_nb = GaussianNB()
PrintResults(X=X, y=y, model=model_nb)

In [None]:
# Feature importance for the Naive Bayes model. This cell uses permutation
# importance rather than ShowFeatureImportance, which reads the estimator's
# `feature_importances_` attribute.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model_nb = model_nb.fit(X_train, y_train)

# NOTE: the importances are measured on the training split, not on X_test.
imps_gb = permutation_importance(model_nb, X_train, y_train)

# One row per feature, ordered from the largest mean importance to the smallest.
hist_bay = pd.DataFrame(
    {'Feature': X.columns, 'Feature importance': imps_gb.importances_mean}
).sort_values(by='Feature importance', ascending=False)

fig, axes = plt.subplots(figsize=(20, 10))
sns.barplot(data=hist_bay, x="Feature", y="Feature importance", ax=axes)
axes.set_title("Gaussian Naive Bayes Feature Importance", fontsize=16)
axes.set_xlabel("Feature", fontsize=14)
axes.set_ylabel("Feature Importance", fontsize=14)
# Rotate the tick labels so the long column names stay readable.
axes.tick_params(axis="x", labelrotation=45, labelsize=12)

plt.tight_layout()
plt.show()

# RandomForestClassifier

In [None]:
# Random forest of 200 trees, fitted and scored on a random split by PrintResults.
model_rf = RandomForestClassifier(n_estimators=200)
PrintResults(X=X, y=y, model=model_rf)

In [None]:
# Refits the forest on a new random split and plots its feature importances.
ShowFeatureImportance(X=X, y=y, model=model_rf)

# GradientBoostingClassifier

In [None]:
# Gradient boosting with the constructor's default settings, then
# fitted and scored on a random split by PrintResults.
model_gb = GradientBoostingClassifier()
PrintResults(X=X, y=y, model=model_gb)

In [None]:
# Refits the boosted model on a new random split and plots its feature importances.
ShowFeatureImportance(X=X, y=y, model=model_gb)

# KNN

In [None]:
# Sweep k = 1..99 and record the balanced accuracy obtained for each value.
k_values = np.arange(1, 100)

# Split once, outside the loop, so every k is scored on the same train/test
# data. Drawing a new random split per k mixes split-to-split noise into the
# curve and into the choice of the best k.
X_train, X_test, y_train, y_test = train_test_split(X, y)

accuracies = []
for k in k_values:
    model_knn = KNeighborsClassifier(n_neighbors=k)
    model_knn.fit(X_train, y_train)
    y_pred = model_knn.predict(X_test)
    accuracies.append(balanced_accuracy_score(y_test, y_pred))

# Plot the score as a function of k.
fig = plt.figure()
plt.plot(k_values, accuracies)
plt.xlabel('k')
plt.ylabel('Balanced Accuracy')
plt.title('Balanced Accuracy by k')
plt.show()

In [None]:
# Take the k whose recorded balanced accuracy is highest, then fit and
# report a KNN model with that k on a fresh random split via PrintResults.
best_k = k_values[np.asarray(accuracies).argmax()]
model_knn = KNeighborsClassifier(n_neighbors=best_k)
PrintResults(X=X, y=y, model=model_knn)