# Imports and Data Handling

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt
import src as tools
from tensorflow import keras
from tensorflow.keras import layers
from imblearn.over_sampling import SMOTE
#import seaborn as sns

In [None]:
#Seed NumPy's global random number generator so NumPy-driven randomness repeats between runs.
#NOTE(review): no TensorFlow/Keras seed is set in the visible cells, so the neural-network results
#further down are presumably not reproducible run-to-run -- confirm whether that matters.
np.random.seed(0)

In [None]:
#Load the raw table. NOTE: this is an absolute path on one specific machine.
raw_df = pd.read_csv("C:\\Users\\sharv\\184\\final\\FetusHealthML\\Project\\fetal_health.csv")

#As discussed in the project proposal, we will experiment with using only the first 7 features that are actual recordings of
#the patients monitoring.
monitoring_columns = [
    'baseline value',
    'accelerations',
    'fetal_movement',
    'uterine_contractions',
    'light_decelerations',
    'severe_decelerations',
    'prolongued_decelerations',
]
short_df = raw_df[monitoring_columns].to_numpy()

#Labels stay a single-column 2-D array (double brackets) and are shifted down by 1.
#Presumably this maps classes 1..3 onto 0..2 for the 3-unit softmax networks used later -- confirm against the CSV.
y_labels = raw_df[['fetal_health']].to_numpy() - 1

#Full feature matrix: every column except the label.
df = raw_df.drop(columns="fetal_health").to_numpy()

print("Original dataset:",df.shape[0],"datapoints,",df.shape[1],"features")
print("Shortened Dataset:",short_df.shape[0],"datapoints,",short_df.shape[1],"features")
print("True labels of Dataset:",y_labels.shape)
#Thankfully all the values are numerical, no need to reencode them
# TODO: MAYBE WANT TO NORMALIZE DATASET? CONSIDER DELETING SEVERE_DECELERATIONS COL SINCE ITS ALMOST ALL 0??

In [None]:
#Split the 7-feature data into a training part and a 20% testing part, stratified on the labels.
#Note: a validation set is really only needed for the Neural Network model; it gets its own split separately.
split_parts = train_test_split(short_df, y_labels, test_size=0.2, stratify=y_labels)
x_train, x_test, y_train, y_test = split_parts
print(f"Training Set size: {x_train.shape}")
print(f"Training Targt size {y_train.shape}")
print(f"Testing Set size: {x_test.shape}")

# KNN(small dataset & cross validation)

In [None]:
#Using KNeighborsClassifier from the sklearn library, we will experiment with different values of k (1-250),
#scoring each k by its mean 5-fold cross-validation error on the 7-feature dataset.

#Hold the test set out BEFORE tuning k: tuning on data that later lands in the test set would leak
#test information into the choice of k and make the final score optimistic.
x_train, x_test, y_train, y_test = train_test_split(short_df, y_labels, test_size=0.2,stratify=y_labels)
print("Training Set size:",x_train.shape)
print("Training Targt size", y_train.shape)
print("Testing Set size:",x_test.shape)
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

display(x_train)

nFolds = 5 #constant for the whole sweep, so it is set once outside the loops
maxK = 250
KNN_crossValidationErrors = [0]*maxK #index i holds the mean validation error for k = i+1

for k in range(1, maxK+1):
    if(k%25==0):
        print(100*k/maxK) #Scuffed progress bar (percent of the sweep done)
    for iFold in range(nFolds):
        #presumably returns (train X, validation X, train y, validation y) for fold iFold -- confirm against src
        Xti, Xvi, Yti, Yvi = tools.crossValidate(x_train, y_train,nFolds, iFold)
        knnClassifier = KNeighborsClassifier(n_neighbors=k)
        #Flatten the labels to 1-D, as the other KNN cells do with .ravel(); y_labels is built as a single column
        knnClassifier.fit(Xti, np.ravel(Yti))

        cross_validation_pred = knnClassifier.predict(Xvi)
        cross_validation_error = 1-accuracy_score(Yvi,cross_validation_pred)
        KNN_crossValidationErrors[k-1] += cross_validation_error
    KNN_crossValidationErrors[k-1] = KNN_crossValidationErrors[k-1]/nFolds

plt.semilogx(range(1, len(KNN_crossValidationErrors) + 1), KNN_crossValidationErrors, color='g')
plt.xlabel('k Value')
plt.ylabel('Error Rate')
plt.show()

#The list is 0-indexed but k starts at 1, hence the +1
bestk = KNN_crossValidationErrors.index(min(KNN_crossValidationErrors))+1
print("Lowest error rate is",min(KNN_crossValidationErrors),"at k =",bestk)


#Confusion Matrix for best performing KNN, measured on the held-out test set that took no part in choosing k.
knnClassifier = KNeighborsClassifier(n_neighbors=bestk)
knnClassifier.fit(x_train, np.ravel(y_train))
y_test_pred = knnClassifier.predict(x_test)
matrix = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=matrix)
disp.plot()
print("Accuracy score: {}".format(accuracy_score(y_test,y_test_pred)))
print("F1 score: {}".format(f1_score(y_test, y_test_pred,average="weighted")))

# KNN(normal dataset & NO cross validation)

In [None]:
#Same experiment on the full feature matrix (df), using one stratified train/test split instead of cross-validation.
#To save time only k = 1..100 is swept; in all likelihood the best k reveals itself early on.
x_train2, x_test2, y_train2, y_test2 = train_test_split(df, y_labels, test_size=0.2,stratify=y_labels)
print("Training Set size:",x_train2.shape)
print("Testing Set size:",x_test2.shape)

flat_y_train2 = y_train2.ravel() #1-D label vector for fitting
KNN_trainErrors = [] #position i holds the error for k = i+1
KNN_testErrors = []
for k in range(1, 101):
    if k % 10 == 0:
        print(k) #Scuffed progress bar
    knnClassifier = KNeighborsClassifier(n_neighbors=k).fit(x_train2, flat_y_train2)
    KNN_trainErrors.append(1 - accuracy_score(y_train2, knnClassifier.predict(x_train2)))
    KNN_testErrors.append(1 - accuracy_score(y_test2, knnClassifier.predict(x_test2)))

#Green: training error, red: testing error
k_axis = range(1, 101)
plt.semilogx(k_axis, KNN_trainErrors, color='g')
plt.semilogx(k_axis, KNN_testErrors, color='r')
plt.xlabel('k Value')
plt.ylabel('Error Rate')
plt.show()

#NOTE: k is picked by its error on the test split, and the scores printed below are measured on that same split.
lowest_test_error = min(KNN_testErrors)
bestk = KNN_testErrors.index(lowest_test_error) + 1
print("Lowest error rate is",lowest_test_error,"at k =",bestk)

knnClassifier = KNeighborsClassifier(n_neighbors=bestk)
knnClassifier.fit(x_train2, flat_y_train2)
y_test_pred2 = knnClassifier.predict(x_test2)

matrix = confusion_matrix(y_test2, y_test_pred2)
disp = ConfusionMatrixDisplay(confusion_matrix=matrix)
disp.plot()
print(f"Accuracy: {accuracy_score(y_test2, y_test_pred2)}")
print(f"F1 Score: {f1_score(y_test2, y_test_pred2, average='weighted')}")

# KNN(normal dataset & cross validation)

In [None]:
#Same process on the entire feature set, with k (1-100) chosen by mean 5-fold cross-validation error.
#In all likelihood, the best k value will reveal itself early on.

#Hold the test set out BEFORE tuning k so the samples used to score the final model never influence the choice of k.
x_train2, x_test2, y_train2, y_test2 = train_test_split(df, y_labels, test_size=0.2,stratify=y_labels)
print("Training Set size:",x_train2.shape)
print("Training Labels size:",y_train2.shape)

x_train2 = np.array(x_train2)
y_train2 = np.array(y_train2)

nFolds = 5 #constant for the whole sweep, so it is set once outside the loops
maxK = 100
#NOTE: despite the name, this holds the mean cross-VALIDATION error for k = index+1 (name kept for compatibility)
KNN_trainErrors = [0]*maxK
for k in range(1, maxK+1):
    if(k%10==0):
        print(k) #Scuffed progress bar
    for iFold in range(nFolds):
        #presumably returns (train X, validation X, train y, validation y) for fold iFold -- confirm against src
        Xti, Xvi, Yti, Yvi = tools.crossValidate(x_train2, y_train2,nFolds, iFold)
        knnClassifier = KNeighborsClassifier(n_neighbors=k)
        #Flatten the labels to 1-D, matching the .ravel() used for the final fit below
        knnClassifier.fit(Xti, np.ravel(Yti))

        y_train_pred2 = knnClassifier.predict(Xvi)

        train_accuracy2 = accuracy_score(Yvi,y_train_pred2)
        train_error2 = 1-train_accuracy2
        KNN_trainErrors[k-1] += train_error2
    #Average over the actual fold count rather than a hard-coded 5
    KNN_trainErrors[k-1] = KNN_trainErrors[k-1]/nFolds

plt.semilogx(range(1, len(KNN_trainErrors) + 1), KNN_trainErrors, color='g')
plt.xlabel('k Value')
plt.ylabel('Error Rate')
plt.show()

bestk = KNN_trainErrors.index(min(KNN_trainErrors))+1
print("Lowest cross-validation error rate is",min(KNN_trainErrors),"at k =",bestk)

#Final check of the chosen k on the held-out test set.
knnClassifier = KNeighborsClassifier(n_neighbors=bestk)
knnClassifier.fit(x_train2, y_train2.ravel())
y_test_pred2 = knnClassifier.predict(x_test2)
matrix = confusion_matrix(y_test2, y_test_pred2)
disp = ConfusionMatrixDisplay(confusion_matrix=matrix)
disp.plot()
print("Accuracy: {}".format(accuracy_score(y_test2,y_test_pred2)))
print("F1 Score: {}".format(f1_score(y_test2, y_test_pred2, average="weighted")))

Overall, KNN had decent predictive power. As expected, once more features were considered, KNN's prediction accuracy on the test sets declined significantly.

## MLP 

In [None]:
#Simple starting network for the 7-feature input:
#two ReLU hidden layers (25 and 12 units) feeding a 3-unit softmax output.
model = keras.Sequential(
    [
        layers.Dense(25, activation='relu', input_dim=7),
        layers.Dense(12, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ],
    name="FetusHealthNNmodel1",
)

In [None]:
#Adam optimizer with sparse categorical cross-entropy loss; accuracy is reported during training.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
#summary() prints the layer table itself and returns None, so it is called directly
#instead of being wrapped in print(), which would emit a stray "None" line.
model.summary()

In [None]:
#5-fold cross-validation of the 7-feature network.
nFolds = 5
testingCrossValidationError = [] #NOTE: despite the name, this collects the per-fold validation ACCURACY
f1scoreAvg = [] #per-fold weighted F1 score
for iFold in range(nFolds):
    #presumably returns (train X, validation X, train y, validation y) for fold iFold -- confirm against src
    Xti, Xvi, Yti, Yvi = tools.crossValidate(short_df, y_labels,nFolds, iFold)
    #Train a freshly initialised copy for every fold. Fitting the same `model` object fold after fold keeps the
    #weights learned on earlier folds, so each validation fold would already have been trained on.
    foldModel = keras.models.clone_model(model)
    foldModel.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    foldModel.fit(Xti, Yti, epochs=5)
    y_test_pred = np.argmax(foldModel.predict(Xvi),axis=1) #to get the prediction based off of the softmax results
    print(y_test_pred)
    test_accuracy = accuracy_score(Yvi,y_test_pred)
    testingCrossValidationError.append(test_accuracy)
    f1scoreAvg.append(f1_score(Yvi, y_test_pred,average="weighted"))

In [None]:
#Average the per-fold scores collected by the cross-validation loop and report them.
avgTestAccuracy = sum(testingCrossValidationError)/nFolds
avgF1Score = sum(f1scoreAvg)/nFolds
print(f"Testing Accuracy: {avgTestAccuracy}")
print(f"F1 Score Avg: {avgF1Score}")

In [None]:
#Same architecture as before, but the input layer now takes all 21 features of the whole dataset.
network_layers = [
    layers.Dense(25, activation='relu', input_dim=21),
    layers.Dense(12, activation='relu'),
    layers.Dense(3, activation='softmax'),
]
model = keras.Sequential(network_layers, name="FetusHealthNNmodel2")

In [None]:
#Adam optimizer with sparse categorical cross-entropy loss; accuracy is reported during training.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
#summary() prints the layer table itself and returns None; calling it bare avoids printing a stray "None".
model.summary()

In [None]:
#5-fold cross-validation of the 21-feature network on the whole dataset.
nFolds = 5
testingCrossValidationError = [] #NOTE: despite the name, this collects the per-fold validation ACCURACY
f1scoreAvg=[] #per-fold weighted F1 score
for iFold in range(nFolds):
    #presumably returns (train X, validation X, train y, validation y) for fold iFold -- confirm against src
    Xti, Xvi, Yti, Yvi = tools.crossValidate(df, y_labels,nFolds, iFold)
    #A fresh, re-initialised copy per fold: reusing one `model` would carry weights over from earlier folds,
    #meaning the current validation fold had already been used for training.
    foldModel = keras.models.clone_model(model)
    foldModel.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    foldModel.fit(Xti, Yti, epochs=5)
    y_test_pred = np.argmax(foldModel.predict(Xvi),axis=1) #to get the prediction based off of the softmax results
    test_accuracy = accuracy_score(Yvi,y_test_pred)
    testingCrossValidationError.append(test_accuracy)
    f1scoreAvg.append(f1_score(Yvi, y_test_pred,average="weighted"))

In [None]:
#Report the mean accuracy and mean weighted F1 over the folds.
accuracy_total = sum(testingCrossValidationError)
f1_total = sum(f1scoreAvg)
avgTestAccuracy = accuracy_total/nFolds
print(f"Testing Accuracy: {avgTestAccuracy}")
print(f"F1 Score Avg: {f1_total/nFolds}")

In [None]:
# The models are performing poorly most likely because the data set is heavily imbalanced towards the 0 class compared
#to 1 and 2. To address this, the model will be trained with class weights. Entire dataset is still used.

#EDIT: THE MODELS KEPT GUESSING ONE LABEL BECAUSE OF SGD OPTIMIZER. ONCE MODIFIED TO ADAM OPTIMIZER, IT STARTED PERFORMING
#BETTER. APPROXIMATELY 88 PERCENT ACCURACY.
unique, counts = np.unique(y_labels, return_counts=True)
#Class weights created using the inverse class frequency method:
#    weight[c] = n_samples / (n_classes * n_samples_in_class_c)
#The number of classes is taken from the data instead of a hard-coded 3, and the keys are cast to plain ints
#because they are used as class indices (np.unique returns NumPy scalars in the labels' dtype).
class_weights = {
    int(label): float(y_labels.size/(len(unique)*count))
    for label, count in zip(unique, counts)
}
print(class_weights)


In [None]:
#With the class weights computed, rebuild and recompile the network from scratch, and hopefully obtain better results.
#Same architecture as before, on the whole (21-feature) dataset.
model = keras.Sequential(name="FetusHealthNNmodel2")
model.add(layers.Dense(25, activation='relu',input_dim=21))
model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(3,activation='softmax'))
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
#summary() prints the layer table itself and returns None; calling it bare avoids printing a stray "None".
model.summary()

In [None]:
#5-fold cross-validation of the 21-feature network, trained with the inverse-frequency class weights.
nFolds = 5
testingCrossValidationError = [] #NOTE: despite the name, this collects the per-fold validation ACCURACY
f1scoreAvg=[] #per-fold weighted F1 score
for iFold in range(nFolds):
    #presumably returns (train X, validation X, train y, validation y) for fold iFold -- confirm against src
    Xti, Xvi, Yti, Yvi = tools.crossValidate(df, y_labels,nFolds, iFold)
    #A fresh, re-initialised copy per fold: reusing one `model` would carry weights over from earlier folds,
    #meaning the current validation fold had already been used for training.
    foldModel = keras.models.clone_model(model)
    foldModel.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    foldModel.fit(Xti, Yti, epochs=5,class_weight=class_weights)
    y_test_pred = np.argmax(foldModel.predict(Xvi),axis=1) #to get the prediction based off of the softmax results
    test_accuracy = accuracy_score(Yvi,y_test_pred)
    testingCrossValidationError.append(test_accuracy)
    f1scoreAvg.append(f1_score(Yvi, y_test_pred,average="weighted"))

In [None]:
#Fold-averaged results for the class-weighted run.
avgTestAccuracy = sum(testingCrossValidationError)/nFolds
meanWeightedF1 = sum(f1scoreAvg)/nFolds
print(f"Testing Accuracy: {avgTestAccuracy}")
print(f"F1 Score Avg: {meanWeightedF1}")

In [None]:
#Unfortunately, class weights didn't seem to improve the accuracy at all.
#Instead of informing the model of class weights, we will be using SMOTE (Synthetic Minority Oversampling Technique) to
#forcefully balance the dataset. Hopefully, this will yield more promising results...
#After the data is split for a fold, imblearn's SMOTE algorithm is used to create a more balanced training set.
#This introduces another variable (how we choose to balance our dataset).
#The network itself is rebuilt from scratch with the same architecture, on the whole (21-feature) dataset.
model = keras.Sequential(name="FetusHealthNNmodel2")
model.add(layers.Dense(25, activation='relu',input_dim=21))
model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(3,activation='softmax'))
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
#summary() prints the layer table itself and returns None; calling it bare avoids printing a stray "None".
model.summary()

In [None]:
#5-fold cross-validation of the 21-feature network, with SMOTE oversampling applied to each training fold only.
nFolds = 5
testingCrossValidationError = [] #NOTE: despite the name, this collects the per-fold validation ACCURACY
f1scoreAvg=[] #per-fold weighted F1 score
#One sampler is enough: its configuration (fixed random_state, 7 neighbours) does not change between folds.
smoteMachine = SMOTE(random_state=42,k_neighbors=7)
for iFold in range(nFolds):
    #presumably returns (train X, validation X, train y, validation y) for fold iFold -- confirm against src
    Xti, Xvi, Yti, Yvi = tools.crossValidate(df, y_labels,nFolds, iFold)
    #After creating the split for this fold, rebalance the TRAINING part only; the validation part stays untouched.
    #The labels are flattened to 1-D before resampling (y_labels is built as a single column).
    XtiResample, YtiResample = smoteMachine.fit_resample(Xti, np.ravel(Yti))
    #A fresh, re-initialised copy per fold: reusing one `model` would carry weights over from earlier folds,
    #meaning the current validation fold had already been used for training.
    foldModel = keras.models.clone_model(model)
    foldModel.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    foldModel.fit(XtiResample, YtiResample, epochs=5)
    y_test_pred = np.argmax(foldModel.predict(Xvi),axis=1) #to get the prediction based off of the softmax results
    test_accuracy = accuracy_score(Yvi,y_test_pred)
    testingCrossValidationError.append(test_accuracy)
    f1scoreAvg.append(f1_score(Yvi, y_test_pred,average="weighted"))

In [None]:
#Fold-averaged results for the SMOTE-balanced run.
avgTestAccuracy = sum(testingCrossValidationError)/nFolds
smoteMeanF1 = sum(f1scoreAvg)/nFolds
print(f"Testing Accuracy: {avgTestAccuracy}")
print(f"F1 Score Avg: {smoteMeanF1}")