<a href="https://colab.research.google.com/github/srujayreddyv/CSC219-P1-HeartDiseaseDetection/blob/main/CSC219TeamProject_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CSC 219 - Machine Learning (Fall 2023)


# PROJECT 1: Heart Disease Detection using Neural Networks
### Team Challengers:
### 1. Alekya Paladugu
### 2. Samah Eltayeb
### 3. Srujay Reddy

# 1.0 UPLOADING THE DATASET

In [None]:
# Colab-only step: files.upload() shows a file picker and copies the chosen file(s) into the runtime.
# NOTE(review): presumably 'heart_statlog_cleveland_hungary_final.csv' (read in section 3.1) is the file to upload here.
from google.colab import files
uploaded = files.upload()

# 2.0 IMPORTS

In [None]:
import numpy as np
import pandas as pd

import scipy as sp
from scipy.stats import zscore

import sklearn as sk
import tensorflow as tf
%load_ext tensorboard

import io
import os
import sys
import requests
import datetime
import shutil
from collections.abc import Sequence

%matplotlib inline
import matplotlib as pltLib
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.feature_extraction.text as sk_text
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers

# Set random seed for TensorFlow
tf.random.set_seed(42)

# Set random seed for NumPy
np.random.seed(42)

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    For every distinct value in ``df[name]`` an indicator column called
    ``"<name>-<value>"`` is appended to ``df``; the source column is then
    dropped. The frame is modified in place and nothing is returned.
    """
    indicator_frame = pd.get_dummies(df[name])
    for level in indicator_frame.columns:
        df[f"{name}-{level}"] = indicator_frame[level]
    df.drop(columns=name, inplace=True)


def to_xy(df, target):
    """Split ``df`` into float32 feature and target arrays.

    Every column other than ``target`` becomes a feature. Returns ``(x, y)``:
    when the target column's dtype is int64/int32 it is treated as class
    labels and ``y`` is the one-hot matrix from ``pd.get_dummies``; otherwise
    ``y`` holds the raw target values.
    """
    feature_cols = [col for col in df.columns if col != target]

    # Work out the dtype of the target column.
    target_dtype = df[target].dtypes
    if isinstance(target_dtype, Sequence):
        target_dtype = target_dtype[0]

    # TensorFlow likes 32 bits.
    features = df[feature_cols].values.astype(np.float32)
    if target_dtype not in (np.int64, np.int32):
        # Regression: keep the target as a plain float vector.
        return features, df[target].values.astype(np.float32)

    # Classification: one indicator column per class label.
    one_hot_target = pd.get_dummies(df[target])
    return features, one_hot_target.values.astype(np.float32)


# 3.0 UNDERSTANDING THE DATA


## 3.1 READING THE DATASET  

In [None]:
# CSV file expected in the working directory.
Dataset_CSV = 'heart_statlog_cleveland_hungary_final.csv'

# Parse the comma-separated file, treating the literal strings 'NA' and '?' as missing values.
missing_markers = ['NA', '?']
df = pd.read_csv(Dataset_CSV, sep=",", na_values=missing_markers)

# Preview the first five rows.
df.head()

## 3.2 CHECKING FOR NULL VALUES


In [None]:
# Number of missing values per column ('NA' and '?' were mapped to NaN when the CSV was read).
df.isnull().sum()

## 3.3 CHECKING AND REMOVING DUPLICATE VALUES

In [None]:
# True if at least one row is an exact repeat of an earlier row.
df.duplicated().any()

In [None]:
# Boolean Series: True for every row that repeats an earlier row.
dups = df.duplicated()

# Summing the boolean mask counts the duplicate rows.
print('Number of duplicate rows = %d' % (dups.sum()))

In [None]:
# Work on a copy that carries an extra flag column, so `df` itself is left untouched.
df_Dups = df.assign(is_duplicated=df.duplicated(keep='first'))

# Tabulate how many rows are flagged as duplicates versus not.
count_dups = (
    df_Dups['is_duplicated']
    .value_counts()
    .reset_index()
    .set_axis(['col', 'count'], axis=1)
)
print(count_dups)

In [None]:
# drop_duplicates() returns a new DataFrame, so the result must be assigned back;
# otherwise `df` still contains the duplicate rows when it is preprocessed and modelled.
df = df.drop_duplicates().reset_index(drop=True)
print('Number of rows after removing duplicates = %d' % len(df))
df.head()

# 4.0 DATA PREPROCESSING

## 4.1 ENCODING CATEGORICAL FEATURES AND NORMALIZING NUMERIC FEATURES

In [None]:
# One-hot encode each categorical column in place; the 'target' label column is not encoded.
# Not idempotent: encode_text_dummy drops the source column, so re-running this cell raises a KeyError.
categorical_columns = [
    'sex',
    'chest pain type',
    'fasting blood sugar',
    'resting ecg',
    'exercise angina',
    'ST slope',
]
for column in categorical_columns:
    encode_text_dummy(df, column)

In [None]:
# Feature frame: every column except the label.
x=df.drop(["target"],axis=1)
# Label Series: the 'target' column.
y=df.target

In [None]:
# (rows, columns) of the feature frame after one-hot encoding.
x.shape

In [None]:
# Number of labels; should match the row count of x.
y.shape

In [None]:
# Standardise the feature frame with scipy's zscore (default axis=0, i.e. per column).
# NOTE(review): x also holds the one-hot indicator columns, and the statistics are computed on
# the full dataset before the train/test split in section 4.2.
xn1 = zscore(x)
# Only the last expression of a cell is rendered, so this bare .shape is not displayed.
xn1.shape
xn1

In [None]:
# Convert the standardised features and the labels to NumPy arrays for scikit-learn / Keras.
x1_numpy = xn1.to_numpy()
y1_numpy = y.to_numpy()

In [None]:
# Shape of the label array.
y1_numpy.shape

## 4.2 TRAIN/TEST SPLIT

In [None]:
# Hold out 25% of the rows as the test split; random_state fixes the shuffle for reproducibility.
x1_train, x1_test, y1_train, y1_test = train_test_split(x1_numpy, y1_numpy, test_size=0.25, random_state=42)

# 5.0 BUILDING MODELS

## 5.1 K-NEAREST NEIGHBORS (k-NN)

In [None]:
# Number of neighbours that vote on each prediction.
k = 5
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(x1_train, y1_train)

# Make predictions on the test data
y1_pred_knn = knn_classifier.predict(x1_test)

# Display classification report for k-NN
classification_rep1_knn = classification_report(y1_test, y1_pred_knn)
print("Classification Report for k-NN (5):")
print(classification_rep1_knn)

# Plot confusion matrix for k-NN (rows = actual class, columns = predicted class)
confusion1_knn = confusion_matrix(y1_test, y1_pred_knn)

plt.figure()
plt.title("Confusion Matrix for k-NN (5)")
sns.heatmap(confusion1_knn, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Plot ROC
# Score each test row with the predicted probability of the second class (classes_[1]).
# Hard 0/1 predictions would give roc_curve a single operating point instead of a curve.
# Assumes a binary target - TODO confirm.
y1_score_knn = knn_classifier.predict_proba(x1_test)[:, 1]
fprn, tprn, thresholds = roc_curve(y1_test, y1_score_knn)
roc_auc = auc(fprn, tprn)
plt.figure()
plt.plot(fprn, tprn, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

## 5.2 SUPPORT VECTOR MACHINE

In [None]:
# Support vector classifier with a polynomial kernel.
svm_Model = SVC(kernel='poly')
svm_Model.fit(x1_train, y1_train)
y1_pred_svm = svm_Model.predict(x1_test)

classification_rep1_svm = classification_report(y1_test, y1_pred_svm)
print("Classification Report for SVM")
print(classification_rep1_svm)

# Confusion matrix (rows = actual class, columns = predicted class)
confusion1_svm = confusion_matrix(y1_test,y1_pred_svm)

plt.figure()
plt.title("Confusion Matrix for SVM ")
sns.heatmap(confusion1_svm, annot=True, fmt="d", cmap="Reds")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC curve
# Use the signed distance to the decision boundary as a continuous score.
# Hard 0/1 predictions would give roc_curve a single operating point instead of a curve.
# Assumes a binary target, for which decision_function returns one score per row - TODO confirm.
y1_score_svm = svm_Model.decision_function(x1_test)
fprn, tprn, thresholds = roc_curve(y1_test, y1_score_svm)
roc_auc = auc(fprn, tprn)
plt.figure()
plt.plot(fprn, tprn, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

## 5.3 FULLY CONNECTED NEURAL NETWORKS

### 5.3.1 BASIC FULLY CONNECTED NEURAL NETWORK

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create an instance of the OneHotEncoder that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and later removed
# the old name, so try the new spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D input, so reshape the 1-D label vectors to a single column.
# Fit on the training labels only, then reuse the fitted categories for the test labels.
y1_train_encoded = encoder.fit_transform(y1_train.reshape(-1, 1))
y1_test_encoded = encoder.transform(y1_test.reshape(-1, 1))

In [None]:
# Clear any logs from previous runs
!rm -rf ./logs/

In [None]:
%tensorboard --logdir logs/fit/

In [None]:
from tensorflow.keras.callbacks import TensorBoard

# Timestamped TensorBoard log directory for this training session.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# One checkpoint callback is shared by every run below; with save_best_only=True it
# only writes the file when the monitored value improves.
# NOTE(review): presumably this keeps the best weights across all five runs - confirm that
# ModelCheckpoint retains its best value between fit() calls.
checkpointer = ModelCheckpoint(filepath="dnn/best_weights1.hdf5", verbose=0, save_best_only=True) # save best model

# Train the same architecture five times, each from a fresh random initialisation.
# NOTE(review): the test split is used as validation data, so early stopping and checkpoint
# selection see the same rows that are later used for reporting.
for i in range(5):
  Nmodel = Sequential()
  Nmodel.add(Dense(64, activation='relu'))
  Nmodel.add(Dense(32, activation='relu'))
  Nmodel.add(Dense(16, activation='relu'))
  Nmodel.add(Dense(4, activation='relu'))
  Nmodel.add(Dense(2, activation='softmax')) # Output

  Nmodel.compile(loss='categorical_crossentropy', optimizer='adam')

  # Stop when val_loss has not improved by at least 1e-3 for 5 consecutive epochs.
  monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience = 5, verbose = 2, mode = 'auto')

  # One TensorBoard callback per run, passed directly to fit().
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

  Nmodel.fit(x1_train, y1_train_encoded, validation_data=(x1_test, y1_test_encoded),
               callbacks=[monitor, checkpointer, tensorboard_callback],
               verbose=1, epochs=1000)

# Load the checkpointed weights into the last model built in the loop.
Nmodel.load_weights("dnn/best_weights1.hdf5")

# Class probabilities, one row per test sample and one column per class.
y1_pred_prob1 = Nmodel.predict(x1_test)

# Convert predicted probabilities to binary labels
y1_pred1 = np.argmax(y1_pred_prob1, axis=1)

# Convert one-hot encoded test labels to binary labels
y1_true1 = np.argmax(y1_test_encoded, axis=1)

In [None]:
# Classification Report
classification_rep1_fncc1 = classification_report(y1_true1, y1_pred1)
print("Classification Report for FCNN:\n")
print(classification_rep1_fncc1)

# Confusion Matrix (rows = true class, columns = predicted class)
# sns.heatmap alone renders the matrix, matching the k-NN and SVM cells; no separate
# plt.imshow layer is needed underneath it.
confusion1_fncc1 = confusion_matrix(y1_true1, y1_pred1)
plt.figure()
plt.title('Confusion Matrix for FCNN')
sns.heatmap(confusion1_fncc1, annot=True, fmt="d", cmap="Greens")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# Score with the softmax probability of class 1 rather than the arg-maxed labels:
# hard 0/1 labels would give roc_curve a single operating point instead of a curve.
fpr, tpr, thresholds = roc_curve(y1_true1, y1_pred_prob1[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


### 5.3.2 FULLY CONNECTED NEURAL NETWORK (RELU AND ADAM)

In [None]:
!pip install keras-tuner

In [None]:
# Clear any logs from previous runs
!rm -rf ./logs/

In [None]:
%tensorboard --logdir logs/fit/

In [None]:
import keras_tuner as kt

# Timestamped TensorBoard log directory for this training session.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Checkpoint file for this model; save_best_only=True writes only when the monitored value improves.
checkpointer = ModelCheckpoint(filepath="dnn/best_weights2.hdf5", verbose=0, save_best_only=True) # save best model

def build_model2(hp):
  """Build and compile a ReLU/Adam classifier from keras-tuner hyperparameters.

  Tuned values: ``num_layers`` (min_value=1, max_value=6), one ``units_<i>`` per
  hidden layer (32 to 512 in steps of 32) and ``learning_rate`` (1e-2, 1e-3 or 1e-4).

  The output layer, the compile step and the ``return`` must stay outside the
  layer loop: inside it the function would return after the first hidden layer,
  so ``num_layers`` would have no effect.

  Returns the compiled model, ending in a 2-unit softmax layer.
  """
  model = tf.keras.Sequential()

  # Tune the number of hidden layers and units per layer
  for i in range(hp.Int('num_layers', min_value=1, max_value=6)):
    model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32), activation='relu'))

  # Output layer with softmax activation
  model.add(Dense(2, activation='softmax'))

  # Tune the learning rate
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  # Compile the model with the tuned learning rate and Adam optimizer
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),loss='categorical_crossentropy', metrics=['accuracy'])

  return model

# Bayesian search over build_model2's hyperparameters, ranked by validation accuracy.
tuner = kt.BayesianOptimization(
    build_model2,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=1,
    overwrite=True,
)

# NOTE(review): the test split doubles as validation data for both the search and the final fit.
search_stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(
    x1_train,
    y1_train_encoded,
    epochs=10,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[search_stopper],
)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the winning configuration.
num_hidden_layers = best_hps.get('num_layers')
print("Optimal Hyperparameters:")
print(f"Number of Hidden Layers: {num_hidden_layers}")
for layer_idx in range(num_hidden_layers):
    units_key = 'units_' + str(layer_idx)
    if units_key not in best_hps:
        continue
    print(f"Layer {layer_idx + 1} - Units: {best_hps.get(units_key)}")
print(f"Learning Rate: {best_hps.get('learning_rate')}")

# Rebuild a fresh model from the best hyperparameters and train it with early stopping.
final_model = tuner.hypermodel.build(best_hps)
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')
final_model.fit(
    x1_train,
    y1_train_encoded,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[monitor, checkpointer, TensorBoard(log_dir=log_dir, histogram_freq=1)],
    verbose=2,
    epochs=1000,
)

# Restore the checkpointed weights before predicting.
final_model.load_weights("dnn/best_weights2.hdf5")

# Class probabilities for the test rows, then hard labels via arg-max.
y1_pred_prob2 = final_model.predict(x1_test)
y1_pred2 = y1_pred_prob2.argmax(axis=1)

# Undo the one-hot encoding of the test labels.
y1_true2 = y1_test_encoded.argmax(axis=1)


In [None]:
# Classification Report
classification_rep1_fncc2 = classification_report(y1_true2, y1_pred2)
print("Classification Report for FCNN:\n")
print(classification_rep1_fncc2)

# Confusion Matrix (rows = true class, columns = predicted class)
# sns.heatmap alone renders the matrix, matching the k-NN and SVM cells; no separate
# plt.imshow layer is needed underneath it.
confusion1_fncc2 = confusion_matrix(y1_true2, y1_pred2)
plt.figure(figsize=(5, 5))
plt.title('Confusion Matrix for FCNN (RELU & ADAM)')
sns.heatmap(confusion1_fncc2, annot=True, fmt="d", cmap="Reds")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# Score with the softmax probability of class 1 rather than the arg-maxed labels:
# hard 0/1 labels would give roc_curve a single operating point instead of a curve.
fpr, tpr, thresholds = roc_curve(y1_true2, y1_pred_prob2[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### 5.3.3 FULLY CONNECTED NEURAL NETWORK (TANH AND SGD)

In [None]:
# Clear any logs from previous runs
!rm -rf ./logs/

In [None]:
%tensorboard --logdir logs/fit/

In [None]:
# Timestamped TensorBoard log directory for this training session.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Define the best weights checkpoint
# (save_best_only=True writes the file only when the monitored value improves).
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath="dnn/best_weights3.hdf5", verbose=0, save_best_only=True)

# Define the model architecture without specifying hyperparameters
def build_model_sgd_tanh(hp):
    """Build and compile a tanh/SGD classifier from keras-tuner hyperparameters.

    Tuned values: ``num_layers`` (min_value=1, max_value=12), one ``units_<i>``
    per hidden layer (32 to 512 in steps of 32) and ``learning_rate``
    (1e-2, 1e-3 or 1e-4). Returns the compiled model, ending in a 2-unit
    softmax layer.
    """
    model = tf.keras.Sequential()

    # Hidden stack: depth and per-layer width are both tuned.
    hidden_layer_count = hp.Int('num_layers', min_value=1, max_value=12)
    for layer_idx in range(hidden_layer_count):
        width = hp.Int('units_' + str(layer_idx), min_value=32, max_value=512, step=32)
        model.add(Dense(units=width, activation='tanh'))

    # Two-class softmax output.
    model.add(Dense(2, activation='softmax'))

    # Plain SGD with a tunable learning rate.
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    sgd = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Bayesian search over build_model_sgd_tanh's hyperparameters, ranked by validation accuracy.
tuner_sgd_tanh = kt.BayesianOptimization(
    build_model_sgd_tanh,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=1,
    overwrite=True,
)

# NOTE(review): the test split doubles as validation data for both the search and the final fit.
search_stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner_sgd_tanh.search(
    x1_train,
    y1_train_encoded,
    epochs=10,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[search_stopper],
)

best_hps_sgd_tanh = tuner_sgd_tanh.get_best_hyperparameters(num_trials=1)[0]

# Report the winning configuration for SGD and tanh.
num_hidden_layers = best_hps_sgd_tanh.get('num_layers')
print("Optimal Hyperparameters for SGD and tanh:")
print(f"Number of Hidden Layers: {num_hidden_layers}")
for layer_idx in range(num_hidden_layers):
    units_key = 'units_' + str(layer_idx)
    if units_key not in best_hps_sgd_tanh:
        continue
    print(f"Layer {layer_idx + 1} - Units: {best_hps_sgd_tanh.get(units_key)}")
print(f"Learning Rate: {best_hps_sgd_tanh.get('learning_rate')}")

# Rebuild a fresh model from the best hyperparameters and train it with early stopping.
final_model_sgd_tanh = tuner_sgd_tanh.hypermodel.build(best_hps_sgd_tanh)
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')
final_model_sgd_tanh.fit(
    x1_train,
    y1_train_encoded,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[monitor, checkpointer, tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)],
    verbose=1,
    epochs=100,
)

# Restore the checkpointed weights before predicting.
final_model_sgd_tanh.load_weights("dnn/best_weights3.hdf5")

# Class probabilities for the test rows, then hard labels via arg-max.
y1_pred_prob_sgd_tanh = final_model_sgd_tanh.predict(x1_test)
y1_pred_sgd_tanh = y1_pred_prob_sgd_tanh.argmax(axis=1)

# Undo the one-hot encoding of the test labels.
y1_true_sgd_tanh = y1_test_encoded.argmax(axis=1)


In [None]:
# Classification Report
classification_rep1_fncc3 = classification_report(y1_true_sgd_tanh, y1_pred_sgd_tanh)
print("Classification Report for FCNN:\n")
print(classification_rep1_fncc3)

# Confusion Matrix (rows = true class, columns = predicted class)
# sns.heatmap alone renders the matrix, matching the k-NN and SVM cells; no separate
# plt.imshow layer is needed underneath it.
confusion1_fncc3 = confusion_matrix(y1_true_sgd_tanh, y1_pred_sgd_tanh)
plt.figure(figsize=(5, 5))
plt.title('Confusion Matrix for FCNN (TANH & SGD)')
sns.heatmap(confusion1_fncc3, annot=True, fmt="d", cmap="Greens")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# Score with the softmax probability of class 1 rather than the arg-maxed labels:
# hard 0/1 labels would give roc_curve a single operating point instead of a curve.
fpr, tpr, thresholds = roc_curve(y1_true_sgd_tanh, y1_pred_prob_sgd_tanh[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### 5.3.4 FULLY CONNECTED NEURAL NETWORK (RELU AND SGD)

In [None]:
# Clear any logs from previous runs
!rm -rf ./logs/

In [None]:
%tensorboard --logdir logs/fit/

In [None]:
# Timestamped TensorBoard log directory for this training session.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Define the best weights checkpoint
# (save_best_only=True writes the file only when the monitored value improves).
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath="dnn/best_weights4.hdf5", verbose=0, save_best_only=True)

# Define the model architecture without specifying hyperparameters
def build_model_relu_sgd(hp):
    """Build and compile a ReLU/SGD classifier from keras-tuner hyperparameters.

    Tuned values: ``num_layers`` (min_value=1, max_value=16), one ``units_<i>``
    per hidden layer (32 to 512 in steps of 32) and ``learning_rate``
    (1e-2, 1e-3 or 1e-4). Returns the compiled model, ending in a 2-unit
    softmax layer.
    """
    model = tf.keras.Sequential()

    # Hidden stack: depth and per-layer width are both tuned.
    depth = hp.Int('num_layers', min_value=1, max_value=16)
    for layer_idx in range(depth):
        n_units = hp.Int('units_' + str(layer_idx), min_value=32, max_value=512, step=32)
        hidden = Dense(units=n_units, activation='relu')
        model.add(hidden)

    # Two-class softmax output.
    model.add(Dense(2, activation='softmax'))

    # Plain SGD with a tunable learning rate.
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=step_size),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Bayesian search over build_model_relu_sgd's hyperparameters, ranked by validation accuracy.
# Each trial is executed twice (executions_per_trial=2).
tuner_relu_sgd = kt.BayesianOptimization(
    build_model_relu_sgd,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=2,
    overwrite=True,
)

# NOTE(review): the test split doubles as validation data for both the search and the final fit.
search_stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner_relu_sgd.search(
    x1_train,
    y1_train_encoded,
    epochs=10,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[search_stopper],
)

best_hps_relu_sgd = tuner_relu_sgd.get_best_hyperparameters(num_trials=1)[0]

# Report the winning configuration for ReLU and SGD.
num_hidden_layers = best_hps_relu_sgd.get('num_layers')
print("Optimal Hyperparameters for ReLU and SGD:")
print(f"Number of Hidden Layers: {num_hidden_layers}")
for layer_idx in range(num_hidden_layers):
    units_key = 'units_' + str(layer_idx)
    if units_key not in best_hps_relu_sgd:
        continue
    print(f"Layer {layer_idx + 1} - Units: {best_hps_relu_sgd.get(units_key)}")
print(f"Learning Rate: {best_hps_relu_sgd.get('learning_rate')}")

# Rebuild a fresh model from the best hyperparameters and train it with early stopping.
final_model_relu_sgd = tuner_relu_sgd.hypermodel.build(best_hps_relu_sgd)
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')
final_model_relu_sgd.fit(
    x1_train,
    y1_train_encoded,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[monitor, checkpointer, tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)],
    verbose=1,
    epochs=1000,
)

# Restore the checkpointed weights before predicting.
final_model_relu_sgd.load_weights("dnn/best_weights4.hdf5")

# Class probabilities for the test rows, then hard labels via arg-max.
y1_pred_prob_relu_sgd = final_model_relu_sgd.predict(x1_test)
y1_pred_relu_sgd = y1_pred_prob_relu_sgd.argmax(axis=1)

# Undo the one-hot encoding of the test labels.
y1_true_relu_sgd = y1_test_encoded.argmax(axis=1)


In [None]:
# Classification Report
classification_rep1_fncc4 = classification_report(y1_true_relu_sgd, y1_pred_relu_sgd)
print("Classification Report for FCNN:\n")
print(classification_rep1_fncc4)

# Confusion Matrix (rows = true class, columns = predicted class)
# sns.heatmap alone renders the matrix, matching the k-NN and SVM cells; no separate
# plt.imshow layer is needed underneath it.
confusion1_fncc4 = confusion_matrix(y1_true_relu_sgd, y1_pred_relu_sgd)
plt.figure(figsize=(5, 5))
plt.title('Confusion Matrix for FCNN (RELU & SGD)')
sns.heatmap(confusion1_fncc4, annot=True, fmt="d", cmap="Greens")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# Score with the softmax probability of class 1 rather than the arg-maxed labels:
# hard 0/1 labels would give roc_curve a single operating point instead of a curve.
fpr, tpr, thresholds = roc_curve(y1_true_relu_sgd, y1_pred_prob_relu_sgd[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### 5.3.5 FULLY CONNECTED NEURAL NETWORK (TANH AND ADAM)

In [None]:
# Clear any logs from previous runs
!rm -rf ./logs/

In [None]:
%tensorboard --logdir logs/fit/

In [None]:
import keras_tuner as kt

# Timestamped TensorBoard log directory for this training session.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Checkpoint file for this model; save_best_only=True writes only when the monitored value improves.
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath="dnn/best_weights5.hdf5", verbose=0, save_best_only=True)

# Define the model architecture without specifying hyperparameters
def build_model_tanh_adam(hp):
    """Build and compile a tanh/Adam classifier from keras-tuner hyperparameters.

    Tuned values: ``num_layers`` (min_value=1, max_value=6), one ``units_<i>``
    per hidden layer (32 to 512 in steps of 32) and ``learning_rate``
    (1e-2, 1e-3 or 1e-4). Returns the compiled model, ending in a 2-unit
    softmax layer.
    """
    model = tf.keras.Sequential()

    # Hidden stack: depth and per-layer width are both tuned.
    hidden_layer_count = hp.Int('num_layers', min_value=1, max_value=6)
    for layer_idx in range(hidden_layer_count):
        width = hp.Int('units_' + str(layer_idx), min_value=32, max_value=512, step=32)
        model.add(Dense(units=width, activation='tanh'))

    # Two-class softmax output.
    model.add(Dense(2, activation='softmax'))

    # Adam with a tunable learning rate.
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Bayesian search over build_model_tanh_adam's hyperparameters, ranked by validation accuracy.
# Each trial is executed three times (executions_per_trial=3).
tuner_tanh_adam = kt.BayesianOptimization(
    build_model_tanh_adam,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=3,
    overwrite=True,
)

# NOTE(review): the test split doubles as validation data for both the search and the final fit.
search_stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner_tanh_adam.search(
    x1_train,
    y1_train_encoded,
    epochs=10,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[search_stopper],
)

best_hps_tanh_adam = tuner_tanh_adam.get_best_hyperparameters(num_trials=1)[0]

# Report the winning configuration for TANH and ADAM.
num_hidden_layers = best_hps_tanh_adam.get('num_layers')
print("Optimal Hyperparameters for TANH and ADAM:")
print(f"Number of Hidden Layers: {num_hidden_layers}")
for layer_idx in range(num_hidden_layers):
    units_key = 'units_' + str(layer_idx)
    if units_key not in best_hps_tanh_adam:
        continue
    print(f"Layer {layer_idx + 1} - Units: {best_hps_tanh_adam.get(units_key)}")
print(f"Learning Rate: {best_hps_tanh_adam.get('learning_rate')}")

# Rebuild a fresh model from the best hyperparameters and train it with early stopping.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')
final_model_tanh_adam = tuner_tanh_adam.hypermodel.build(best_hps_tanh_adam)
final_model_tanh_adam.fit(
    x1_train,
    y1_train_encoded,
    validation_data=(x1_test, y1_test_encoded),
    callbacks=[monitor, checkpointer, tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)],
    verbose=1,
    epochs=1000,
)

# Restore the checkpointed weights before predicting.
final_model_tanh_adam.load_weights("dnn/best_weights5.hdf5")

# Class probabilities for the test rows, then hard labels via arg-max.
y1_pred_prob_tanh_adam = final_model_tanh_adam.predict(x1_test)
y1_pred_tanh_adam = y1_pred_prob_tanh_adam.argmax(axis=1)

# Undo the one-hot encoding of the test labels.
y1_true_tanh_adam = y1_test_encoded.argmax(axis=1)



In [None]:
# Classification Report
classification_rep1_fncc5 = classification_report(y1_true_tanh_adam, y1_pred_tanh_adam)
print("Classification Report for FCNN:\n")
print(classification_rep1_fncc5)

# Confusion Matrix (rows = true class, columns = predicted class)
# sns.heatmap alone renders the matrix, matching the k-NN and SVM cells; no separate
# plt.imshow layer is needed underneath it.
confusion1_fncc5 = confusion_matrix(y1_true_tanh_adam, y1_pred_tanh_adam)
plt.figure(figsize=(5, 5))
plt.title('Confusion Matrix for FCNN (TANH & ADAM)')
sns.heatmap(confusion1_fncc5, annot=True, fmt="d", cmap="Greens")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# Score with the softmax probability of class 1 rather than the arg-maxed labels:
# hard 0/1 labels would give roc_curve a single operating point instead of a curve.
fpr, tpr, thresholds = roc_curve(y1_true_tanh_adam, y1_pred_prob_tanh_adam[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# 6.0 Additional Feature: Create a balanced dataset

In [None]:
!pip install imbalanced-learn

## 6.1 OVERSAMPLING

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Rebuild the feature frame and label Series from df (before any normalisation).
x=df.drop(["target"],axis=1)
y=df.target
# Number of rows per class before resampling.
print (Counter(y))

In [None]:
# Randomly duplicate rows of the under-represented class to balance the label counts.
# A fixed random_state makes the resampled set, and every result derived from it, reproducible.
# NOTE(review): resampling happens before the train/test split in section 6.2, so copies of the
# same row can land in both splits and inflate test scores; consider resampling only the training split.
RS=RandomOverSampler(random_state=42)
XRS,YRS=RS.fit_resample(x,y)
# Number of rows per class after resampling.
print (Counter(YRS))

### 6.1.1 NORMALIZING NUMERIC FEATURES

In [None]:
# Standardise the oversampled feature frame with scipy's zscore (default axis=0, i.e. per column).
# NOTE(review): the statistics are computed on the full resampled set, before the train/test split.
xn = zscore(XRS)
# Only the last expression of a cell is rendered, so this bare .shape is not displayed.
xn.shape
xn

In [None]:
# Convert the standardised oversampled features and labels to NumPy arrays.
x_numpy=xn.to_numpy()
y_numpy =YRS.to_numpy()
x_numpy

## 6.2 TRAIN/TEST SPLIT

In [None]:
# Hold out 25% of the oversampled rows as the test split; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x_numpy, y_numpy, test_size=0.25, random_state=42)

## 6.3 MODEL SELECTION AFTER OVERSAMPLING

### 6.3.1 KNN

In [None]:
from sklearn.metrics import roc_auc_score

# Number of neighbours that vote on each prediction.
k = 5

# Create a k-NN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier on the training data
knn_classifier.fit(x_train, y_train)

# Make predictions on the test data
y_pred_knn2 = knn_classifier.predict(x_test)

# Display classification report for k-NN
classification_rep2_knn = classification_report(y_test, y_pred_knn2)
print("Classification Report for k-NN (5):")
print(classification_rep2_knn)

# Plot confusion matrix for k-NN (rows = actual class, columns = predicted class)
confusion2_knn = confusion_matrix(y_test, y_pred_knn2)

plt.figure()
plt.title("k-NN (5) After Oversampling")
sns.heatmap(confusion2_knn, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Plot ROC curve for k-NN
# Score each test row with the predicted probability of the second class (classes_[1]).
# Hard 0/1 predictions would give roc_curve a single operating point instead of a curve.
# Assumes a binary target - TODO confirm.
y_score_knn2 = knn_classifier.predict_proba(x_test)[:, 1]
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, y_score_knn2)
roc_auc_knn = roc_auc_score(y_test, y_score_knn2)

plt.figure()
# The legend must show this model's own AUC (roc_auc_knn), not `roc_auc` left over from an earlier model.
plt.plot(fpr_knn, tpr_knn, label='ROC curve (area = %0.2f)' % roc_auc_knn)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### 6.3.2 SVM

In [None]:
from sklearn.metrics import roc_auc_score

# Support vector classifier with a polynomial kernel, trained on the oversampled split.
svm_classifier = SVC(kernel='poly')

svm_classifier.fit(x_train, y_train)

y_pred_svm2 = svm_classifier.predict(x_test)

# Display classification report for SVM
classification_rep2_svm = classification_report(y_test, y_pred_svm2)
print("Classification Report for SVM:")
print(classification_rep2_svm)

# Plot confusion matrix for SVM (rows = actual class, columns = predicted class)
confusion2_svm = confusion_matrix(y_test, y_pred_svm2)

plt.figure(figsize=(5, 5))
plt.title("SVM after oversampling")
sns.heatmap(confusion2_svm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Plot ROC curve for SVM
# Use the signed distance to the decision boundary as a continuous score.
# Hard 0/1 predictions would give roc_curve a single operating point instead of a curve.
# Assumes a binary target, for which decision_function returns one score per row - TODO confirm.
y_score_svm2 = svm_classifier.decision_function(x_test)
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_score_svm2)
roc_auc_svm = roc_auc_score(y_test, y_score_svm2)

plt.figure()
# The legend must show this model's own AUC (roc_auc_svm), not `roc_auc` left over from an earlier model.
plt.plot(fpr_svm, tpr_svm, label='ROC curve (area = %0.2f)' % roc_auc_svm)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### 6.3.3 FULLY CONNECTED NEURAL NETWORK

In [None]:
# Clear any logs from previous runs
!rm -rf ./logs/

In [None]:
%tensorboard --logdir logs/fit/

In [None]:
# Reshape your training and testing labels if they are 1D arrays
# One-hot encode the oversampled labels. `encoder` is the OneHotEncoder instance created in
# section 5.3.1; fit_transform re-fits it on y_train, and the test labels reuse that fit.
# reshape(-1, 1) turns each 1-D label vector into the single-column 2-D input the encoder expects.
y_train_encoded = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.reshape(-1, 1))

In [None]:
# Timestamped TensorBoard log directory for this training session.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Checkpoint file for this model; save_best_only=True writes only when the monitored value improves.
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath="dnn/best_weights6.hdf5", verbose=0, save_best_only=True)

# Define the model architecture without specifying hyperparameters
def build_model_tanh_adam2(hp):
    """Build and compile a tanh/Adam classifier from keras-tuner hyperparameters.

    Tuned values: ``num_layers`` (min_value=1, max_value=16), one ``units_<i>``
    per hidden layer (32 to 512 in steps of 32) and ``learning_rate``
    (1e-2, 1e-3 or 1e-4). Returns the compiled model, ending in a 2-unit
    softmax layer.
    """
    model = tf.keras.Sequential()

    # Hidden stack: depth and per-layer width are both tuned.
    depth = hp.Int('num_layers', min_value=1, max_value=16)
    for layer_idx in range(depth):
        n_units = hp.Int('units_' + str(layer_idx), min_value=32, max_value=512, step=32)
        hidden = Dense(units=n_units, activation='tanh')
        model.add(hidden)

    # Two-class softmax output.
    model.add(Dense(2, activation='softmax'))

    # Adam with a tunable learning rate.
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=step_size),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Bayesian search over build_model_tanh_adam2's hyperparameters, ranked by validation accuracy.
# NOTE(review): this rebinds tuner_tanh_adam / best_hps_tanh_adam / final_model_tanh_adam from
# section 5.3.5, so those names refer to the oversampled-data model from here on.
tuner_tanh_adam = kt.BayesianOptimization(
    build_model_tanh_adam2,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=1,
    overwrite=True,
)

# NOTE(review): the test split doubles as validation data for both the search and the final fit.
tuner_tanh_adam.search(x_train, y_train_encoded, epochs=10, validation_data=(x_test, y_test_encoded),
                      callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)])

best_hps_tanh_adam = tuner_tanh_adam.get_best_hyperparameters(num_trials=1)[0]

# Print the optimal combination of hyperparameters for TANH and ADAM
print("Optimal Hyperparameters for TANH and ADAM:")
print(f"Number of Hidden Layers: {best_hps_tanh_adam.get('num_layers')}")
for i in range(best_hps_tanh_adam.get('num_layers')):
    units_key = 'units_' + str(i)
    if units_key in best_hps_tanh_adam:
        print(f"Layer {i + 1} - Units: {best_hps_tanh_adam.get(units_key)}")
print(f"Learning Rate: {best_hps_tanh_adam.get('learning_rate')}")

# Stop when val_loss has not improved by at least 1e-3 for 5 consecutive epochs.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience = 5, verbose = 2, mode = 'auto')

# Build the final model with the best hyperparameters for TANH and ADAM
final_model_tanh_adam = tuner_tanh_adam.hypermodel.build(best_hps_tanh_adam)

# Train the final model with the best hyperparameters for TANH and ADAM
final_model_tanh_adam.fit(x_train, y_train_encoded, validation_data=(x_test, y_test_encoded),
                         callbacks=[monitor, checkpointer, tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)],
                         verbose=1, epochs=1000)

# Load the best weights
final_model_tanh_adam.load_weights("dnn/best_weights6.hdf5")

# Make predictions on the (oversampled) test data
y1_pred_prob_tanh_adam2 = final_model_tanh_adam.predict(x_test)

# Convert predicted probabilities to binary labels.
# This must arg-max this model's probabilities (y1_pred_prob_tanh_adam2); the section 5.3.5
# array y1_pred_prob_tanh_adam belongs to a different model and a different test split.
y1_pred_tanh_adam2 = np.argmax(y1_pred_prob_tanh_adam2, axis=1)

# Convert one-hot encoded test labels to binary labels.
# The matching ground truth is y_test_encoded (oversampled split), not y1_test_encoded.
y1_true_tanh_adam2 = np.argmax(y_test_encoded, axis=1)


In [None]:
# Derive the labels in this cell from the oversampled test split and from the probabilities
# predicted by the section 6.3.3 model, so the report always describes that model on that split.
y1_true_tanh_adam2 = np.argmax(y_test_encoded, axis=1)
y1_pred_tanh_adam2 = np.argmax(y1_pred_prob_tanh_adam2, axis=1)

# Classification Report
classification_rep1_fncc6 = classification_report(y1_true_tanh_adam2, y1_pred_tanh_adam2)
print("Classification Report for FCNN:\n")
print(classification_rep1_fncc6)

# Confusion Matrix (rows = true class, columns = predicted class)
# sns.heatmap alone renders the matrix, matching the k-NN and SVM cells; no separate
# plt.imshow layer is needed underneath it.
confusion1_fncc6 = confusion_matrix(y1_true_tanh_adam2, y1_pred_tanh_adam2)
plt.figure(figsize=(5, 5))
plt.title('FCNN after Oversampling')
sns.heatmap(confusion1_fncc6, annot=True, fmt="d", cmap="Greens")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# Score with the softmax probability of class 1 rather than the arg-maxed labels:
# hard 0/1 labels would give roc_curve a single operating point instead of a curve.
fpr, tpr, thresholds = roc_curve(y1_true_tanh_adam2, y1_pred_prob_tanh_adam2[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# 7.0 Additional Feature: Feature Selection

### 7.1 Lasso Model

In [None]:
# L1-penalised (Lasso) logistic regression, fitted on the training split.
# The magnitude of each coefficient is used below to rank the input features.
lasso_model = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=1.0,
    random_state=84,
).fit(x_train, y_train)

# Coefficient matrix of the fitted model; row 0 is the one read below
feature_weights = lasso_model.coef_

# Wrap the already-fitted model in a selector and apply it to both splits
from sklearn.feature_selection import SelectFromModel
feature_selector = SelectFromModel(lasso_model, prefit=True)
x_train_selected, x_test_selected = (
    feature_selector.transform(split) for split in (x_train, x_test)
)

# Rank features by |coefficient|, largest first, and keep the five strongest
absolute_weights = np.absolute(feature_weights)
sorted_feature_indices = absolute_weights[0].argsort()[::-1]
top_5_feature_indices = sorted_feature_indices[:5]

# Column-slice both splits down to those five features
x_train_top_5_features = x_train[:, top_5_feature_indices]
x_test_top_5_features = x_test[:, top_5_feature_indices]

# Report which columns were chosen and their signed coefficients
print("Selected Top 5 Feature Indices:", top_5_feature_indices)
print("Feature Weights (Coefficients) for Top 5 Features:")
for index in top_5_feature_indices:
    weight = feature_weights[0][index]
    print(f"Feature {index}: {weight:.4f}")


### 7.2 k-NN with top 5 features

In [None]:
from sklearn.metrics import roc_auc_score

# k-NN trained and evaluated on the five features selected by the Lasso model.
k = 5

# Create a k-NN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier on the training data
knn_classifier.fit(x_train_top_5_features, y_train)

# Make predictions on the test data
y_pred_knn3 = knn_classifier.predict(x_test_top_5_features)

# Display classification report for k-NN
# Fixed: print the report computed in this cell (`classification_rep3_knn`); the
# previous code printed `classification_rep2_knn` from an earlier experiment.
classification_rep3_knn = classification_report(y_test, y_pred_knn3)
print("Classification Report for k-NN (5):")
print(classification_rep3_knn)

# Plot confusion matrix for k-NN
confusion3_knn = confusion_matrix(y_test, y_pred_knn3)

plt.figure()
plt.title("k-NN with top 5 features")
sns.heatmap(confusion3_knn, annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Plot ROC curve for k-NN
# Fixed: the AUC is computed from this cell's predictions (`y_pred_knn3`), not `y_pred_knn2`.
# NOTE(review): hard 0/1 predictions give a single-threshold ROC; consider
# knn_classifier.predict_proba(...)[:, 1] as the score instead.
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, y_pred_knn3)
roc_auc_knn = roc_auc_score(y_test, y_pred_knn3)

plt.figure()
# Fixed: the legend shows this model's AUC (`roc_auc_knn`), not `roc_auc` left over
# from the FCNN evaluation cell.
plt.plot(fpr_knn, tpr_knn, label='ROC curve (area = %0.2f)' % roc_auc_knn)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### 7.3 SVM with top 5 features

In [None]:
from sklearn.metrics import roc_auc_score

# Polynomial-kernel SVM trained and evaluated on the five Lasso-selected features.
svm_classifier = SVC(kernel='poly')

svm_classifier.fit(x_train_top_5_features, y_train)

y_pred_svm3 = svm_classifier.predict(x_test_top_5_features)

# Display classification report for SVM
classification_rep3_svm = classification_report(y_test, y_pred_svm3)
print("Classification Report for SVM:")
print(classification_rep3_svm)

# Plot confusion matrix for SVM
confusion3_svm = confusion_matrix(y_test, y_pred_svm3)

plt.figure()
plt.title("SVM with top 5 features")
sns.heatmap(confusion3_svm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Plot ROC curve for SVM
# NOTE(review): hard 0/1 predictions give a single-threshold ROC; consider
# svm_classifier.decision_function(...) as the score instead.
fpr_svm3, tpr_svm3, thresholds_svm3 = roc_curve(y_test, y_pred_svm3)
roc_auc_svm3 = roc_auc_score(y_test, y_pred_svm3)

plt.figure()
# Fixed: the legend shows this model's AUC (`roc_auc_svm3`), not `roc_auc` left over
# from the FCNN evaluation cell.
plt.plot(fpr_svm3, tpr_svm3, label='ROC curve (area = %0.2f)' % roc_auc_svm3)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### 7.4 Fully connected neural network with top 5 features


In [None]:
# Clear any logs from previous runs: recursively delete the whole ./logs/ directory
# (-f makes this a no-op when the directory does not exist yet).
!rm -rf ./logs/

In [None]:
# Open the TensorBoard UI inline, reading runs from logs/fit/
%tensorboard --logdir logs/fit/

In [None]:
# TensorBoard run directory: logs/fit/<YYYYmmdd-HHMMSS>, stamped with the current local time
log_dir = "logs/fit/" + "{:%Y%m%d-%H%M%S}".format(datetime.datetime.now())

# Checkpoint callback for this experiment; save_best_only=True keeps only the best
# epoch's weights in the target file
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath="dnn/best_weights7.hdf5",
    save_best_only=True,
    verbose=0,
)

# Define the model architecture without specifying hyperparameters
def build_model_tanh_adam3(hp):
    """Build and compile a tanh/Adam classifier for the 5-feature inputs.

    The tuner-supplied ``hp`` object decides three things:

    * ``num_layers``    - how many hidden Dense layers to stack (1 to 16),
    * ``units_<i>``     - the width of hidden layer ``i`` (32 to 512, in steps of 32),
    * ``learning_rate`` - the Adam learning rate (one of 1e-2, 1e-3, 1e-4).

    Args:
        hp: hyperparameter container passed in by the tuner; only its ``Int`` and
            ``Choice`` methods are used here.

    Returns:
        A compiled ``tf.keras.Sequential`` model taking inputs of shape ``(5,)`` and
        ending in a 2-unit softmax layer, with categorical cross-entropy loss and an
        accuracy metric.
    """
    network = tf.keras.Sequential()
    network.add(tf.keras.layers.Input(shape=(5,)))

    # Hidden stack: depth and each layer's width are tunable
    depth = hp.Int('num_layers', min_value=1, max_value=16)
    for layer_idx in range(depth):
        width = hp.Int('units_' + str(layer_idx), min_value=32, max_value=512, step=32)
        network.add(Dense(units=width, activation='tanh'))

    # Two-class softmax head
    network.add(Dense(2, activation='softmax'))

    # Adam optimiser with a tunable learning rate
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=step_size),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Bayesian hyperparameter search (depth, layer widths, learning rate) for the
# tanh/Adam network on the top-5-feature inputs.
tuner_tanh_adam = kt.BayesianOptimization(
    build_model_tanh_adam3,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=1,
    overwrite=True,
)

# NOTE(review): the test split is used as validation data for both the search and the
# final fit, so the reported test metrics are optimistically biased.
tuner_tanh_adam.search(x_train_top_5_features, y_train_encoded, epochs=10, validation_data=(x_test_top_5_features, y_test_encoded),
                      callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)])

best_hps_tanh_adam = tuner_tanh_adam.get_best_hyperparameters(num_trials=1)[0]

# Print the optimal combination of hyperparameters for TANH and ADAM
print("Optimal Hyperparameters for TANH and ADAM:")
print(f"Number of Hidden Layers: {best_hps_tanh_adam.get('num_layers')}")
for i in range(best_hps_tanh_adam.get('num_layers')):
    units_key = 'units_' + str(i)
    if units_key in best_hps_tanh_adam:
        print(f"Layer {i + 1} - Units: {best_hps_tanh_adam.get(units_key)}")
print(f"Learning Rate: {best_hps_tanh_adam.get('learning_rate')}")

# Stop training once val_loss has failed to improve by at least 1e-3 for 5 consecutive epochs
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')

# Build the final model with the best hyperparameters for TANH and ADAM
final_model_tanh_adam = tuner_tanh_adam.hypermodel.build(best_hps_tanh_adam)

# Train the final model with the best hyperparameters for TANH and ADAM.
# Fixed: `monitor` was defined but never passed to fit(); it now replaces the ad-hoc
# EarlyStopping instance, matching the callback list of the earlier FCNN training cell.
final_model_tanh_adam.fit(x_train_top_5_features, y_train_encoded, validation_data=(x_test_top_5_features, y_test_encoded),
                         callbacks=[monitor, checkpointer, tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)],
                         verbose=1, epochs=1000)

# Load the best weights
final_model_tanh_adam.load_weights("dnn/best_weights7.hdf5")

# Make predictions on the test data
y1_pred_prob_tanh_adam3 = final_model_tanh_adam.predict(x_test_top_5_features)

# Convert predicted probabilities to binary labels.
# Fixed: use the probabilities produced by THIS model (`..._adam3`), not the stale
# `y1_pred_prob_tanh_adam` left in the kernel by an earlier cell.
y1_pred_tanh_adam3 = np.argmax(y1_pred_prob_tanh_adam3, axis=1)

# Convert one-hot encoded test labels to binary labels.
# Fixed: predictions are made on `x_test_top_5_features`, whose labels are
# `y_test_encoded` (the pair used as validation data above), not `y1_test_encoded`.
y1_true_tanh_adam3 = np.argmax(y_test_encoded, axis=1)


In [None]:
# Evaluate the FCNN trained on the top 5 features, using the label arrays
# (`y1_true_tanh_adam3`, `y1_pred_tanh_adam3`) produced by the training cell.

# Classification Report
classification_rep1_fncc7 = classification_report(y1_true_tanh_adam3, y1_pred_tanh_adam3)
print("Classification Report for FCNN:\n")
print(classification_rep1_fncc7)

# Confusion Matrix
confusion1_fncc7 = confusion_matrix(y1_true_tanh_adam3, y1_pred_tanh_adam3)
plt.figure(figsize=(5, 5))
plt.title('FCNN with top 5 features')
# The matrix is drawn once, by seaborn. The previous plt.imshow() call rendered the same
# matrix underneath the heatmap where it was never visible; square=True keeps the
# equal-aspect cells that imshow used to impose on the axes.
sns.heatmap(confusion1_fncc7, annot=True, fmt="d", cmap="Reds", square=True)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# NOTE(review): the scores passed here are hard 0/1 predictions, so the curve has a single
# operating point. Passing the class-1 probabilities from the model would give a full curve.
fpr, tpr, thresholds = roc_curve(y1_true_tanh_adam3, y1_pred_tanh_adam3)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# 8.0 Additional Feature: *K Means*

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report

# K-means (2 clusters) on the five Lasso-selected features, used as a classifier by
# mapping each cluster to a class label.

# Fit K-means clustering on the training data
kmeans = KMeans(n_clusters=2, random_state=84)
kmeans.fit(x_train_top_5_features)

# Assign every test sample to its nearest learned centroid
test_cluster_labels = kmeans.predict(x_test_top_5_features)

# Calculate cluster centroids
cluster_centers = kmeans.cluster_centers_

# Assign a class label to each cluster: the majority class among its TRAINING members.
# Fixed: the mapping used to be derived from `y_test`, i.e. from the very labels the
# predictions are scored against (label leakage), and it raised on argmax of an empty
# bincount whenever a cluster received no test samples.
# Assumes `y_train` holds non-negative integer class labels, as `y_test` must for bincount.
train_cluster_labels = kmeans.labels_
y_train_array = np.asarray(y_train).ravel()
cluster_labels = []
for cluster_id in range(kmeans.n_clusters):
    cluster_indices = (train_cluster_labels == cluster_id)
    cluster_majority_label = np.argmax(np.bincount(y_train_array[cluster_indices]))
    cluster_labels.append(cluster_majority_label)

# Assign labels to the test data based on the closest centroid
predicted_labels = [cluster_labels[cluster_id] for cluster_id in test_cluster_labels]

# Calculate confusion matrix
confusion_kmeans = confusion_matrix(y_test, predicted_labels)

plt.figure()
plt.title("Confusion Matrix for K-means Clustering")
sns.heatmap(confusion_kmeans, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Calculate classification report for K-means Clustering
classification_rep_kmeans = classification_report(y_test, predicted_labels)
print("Classification Report for K-means Clustering:")
print(classification_rep_kmeans)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report

# K-means (2 clusters) on the full feature set, used as a classifier by mapping each
# cluster to a class label.

# Fit K-means clustering on the training data
kmeans = KMeans(n_clusters=2, random_state=84)
kmeans.fit(x_train)

# Assign every test sample to its nearest learned centroid
test_cluster_labels = kmeans.predict(x_test)

# Calculate cluster centroids
cluster_centers = kmeans.cluster_centers_

# Assign a class label to each cluster: the majority class among its TRAINING members.
# Fixed: the mapping used to be derived from `y_test`, i.e. from the very labels the
# predictions are scored against (label leakage), and it raised on argmax of an empty
# bincount whenever a cluster received no test samples.
# Assumes `y_train` holds non-negative integer class labels, as `y_test` must for bincount.
train_cluster_labels = kmeans.labels_
y_train_array = np.asarray(y_train).ravel()
cluster_labels = []
for cluster_id in range(kmeans.n_clusters):
    cluster_indices = (train_cluster_labels == cluster_id)
    cluster_majority_label = np.argmax(np.bincount(y_train_array[cluster_indices]))
    cluster_labels.append(cluster_majority_label)

# Assign labels to the test data based on the closest centroid
predicted_labels = [cluster_labels[cluster_id] for cluster_id in test_cluster_labels]

# Calculate confusion matrix
confusion_kmeans = confusion_matrix(y_test, predicted_labels)

plt.figure()
plt.title("Confusion Matrix for K-means Clustering")
sns.heatmap(confusion_kmeans, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Calculate classification report for K-means Clustering
classification_rep_kmeans = classification_report(y_test, predicted_labels)
print("Classification Report for K-means Clustering:")
print(classification_rep_kmeans)