# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, silhouette_score
from sklearn.metrics import v_measure_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset from disk << comment out if the data is supplied another way >>
data = pd.read_csv('data.csv')

# Separate the table into the feature matrix (every column except the last)
# and the label vector (the last column only) -- double-check the column layout
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

# Candidate k values to iterate over: 1 through 40 inclusive
k_values = [*range(1, 41)]

# One empty list per validation score; they are filled per k and compared
# later on to find the best k (the F1 list is perhaps not to be considered)
cv_scores, v_scores, f1_scores, silhouette_scores = [], [], [], []

# OPTIONAL: Within-Cluster Sum of Squares (WCSS) values for Elbow Point Validation
wcss_values = []

# Iterating over the k values: for each k, fit K-Means (k clusters) and a
# k-nearest-neighbours classifier (k neighbours), then record one value per
# validation score so the curves can be compared afterwards.

# f1_score defaults to average='binary', which raises on multi-class targets.
# Keep that default for two-class labels and fall back to macro-averaging
# otherwise so the loop also runs on multi-class data.
f1_average = 'binary' if len(np.unique(y)) == 2 else 'macro'

for k in k_values:

    # Unsupervised part: K-Means with a fixed seed so reruns are reproducible
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    y_pred = kmeans.predict(X)

    # Supervised part: k-nearest neighbours classifier, scored on the same
    # samples it was fitted on (training-set predictions)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    y_pred_knn = knn.predict(X)

    # Calculating the validation scores. Notice that the number of folds can
    # be increased if required.
    # NOTE(review): cross_val_score uses KMeans.score here (the negated
    # K-Means objective); y is passed through but K-Means ignores it --
    # confirm this is the intended selection criterion.
    cv_score = np.mean(cross_val_score(kmeans, X, y, cv=7))

    # V-measure compares the clustering with the true labels and is invariant
    # to how cluster ids are numbered; an element-wise `y_pred == y` is not,
    # because K-Means cluster ids are arbitrary.
    v_score = v_measure_score(y, y_pred)

    f1 = f1_score(y, y_pred_knn, average=f1_average)

    # The silhouette coefficient is only defined for 2..n_samples-1 distinct
    # labels; k=1 (a single cluster) would raise ValueError, so record NaN
    # there, which matplotlib simply leaves as a gap in the curve.
    n_found = len(np.unique(y_pred))
    if 1 < n_found < len(X):
        silhouette = silhouette_score(X, y_pred)
    else:
        silhouette = np.nan

    # Appending the scores to the lists created before the loop
    cv_scores.append(cv_score)
    v_scores.append(v_score)
    f1_scores.append(f1)
    silhouette_scores.append(silhouette)

    # ================ Optional: Elbow Point Validation (disabled) ================
    # Elbow Point validation takes the instance where the pivot point of the
    # K value versus validation is above a certain value.
    # To enable it, uncomment the two lines below and go to Part 2 below.
    # wcss = kmeans.inertia_      # Calculate the WCSS value
    # wcss_values.append(wcss)    # Add the WCSS value to the list

# Draw every validation curve against k on one shared set of axes
score_curves = (
    (cv_scores, 'Cross Validation'),
    (v_scores, 'V-Score'),
    (f1_scores, 'F1 Score'),
    (silhouette_scores, 'Silhouette Score'),
)
for curve, curve_label in score_curves:
    plt.plot(k_values, curve, label=curve_label)

# Decorate the figure and display it
plt.title('Validation Scores for k-Nearest Neighbor Clustering')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Validation Score')
plt.legend()
plt.show()

# Part 2: Elbow Point Validation and plotting
# NOTE: this whole section is disabled by wrapping it in a bare string literal,
# so none of it executes. To use it, remove the surrounding triple quotes AND
# enable the WCSS collection inside the loop above: the code reads wcss_values,
# and with an empty list the diff_wcss_values[-1] lookup would raise IndexError.
# What it does: takes the differences between consecutive WCSS values, divides
# them by the last difference, prepends a 0 so the array lines up with
# k_values, and reports the k at the minimum of that array as the elbow point.
""" # Find the best value of k using elbow point validation
diff_wcss_values = np.diff(wcss_values)
diff_wcss_values = diff_wcss_values / diff_wcss_values[-1]
diff_wcss_values = np.insert(diff_wcss_values, 0, 0)
elbow_point = k_values[np.argmin(diff_wcss_values)]

# Plotting the elbow point separately
plt.plot(k_values, diff_wcss_values, 'bx-')
plt.xlabel('k')
plt.ylabel('Difference in WCSS')
plt.title('Elbow Method')
plt.show()

"""
# Finding the best k value: the k whose mean cross-validation score is highest
# (np.argmax returns the first position on ties).
# NOTE(review): cv_scores holds cross-validated KMeans.score values (negated
# K-Means objective), which tend to rise as k grows -- confirm that argmax is
# the intended selection rule rather than, e.g., the silhouette curve.
best_k = k_values[np.argmax(cv_scores)]
print(f'The best k value is {best_k}.')

# Refit K-Means with the chosen k and plot the resulting clusters in
# different colors. The model is fitted on the feature matrix X only: fitting
# on `data` would feed the label column into the clustering, unlike the
# models compared in the loop. The seed matches the loop so the plot is
# reproducible.
kmeans = KMeans(n_clusters=best_k, random_state=42)
kmeans.fit(X)

# Scatter the first two columns of the dataset, colored by assigned cluster
plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=kmeans.labels_, cmap='rainbow')
plt.show()

# TODO: evaluate the performance of the final clustering