In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from mlxtend.plotting import plot_decision_regions
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import Voronoi, voronoi_plot_2d

In [None]:
# Load the dataset from a CSV file; the path is relative to the notebook's
# working directory.
df = pd.read_csv("data/sobar-72.csv")

# Preview the first rows with the notebook's rich table rendering.
display(df.head())

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in display() would add a stray "None" to the output.
df.info()

In [None]:
# Columns shown in the pair plot: eight feature columns plus the target
# column 'ca_cervix', which is also used to colour the points.
cols = [
    'behavior_sexualRisk',
    'intention_aggregation',
    'attitude_consistency',
    'norm_significantPerson',
    'perception_vulnerability',
    'motivation_strength',
    'socialSupport_emotionality',
    'empowerment_knowledge',
    'ca_cervix',
]

# Pairwise scatter plots of the selected columns, coloured by 'ca_cervix'.
pairplot_data = df[cols]
sns.pairplot(pairplot_data, hue='ca_cervix')
plt.show()

In [None]:
# Separate the target column from the remaining columns.
# y: the 'ca_cervix' column; X: every other column of df (df itself is not modified).
y = df['ca_cervix']
X = df.drop('ca_cervix', axis=1)

In [None]:
# Split the raw rows FIRST, then fit the scaler and the PCA on the training rows
# only. Fitting them on the full dataset before splitting would let test rows
# influence the transformation (data leakage) and bias the test accuracy.
# test_size and random_state are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=4
)

# Learn the standardisation and the 2-component projection from training data.
scaler = StandardScaler().fit(X_train_raw)
pca = PCA(n_components=2).fit(scaler.transform(X_train_raw))

# Apply the training-fitted transforms to each split.
X_train = pca.transform(scaler.transform(X_train_raw))
X_test = pca.transform(scaler.transform(X_test_raw))

# Projection of the full dataset with the same fitted transforms; kept under the
# original names because later cells read X_pca.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

In [None]:
# Draw the K=5 KNN decision regions on the training data once per distance metric.
# NOTE(review): with scikit-learn's default p=2, 'minkowski' is the same distance
# as 'euclidean', so the first and third plots are expected to be identical.
similarities = ['euclidean', 'manhattan', 'minkowski']
for sim in similarities:
    knn = KNeighborsClassifier(n_neighbors=5, metric=sim)
    knn.fit(X_train, y_train)

    # No test-set prediction is made here: the loop only visualises the fitted
    # decision regions, and a prediction result was never read in this cell.
    plot_decision_regions(X_train, np.array(y_train), clf=knn, legend=2)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title(f'KNN with K=5 using {sim.capitalize()} Distance')
    plt.show()

In [None]:
# Sweep the neighbourhood size K over 1..9 and record the test accuracy of each
# fitted classifier.
k_values = list(range(1, 10))
accuracies = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    # After the loop, y_pred holds the predictions of the last model (K=9).
    y_pred = knn.predict(X_test)
    accuracies.append(metrics.accuracy_score(y_test, y_pred))

# Accuracy as a function of K.
fig, ax = plt.subplots(figsize=(4, 2))
ax.plot(k_values, accuracies, marker='o')
ax.set_title('K Value vs Accuracy')
ax.set_xlabel('K')
ax.set_ylabel('Accuracy')
ax.grid()
plt.show()

In [None]:
# Fit and predict explicitly in this cell instead of reusing whatever `y_pred`
# an earlier loop happened to leave behind. K=9 is the last value of the
# preceding K sweep, i.e. the model whose predictions this plot showed before.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Test points in the 2-D projected space, coloured by predicted class.
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap='coolwarm', edgecolors='k')
plt.title('Test Data Predictions')
plt.grid()
plt.show()

In [None]:
# Encode every target value as its position among the sorted unique labels;
# the encoded values are used as leaf labels in the dendrogram.
unique_labels, y_mapped = np.unique(y, return_inverse=True)
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))

# Single-linkage hierarchical clustering of the 2-D PCA projection.
linked = linkage(X_pca, method='single')

# NOTE(review): the title mentions KNN, but this cell performs hierarchical
# clustering and does not use the KNN classifier — consider renaming the title.
plt.figure(figsize=(10, 7))
dendrogram(
    linked,
    labels=y_mapped,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.title('Dendrogram for KNN')
plt.xlabel('Data Points')
plt.ylabel('Distance')
plt.show()

In [None]:
# Voronoi tessellation of the 2-D PCA projection, with the data points drawn on
# top and coloured by the target variable.
vor = Voronoi(X_pca)

fig, ax = plt.subplots()
voronoi_plot_2d(
    vor,
    ax=ax,
    show_vertices=False,
    line_colors='black',
    line_width=2,
    line_alpha=0.6,
    point_size=10,
)

pc_x, pc_y = X_pca[:, 0], X_pca[:, 1]
ax.scatter(pc_x, pc_y, c=y, cmap='viridis', s=50, edgecolors='k')
ax.set_title('Voronoi Diagram with Target Variable')
ax.set_xlabel('X')
ax.set_ylabel('Y')
plt.show()