In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the CSV into `df`, report its (rows, columns) shape, and preview the
# first five rows as the cell output.
DATA_PATH = "data/sobar-72.csv"
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
df.head(5)

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Summarise `df` with DataFrame.info() (per-column dtype and non-null count).
df.info()

In [None]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

In [None]:
# Pairwise scatter/distribution grid over a hand-picked subset of columns,
# coloured by the `ca_cervix` column.
pairplot_features = [
    'behavior_sexualRisk',
    'intention_aggregation',
    'attitude_consistency',
    'norm_significantPerson',
    'perception_vulnerability',
    'motivation_strength',
    'socialSupport_emotionality',
    'empowerment_knowledge',
]
# The hue column has to be part of the frame handed to seaborn.
cols = [*pairplot_features, 'ca_cervix']
sns.pairplot(data=df[cols], hue='ca_cervix')

In [None]:
# Target is the `ca_cervix` column; the feature frame is every other column.
y = df['ca_cervix']
X = df.drop('ca_cervix', axis='columns')

In [None]:
import sklearn
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mlxtend.plotting import plot_decision_regions

# Split the raw features FIRST so that the scaler and PCA are fitted on the
# training rows only. Fitting them on the whole dataset before splitting (as
# this cell previously did) leaks test-set statistics into the preprocessing
# and makes the reported accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=4
)

scaler = StandardScaler()
pca = PCA(n_components=2)
X_train = pca.fit_transform(scaler.fit_transform(X_train_raw))
# The test rows are only *transformed* with the parameters learned on train.
X_test = pca.transform(scaler.transform(X_test_raw))

# 2-D projection of every row, kept under the name `x` because the later
# dendrogram and Voronoi cells plot the whole dataset from it.
x = pca.transform(scaler.transform(X))

# Compare KNN (k=5) under different distance metrics.
# NOTE(review): KNeighborsClassifier defaults to p=2, so 'minkowski' here is
# the same distance as 'euclidean' unless `p` is also varied.
similarities = ['euclidean', 'manhattan', 'minkowski']
for sim in similarities:
    knn = KNeighborsClassifier(n_neighbors=5, metric=sim)
    knn.fit(X_train, y_train)
    # `y_pred` is left holding the last metric's predictions after the loop;
    # the scatter-plot cell further down reads it.
    y_pred = knn.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("Distance Type:", sim.capitalize())

    # Decision regions are drawn over the training points in PCA space.
    # The labels are passed as a NumPy array rather than a pandas Series.
    plot_decision_regions(X_train, np.array(y_train), clf=knn, legend=2)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('KNN with K=5 using ' + sim.capitalize())
    plt.show()

In [None]:
# Sweep k = 1..9 and record the test accuracy of each fitted classifier.
#
# Two fixes relative to the earlier version of this cell:
#   * predict() is now called for every k. Previously the loop scored the
#     stale `y_pred` left over from the previous cell, so every entry was the
#     same number and the curve was flat.
#   * results go into `k_values` / `accuracies` instead of `X` / `Y`, so the
#     feature DataFrame `X` is no longer overwritten by a list (which broke
#     re-running the preprocessing cell).
k_values = list(range(1, 10))
accuracies = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Separate name on purpose: `y_pred` from the K=5 cell is still read by
    # the scatter-plot cell below and must not be clobbered here.
    k_pred = knn.predict(X_test)
    accuracies.append(metrics.accuracy_score(y_test, k_pred))

plt.figure(figsize=(4,2))
plt.plot(k_values, accuracies)
plt.title('K Value vs Accuracy')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
accuracies

In [None]:
# Scatter the held-out points in the 2-D projected space, coloured by the
# predictions currently stored in `y_pred`.
ax = plt.gca()
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
ax.grid()
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Encode the class labels as integers 0..n_classes-1 for use as leaf labels.
# `return_inverse` gives, for every row, the index of its label within the
# sorted unique labels -- the same codes the hand-built dict mapping produced.
unique_labels, new_y = np.unique(y, return_inverse=True)

# Single-linkage agglomerative clustering of the projected points in `x`.
linked = linkage(x, 'single')

plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           distance_sort='descending',
           labels=new_y,
           show_leaf_counts=True)
# The title used to read "Dendrogram for KNN", but this figure is a
# hierarchical clustering and is unrelated to the KNN classifier above.
plt.title('Dendrogram (Single-Linkage Hierarchical Clustering)')
plt.xlabel('Data Points')
plt.ylabel('Distance')
plt.show()

In [None]:
from scipy.spatial import Voronoi, voronoi_plot_2d

# Voronoi tessellation of the 2-D points in `x`, with the points themselves
# over-plotted and coloured by the target `y`.
vor = Voronoi(x)

voronoi_style = dict(
    show_vertices=False,
    line_colors='black',
    line_width=2,
    line_alpha=0.6,
    point_size=10,
)
fig = voronoi_plot_2d(vor, **voronoi_style)

# Draw on the axes of the figure that voronoi_plot_2d returned.
ax = fig.gca()
ax.scatter(x[:, 0], x[:, 1], c=y, cmap='viridis', s=50, edgecolors='black')  # Color points by target variable
ax.set_title('Voronoi Diagram with Target Variable')
ax.set_xlabel('X')
ax.set_ylabel('Y')
plt.show()