In [464]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn_extra.cluster import KMedoids

# Load data from CSV file
data = pd.read_csv('Data_Process.csv', sep=';')
# Exclude the first data row

# Replace commas (,) with periods (.) and convert to float
data['Peng Sem 1'] = data['Peng Sem 1'].str.replace(',', '.').astype(float)
data['Peng Sem 2'] = data['Peng Sem 2'].str.replace(',', '.').astype(float)
data['Ket Sem 1'] = data['Ket Sem 1'].str.replace(',', '.').astype(float)
data['Ket Sem 2'] = data['Ket Sem 2'].str.replace(',', '.').astype(float)

# Extract attributes from the data
attributes = ['Nama', 'Peng Sem 1', 'Ket Sem 1', 'Peng Sem 2', 'Ket Sem 2']
X = data[attributes].values
# Perform pairwise distance calculation
distances = pairwise_distances(X[:, 1:], metric='euclidean')

# Calculate distances from medoids_1
distances_from_medoids_2 = pairwise_distances(X[medoids_1, 1:], X[:, 1:], metric='euclidean')

# Calculate distances from medoids_2
distances_from_medoids_1 = pairwise_distances(X[medoids_2, 1:], X[:, 1:], metric='euclidean')
min_distances_medoids_1 = np.min(distances_from_medoids_1, axis=0)
total_min_distances_medoids_1 = np.sum(min_distances_medoids_1)

min_distances_medoids_2 = np.min(distances_from_medoids_2, axis=0)
total_min_distances_medoids_2 = np.sum(min_distances_medoids_2)

# Function to calculate the total dissimilarity for a given medoid index
def total_dissimilarity(index, medoids):
    cluster_indices = np.where(medoids)[0]
    cluster_points = X[cluster_indices]
    cluster_distances = distances[cluster_indices][:, cluster_indices]
    return sum(cluster_distances)

# Function to find the best medoid with the lowest dissimilarity
def find_best_medoid(cluster_points, cluster_indices):
    best_medoid = None
    best_dissimilarity = float('inf')
    for i in range(len(cluster_points)):
        dissimilarity = total_dissimilarity(i, cluster_indices)
        if np.all(dissimilarity < best_dissimilarity):
            best_medoid = cluster_indices[i]  # Update with cluster index
            best_dissimilarity = dissimilarity
    return best_medoid, best_dissimilarity

def find_best_medoid_2(cluster_points, cluster_indices, medoids_1):
    best_medoid = None
    best_dissimilarity = float('inf')
    for i in range(len(cluster_points)):
        dissimilarity = total_dissimilarity(i, cluster_indices)
        # Check if the new medoid index is not in medoids_1
        if i not in medoids_1 and np.all(dissimilarity < best_dissimilarity):
            best_medoid = cluster_indices[i]  # Update with cluster index
            best_dissimilarity = dissimilarity
    return best_medoid, best_dissimilarity

# Perform K-Medoids clustering
k = 3  # Number of clusters
medoids_indices = KMedoids(n_clusters=k, random_state=0).fit_predict(distances)

print("Medoids 1 : ",total_min_distances_medoids_1)
print("Medoids 2 : ", total_min_distances_medoids_2)
print("Minus Total Distances : ", total_min_distances_medoids_2 - total_min_distances_medoids_1)
silhouette_avg = silhouette_score(X[:, 1:], medoids_indices)
print("Silhouette Coefficient:", silhouette_avg)

# Find the best medoid for each cluster
medoids = []
medoids_2 = []
for cluster_id in range(k):
    cluster_indices = np.where(medoids_indices == cluster_id)[0]
    cluster_points = X[cluster_indices]
    medoid_index, _ = find_best_medoid(cluster_points, cluster_indices)
    medoids.append(medoid_index)

# Find the best medoid for each cluster (updated)
for cluster_id in range(k):
    cluster_indices = np.where(medoids_indices == cluster_id)[0]
    cluster_points = X[cluster_indices]
    medoid_index, _ = find_best_medoid_2(cluster_points, cluster_indices, medoids)
    medoids_2.append(medoid_index)
print("Medoids 2 : ",medoids)
print("Medoids 1 : ",medoids_2)
# Retrieve all attributes from data
all_attributes = X

# Create a new DataFrame with cluster assignments
cluster_data = pd.DataFrame(all_attributes, columns=attributes)
cluster_data['Cluster'] = medoids_indices + 1

# Divide clusters
cluster_counts = cluster_data['Cluster'].value_counts()

# Divide cluster 1 into a, b, c
cluster_1_data = cluster_data[cluster_data['Cluster'] == 1]
cluster_1_divided = np.array_split(cluster_1_data, 3)
a, b, c = cluster_1_divided

# Add class labels for cluster 1 divisions
a['Kelas Hasil'] = '7A'
b['Kelas Hasil'] = '7B'
c['Kelas Hasil'] = '7C'

# Divide cluster 2 into d, e, f
cluster_2_data = cluster_data[cluster_data['Cluster'] == 2]
cluster_2_divided = np.array_split(cluster_2_data, 3)
d, e, f = cluster_2_divided

# Add class labels for cluster 2 divisions
d['Kelas Hasil'] = '7D'
e['Kelas Hasil'] = '7E'
f['Kelas Hasil'] = '7F'

# Divide cluster 3 into g, h
cluster_3_data = cluster_data[cluster_data['Cluster'] == 3]
cluster_3_divided = np.array_split(cluster_3_data, 2)
g, h = cluster_3_divided

# Add class labels for cluster 3 divisions
g['Kelas Hasil'] = '7G'
h['Kelas Hasil'] = '7H'

# Save the divided clusters to a single CSV file
output_data = pd.concat([a, b, c, d, e, f, g, h], axis=0)

output_data = output_data.sort_index()
data['Kelas Hasil'] = output_data['Kelas Hasil']
data['Cluster'] = output_data['Cluster']

Medoids 1 :  839.288062462602
Medoids 2 :  841.3857410151593
Minus Total Distances :  2.097678552557227
Silhouette Coefficient: 0.37059024420357656
Medoids 2 :  [0, 4, 1]
Medoids 1 :  [3, 9, 11]
