In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn_extra.cluster import KMedoids

# Load data from CSV file
data = pd.read_csv('Data_Process.csv', sep=';')
# Exclude the first data row

# Replace commas (,) with periods (.) and convert to float
data['Peng Sem 1'] = data['Peng Sem 1'].str.replace(',', '.').astype(float)
data['Peng Sem 2'] = data['Peng Sem 2'].str.replace(',', '.').astype(float)
data['Ket Sem 1'] = data['Ket Sem 1'].str.replace(',', '.').astype(float)
data['Ket Sem 2'] = data['Ket Sem 2'].str.replace(',', '.').astype(float)

# Extract attributes from the data
attributes = ['Nama', 'Peng Sem 1', 'Ket Sem 1', 'Peng Sem 2', 'Ket Sem 2']
X = data[attributes].values
# Perform pairwise distance calculation
distances = pairwise_distances(X[:, 1:], metric='euclidean')

print(distances)
# Function to calculate the total dissimilarity for a given medoid index
def total_dissimilarity(index, medoids):
    cluster_indices = np.where(medoids)[0]
    cluster_points = X[cluster_indices]
    cluster_distances = distances[cluster_indices][:, cluster_indices]
    return sum(cluster_distances)

# Function to find the best medoid with the lowest dissimilarity
def find_best_medoid(cluster_points, cluster_indices):
    best_medoid = None
    best_dissimilarity = float('inf')
    for i in range(len(cluster_points)):
        dissimilarity = total_dissimilarity(i, cluster_indices)
        print(dissimilarity)
        if np.all(dissimilarity < best_dissimilarity):
            best_medoid = cluster_indices[i]  # Update with cluster index
            best_dissimilarity = dissimilarity
            print(best_dissimilarity)
    return best_medoid, best_dissimilarity

# Perform K-Medoids clustering
k = 3  # Number of clusters
medoids_indices = KMedoids(n_clusters=k, random_state=0).fit_predict(distances)

silhouette_avg = silhouette_score(distances, medoids_indices)
print("Silhouette Coefficient:", silhouette_avg)

# Find the best medoid for each cluster
medoids = []
for cluster_id in range(k):
    cluster_indices = np.where(medoids_indices == cluster_id)[0]
    cluster_points = X[cluster_indices]
    medoid_index, _ = find_best_medoid(cluster_points, cluster_indices)
    medoids.append(medoid_index)

# Retrieve all attributes from data
all_attributes = X

# Create a new DataFrame with cluster assignments
cluster_data = pd.DataFrame(all_attributes, columns=attributes)
cluster_data['Cluster'] = medoids_indices + 1

# Divide clusters
cluster_counts = cluster_data['Cluster'].value_counts()

# Divide cluster 1 into a, b, c
cluster_1_data = cluster_data[cluster_data['Cluster'] == 1]
cluster_1_divided = np.array_split(cluster_1_data, 3)
a, b, c = cluster_1_divided

# Add class labels for cluster 1 divisions
a['Kelas Hasil'] = '7A'
b['Kelas Hasil'] = '7B'
c['Kelas Hasil'] = '7C'

# Divide cluster 2 into d, e, f
cluster_2_data = cluster_data[cluster_data['Cluster'] == 2]
cluster_2_divided = np.array_split(cluster_2_data, 3)
d, e, f = cluster_2_divided

# Add class labels for cluster 2 divisions
d['Kelas Hasil'] = '7D'
e['Kelas Hasil'] = '7E'
f['Kelas Hasil'] = '7F'

# Divide cluster 3 into g, h
cluster_3_data = cluster_data[cluster_data['Cluster'] == 3]
cluster_3_divided = np.array_split(cluster_3_data, 2)
g, h = cluster_3_divided

# Add class labels for cluster 3 divisions
g['Kelas Hasil'] = '7G'
h['Kelas Hasil'] = '7H'

# Save the divided clusters to a single CSV file
output_data = pd.concat([a, b, c, d, e, f, g, h], axis=0)

output_data = output_data.sort_index()
data['Kelas Hasil'] = output_data['Kelas Hasil']
data['Cluster'] = output_data['Cluster']

[[ 0.          9.96343314  2.61342687 ...  7.22218803  2.41660919
   1.        ]
 [ 9.96343314  0.         12.21065109 ...  3.60970913  9.34184136
   9.96343314]
 [ 2.61342687 12.21065109  0.         ...  9.31396801  3.24807635
   2.26936114]
 ...
 [ 7.22218803  3.60970913  9.31396801 ...  0.          6.26737585
   7.15821207]
 [ 2.41660919  9.34184136  3.24807635 ...  6.26737585  0.
   1.66132477]
 [ 1.          9.96343314  2.26936114 ...  7.15821207  1.66132477
   0.        ]]
Silhouette Coefficient: 0.3884462022552461
[ 618.04036571  566.29632042  538.53398798  389.29748318  522.73604152
  406.30123427 1015.58395942  801.95264789  402.66147707  503.113858
  523.37886153  578.18695864  429.47375705  648.84395431  436.85003091
  604.75951006  537.54842424  406.94136091 1088.3730004   414.70416836
  379.92334873  474.81566681  618.93833832  466.81131378  637.7366411
  469.15762657  421.74009266  493.90793673  384.92525991  935.0594304
  383.32915399  507.13757471  741.75929627  393.708