In [322]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn_extra.cluster import KMedoids

# Load data from CSV file
data = pd.read_csv('Data_Process.csv', sep=';')
# Exclude the first data row

# Replace commas (,) with periods (.) and convert to float
data['Peng Sem 1'] = data['Peng Sem 1'].str.replace(',', '.').astype(float)
data['Peng Sem 2'] = data['Peng Sem 2'].str.replace(',', '.').astype(float)
data['Ket Sem 1'] = data['Ket Sem 1'].str.replace(',', '.').astype(float)
data['Ket Sem 2'] = data['Ket Sem 2'].str.replace(',', '.').astype(float)

# Extract attributes from the data
attributes = ['Nama', 'Peng Sem 1', 'Ket Sem 1', 'Peng Sem 2', 'Ket Sem 2']
X = data[attributes].values
# Perform pairwise distance calculation
distances = pairwise_distances(X[:, 1:], metric='euclidean')

distances_without_index_0 = pairwise_distances(X[1:, 1:], metric='euclidean')
# Function to calculate the total dissimilarity for a given medoid index
def total_dissimilarity(index, medoids):
    cluster_indices = np.where(medoids)[0]
    cluster_points = X[cluster_indices]
    cluster_distances = distances[cluster_indices][:, cluster_indices]
    return sum(cluster_distances)

# Function to find the best medoid with the lowest dissimilarity
def find_best_medoid(cluster_points, cluster_indices):
    best_medoid = None
    best_dissimilarity = float('inf')
    for i in range(len(cluster_points)):
        dissimilarity = total_dissimilarity(i, cluster_indices)
        if np.all(dissimilarity < best_dissimilarity):
            best_medoid = cluster_indices[i]  # Update with cluster index
            best_dissimilarity = dissimilarity
    return best_medoid, best_dissimilarity

# Perform K-Medoids clustering
k = 3  # Number of clusters
medoids_indices = KMedoids(n_clusters=k, random_state=0).fit_predict(distances)

silhouette_avg = silhouette_score(X[:, 1:], medoids_indices)
print("Silhouette Coefficient:", silhouette_avg)

# Find the best medoid for each cluster
medoids = []
for cluster_id in range(k):
    cluster_indices = np.where(medoids_indices == cluster_id)[0]
    cluster_points = X[cluster_indices]
    medoid_index, _ = find_best_medoid(cluster_points, cluster_indices)
    medoids.append(medoid_index)

print(medoids)
# Retrieve all attributes from data
all_attributes = X

# Create a new DataFrame with cluster assignments
cluster_data = pd.DataFrame(all_attributes, columns=attributes)
cluster_data['Cluster'] = medoids_indices + 1

# Divide clusters
cluster_counts = cluster_data['Cluster'].value_counts()

# Divide cluster 1 into a, b, c
cluster_1_data = cluster_data[cluster_data['Cluster'] == 1]
cluster_1_divided = np.array_split(cluster_1_data, 3)
a, b, c = cluster_1_divided

# Add class labels for cluster 1 divisions
a['Kelas Hasil'] = '7A'
b['Kelas Hasil'] = '7B'
c['Kelas Hasil'] = '7C'

# Divide cluster 2 into d, e, f
cluster_2_data = cluster_data[cluster_data['Cluster'] == 2]
cluster_2_divided = np.array_split(cluster_2_data, 3)
d, e, f = cluster_2_divided

# Add class labels for cluster 2 divisions
d['Kelas Hasil'] = '7D'
e['Kelas Hasil'] = '7E'
f['Kelas Hasil'] = '7F'

# Divide cluster 3 into g, h
cluster_3_data = cluster_data[cluster_data['Cluster'] == 3]
cluster_3_divided = np.array_split(cluster_3_data, 2)
g, h = cluster_3_divided

# Add class labels for cluster 3 divisions
g['Kelas Hasil'] = '7G'
h['Kelas Hasil'] = '7H'

# Save the divided clusters to a single CSV file
output_data = pd.concat([a, b, c, d, e, f, g, h], axis=0)

output_data = output_data.sort_index()
data['Kelas Hasil'] = output_data['Kelas Hasil']
data['Cluster'] = output_data['Cluster']

Silhouette Coefficient: 0.37059024420357656
[0, 4, 1]


In [326]:
import numpy as np
import pandas as pd

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def k_medoids(X, k, max_iterations=100):
    n_samples, n_features = X.shape

    # Randomly initialize k medoids
    medoids_indices = np.random.choice(n_samples, k, replace=False)
    medoids = X[medoids_indices]
    medoids2 = medoids.copy()  # Initialize medoids2 with the same values as medoids

    for _ in range(max_iterations):
        # Step 2: Assign points to the nearest medoids
        clusters = [[] for _ in range(k)]
        for i, x in enumerate(X):
            distances = [euclidean_distance(x, medoid) for medoid in medoids]
            cluster_idx = np.argmin(distances)
            clusters[cluster_idx].append(i)

        # Step 3: Update medoids
        new_medoids = medoids.copy()
        new_medoids2 = medoids2.copy()  # Create a new set of medoids

        for i, cluster in enumerate(clusters):
            cluster_points = X[cluster]

            # Randomly select a new medoid index from the cluster
            new_medoid_idx = np.random.choice(cluster)

            # Randomly select a new medoid index from the cluster for medoids2
            new_medoid_idx_2 = np.random.choice(cluster)

            # Update both sets of medoids
            new_medoids[i] = X[new_medoid_idx]
            new_medoids2[i] = X[new_medoid_idx_2]

        # Check if medoids2 is the same as medoids, and recompute if necessary
        if np.all(new_medoids2 == new_medoids):
            continue

        medoids = new_medoids
        medoids2 = new_medoids2

    return medoids, medoids2, clusters

# Load data from CSV file
data = pd.read_csv('Data_Process.csv', sep=';')

# Replace commas (,) with periods (.) and convert to float
data['Peng Sem 1'] = data['Peng Sem 1'].str.replace(',', '.').astype(float)
data['Peng Sem 2'] = data['Peng Sem 2'].str.replace(',', '.').astype(float)
data['Ket Sem 1'] = data['Ket Sem 1'].str.replace(',', '.').astype(float)
data['Ket Sem 2'] = data['Ket Sem 2'].str.replace(',', '.').astype(float)

# Extract attributes from the data
attributes = ['Nama', 'Peng Sem 1', 'Ket Sem 1', 'Peng Sem 2', 'Ket Sem 2']
X = data[attributes].values

# Perform pairwise distance calculation using the k_medoids function
k = 3  # Set the number of clusters (you can change this value as needed)
max_iterations = 2
medoids, medoids2, clusters = k_medoids(X[:, 1:], k, max_iterations)  # Exclude the first column ('Nama') before clustering

print("Medoids:")
print(medoids)
print("\n")
print("Medoids 2 : ")
print(medoids2)
print("\nClusters:")
for i, cluster in enumerate(clusters):
    print(f"Cluster {i+1}: {cluster}")

# Print the counts from each cluster
print("\nCluster Counts:")
for i, count in enumerate(cluster_counts):
    print(f"Cluster {i+1}: {count} data points")

# Calculate and print the total Euclidean distance for each medoid
print("\nTotal Euclidean Distance from Each Medoid:")
for i, medoid in enumerate(medoids):
    cluster_points = X[clusters[i]][:, 1:]  # Exclude the first column ('Nama')
    total_distance = 0
    for point in cluster_points:
        total_distance += euclidean_distance(point, medoid)
    print(f"Medoid {i+1}: {total_distance}")

Medoids:
[[78.8 77.5 82.0 81.9]
 [75.8 75.6 78.5 77.5]
 [78.3 79.8 80.6 80.3]]


Medoids 2 : 
[[79.7 78.8 83.8 83.9]
 [76.9 77.0 78.9 79.1]
 [81.2 80.1 81.2 80.1]]

Clusters:
Cluster 1: [0, 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 17, 18, 19, 20, 21, 25, 27, 28, 29, 30, 31, 34, 35, 36, 37, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 56, 57, 58, 60, 61, 65, 67, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 90, 91, 92, 93, 94, 98, 100, 104, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 163, 164, 166, 167, 168, 169, 172, 174, 175, 176, 178, 179, 180, 182, 183, 185, 187, 188, 189, 190, 194, 203, 205, 209, 210, 213, 215, 216, 218, 219, 221, 224, 230, 232, 233, 235, 245, 246]
Cluster 2: [1, 12, 16, 22, 23, 32, 33, 46, 54, 55, 59, 62, 63, 64, 68, 69, 70, 73, 85, 89, 123, 124, 126, 160, 165, 170, 171, 173, 177, 181, 184, 186, 191, 197, 199, 207, 220, 