In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
# Read the raw election results CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='datapemilukpu-dki-2009.csv')



  and should_run_async(code)


In [61]:
# Data Preprocessing: force both vote-count columns to a numeric dtype.
# errors='coerce' turns entries that cannot be parsed into NaN instead of raising.
for vote_column in ('Suara Sah Partai', 'Suara Sah Caleg'):
    data[vote_column] = pd.to_numeric(data[vote_column], errors='coerce')



  and should_run_async(code)


In [62]:
# Exploratory Data Analysis: build the summary-statistics table, then print it
summary_stats = data.describe()
print(summary_stats)



                no  No Urut Parpol  Suara Sah Partai  Jumlah Perolehan Kursi  \
count  2268.000000     2268.000000       2262.000000             2268.000000   
mean   1134.500000       18.888007      34897.528736                4.510141   
std     654.859527       11.407895      63700.110360                7.774240   
min       1.000000        1.000000        207.000000                0.000000   
25%     567.750000        8.000000       2530.000000                0.000000   
50%    1134.500000       20.000000       5826.000000                0.000000   
75%    1701.250000       28.000000      35464.000000                6.000000   
max    2268.000000       44.000000     338396.000000               32.000000   

       No Urut Caleg  Suara Sah Caleg  
count    2268.000000      2253.000000  
mean        8.897707       936.880160  
std         6.916168      2475.027225  
min         1.000000         0.000000  
25%         4.000000        67.000000  
50%         7.000000       207.000000  

  and should_run_async(code)


In [63]:
# Features used for clustering: the party vote column and the candidate vote column
features = [
    'Suara Sah Partai',
    'Suara Sah Caleg',
]
X = data.loc[:, features]


  and should_run_async(code)


In [64]:
# Impute rather than drop: every missing entry is replaced by the mean of its
# own column, so X keeps one row per row of the original data
column_means = X.mean()
X = X.fillna(value=column_means)

  and should_run_async(code)


In [65]:
# Standardize the features: learn the scaling parameters from X, then apply
# them to the same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

  and should_run_async(code)


In [66]:
# Function to compute WCSS (Within-Cluster Sum of Squares)
def compute_wcss(data, max_clusters=10, random_state=42):
    """Fit K-means for k = 1..max_clusters and collect each model's inertia.

    Note: the ``data`` parameter shadows the notebook-level ``data`` DataFrame
    inside this function; only the argument passed in is used.

    Parameters
    ----------
    data : array-like
        Samples to cluster, passed unchanged to ``KMeans.fit``.
    max_clusters : int, default 10
        Largest number of clusters to try; the sweep covers 1..max_clusters
        inclusive. The default reproduces the original fixed 1..10 sweep.
    random_state : int or None, default 42
        Seed forwarded to every ``KMeans`` instance.

    Returns
    -------
    list
        The ``inertia_`` (WCSS) of each fitted model, ordered by increasing
        number of clusters, so ``result[k - 1]`` belongs to ``k`` clusters.

    Raises
    ------
    ValueError
        If ``max_clusters`` is smaller than 1.
    """
    if max_clusters < 1:
        raise ValueError("max_clusters must be at least 1")
    # A fresh estimator per k keeps the fits independent of each other
    return [
        KMeans(n_clusters=k, random_state=random_state).fit(data).inertia_
        for k in range(1, max_clusters + 1)
    ]

  and should_run_async(code)


In [67]:
# Run the elbow sweep on the standardized features; this call returns one
# WCSS value per cluster count from 1 to 10, in that order
wcss = compute_wcss(X_scaled)

  and should_run_async(code)


In [68]:
# Plot the Elbow Curve and save it to disk.
# The x-axis is derived from len(wcss) so it always matches the number of
# fitted models instead of repeating the sweep range by hand.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(wcss) + 1), wcss, marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
fig.savefig('elbow_curve.png')
# Close this specific figure once it has been written to the file
plt.close(fig)

  and should_run_async(code)


In [69]:
# Number of clusters used for the final K-means model.
# NOTE(review): 3 is hard-coded ("let's say 3 for this example"); presumably it
# was read off the saved elbow curve - confirm against elbow_curve.png.
n_clusters = 3

  and should_run_async(code)


In [70]:
# Apply K-means: fit the final model on the standardized features, then read
# the label assigned to each sample from the fitted estimator
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_scaled)
cluster_labels = kmeans.labels_

  and should_run_async(code)


In [71]:
# Add cluster labels to the original dataframe as a new 'Cluster' column.
# X was built from `data` with missing values filled (not dropped), so there
# is exactly one label per row of `data`, in the same row order.
data['Cluster'] = cluster_labels

  and should_run_async(code)


In [72]:
# Visualize the clusters: party votes vs. candidate votes, colored by cluster.
# NOTE(review): the raw columns are plotted, while clustering used mean-imputed
# values; rows whose raw vote counts are NaN are presumably not drawn - confirm.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    data['Suara Sah Partai'],
    data['Suara Sah Caleg'],
    c=data['Cluster'],
    cmap='viridis',
    alpha=0.7,
)
ax.set_title('K-means Clustering of Candidates')
ax.set_xlabel('Suara Sah Partai')
ax.set_ylabel('Suara Sah Caleg')
fig.colorbar(scatter, ax=ax)
fig.savefig('kmeans_clustering.png')
plt.close(fig)

  and should_run_async(code)


In [73]:
# Calculate the mean silhouette score of the final clustering, computed on the
# same standardized features the model was fitted on, and report it
silhouette_avg = silhouette_score(X=X_scaled, labels=cluster_labels)
report_line = f"The average silhouette score is: {silhouette_avg}"
print(report_line)


The average silhouette score is: 0.8048208231037867


  and should_run_async(code)


In [74]:
# Analyze clusters: for each cluster print its size, the mean party and
# candidate votes, and the most frequent parties among its rows
for i in range(n_clusters):
    cluster_data = data[data['Cluster'] == i]
    print(f"\nCluster {i}:")
    print(f"Number of candidates: {len(cluster_data)}")
    # Means are taken over the raw (non-imputed) columns, so NaN entries are skipped
    print(f"Average party votes: {cluster_data['Suara Sah Partai'].mean():.2f}")
    print(f"Average candidate votes: {cluster_data['Suara Sah Caleg'].mean():.2f}")
    print("Top parties:")
    # Party names in the file are inconsistently padded (some carry a leading
    # space); strip them so one party is not counted under two different labels
    print(cluster_data['Nama Partai'].str.strip().value_counts().head())
    print("\n---")


Cluster 0:
Number of candidates: 2087
Average party votes: 18756.54
Average candidate votes: 537.50
Top parties:
Nama Partai
Partai Hati Nurani Rakyat                112
 Partai Golkar                           111
Partai Amanat Nasional                   110
Partai Demokrasi Indonesia Perjuangan    106
 Partai Persatuan Pembangunan            103
Name: count, dtype: int64

---

Cluster 1:
Number of candidates: 148
Average party votes: 231322.53
Average candidate votes: 3240.41
Top parties:
Nama Partai
Partai Demokrat                          96
Partai Keadilan Sejahtera                50
Partai Demokrasi Indonesia Perjuangan     2
Name: count, dtype: int64

---

Cluster 2:
Number of candidates: 33
Average party votes: 171821.58
Average candidate votes: 15764.24
Top parties:
Nama Partai
Partai Demokrat                          17
Partai Keadilan Sejahtera                 8
Partai Demokrasi Indonesia Perjuangan     4
 Partai Golkar                            2
Partai Kebangkitan Bangsa

  and should_run_async(code)


In [75]:
# Write the dataframe, including the 'Cluster' column, to CSV without the index
data.to_csv(path_or_buf='clustered_election_data.csv', index=False)

  and should_run_async(code)
