IMPORT LIBRARIES AND DATASET

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from kneed import KneeLocator

In [None]:
# Load the raw training data; the path is relative, so the CSV must be
# reachable from the notebook's working directory.
df = pd.read_csv('kendaraan_train.csv')
df

DATA EXPLORATION AND PREPROCESSING

In [None]:
# Remove the 'id' column before any further processing.
df_pre = df.drop(columns='id')

# Report data-quality figures: number of fully duplicated rows and the
# per-column count of missing values.
n_duplicated_rows = df_pre.duplicated().sum()
missing_per_column = df_pre.isna().sum()
print('Duplicated row count: %d' % n_duplicated_rows)
print('Missing values\n%s' % missing_per_column)

In [None]:
# Discard exact duplicate rows first, then every row that still contains
# at least one missing value.
df_pre = df_pre.drop_duplicates().dropna()
df_pre

In [None]:
# check features datatype
# (the columns that are not numeric here are the ones label-encoded in the
# next cell before scaling)
df_pre.dtypes

In [None]:
# Encoding the data in column 'Jenis_Kelamin', 'Umur_Kendaraan', and 'Kendaraan_Rusak'
# Each column gets its own LabelEncoder, which replaces every distinct value
# with an integer code.
# NOTE(review): LabelEncoder numbers the classes in sorted order; for an
# ordered category such as 'Umur_Kendaraan' confirm that this order is the
# intended one.
object_columns = ['Jenis_Kelamin', 'Umur_Kendaraan', 'Kendaraan_Rusak']

# Work on an explicit copy so the column assignments below never write into
# (or warn about) a frame derived from the earlier drop_duplicates/dropna step.
df_pre = df_pre.copy()
for column in object_columns:
    df_pre[column] = LabelEncoder().fit_transform(df_pre[column])

# Independent snapshot of the encoded (unscaled) data. It later receives the
# 'Cluster' column, so it must not share memory with df_pre.
df_default = df_pre.copy()
df_pre

In [None]:
# Standardise every column with StandardScaler and wrap the resulting array
# in a fresh DataFrame (column names become 0..n-1, index is reset to 0..n-1).
scaler = StandardScaler()
df_pre = pd.DataFrame(scaler.fit_transform(df_pre))
df_pre

In [None]:
# Project the scaled features onto their first two principal components so
# the data can be clustered and plotted in 2-D.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_pre)
df_pre = pd.DataFrame(principal_components)
df_pre

In [None]:
# Extract the two PCA columns as a NumPy array (``.values`` returns an
# ndarray, not a Python list); this is the input to the clustering below.
X = df_pre.values
X

CLUSTERING, K-MEANS (TASK 1)

In [None]:
class KMeans:
  """Minimal k-means clustering (Lloyd's algorithm) for a 2-D array of samples.

  Parameters
  ----------
  k : int
      Number of clusters.
  max_iter : int, default 100
      Upper bound on the number of assignment/update rounds.
  tol : float, default 0.001
      Fitting stops as soon as the summed absolute movement of all centroids
      in one round is at or below this value.
  random_state : int or None, default None
      Seed for a private generator used to pick the initial centroids.
      With None the global ``np.random`` state is used, so a preceding
      ``np.random.seed(...)`` still controls the result.

  Attributes set by ``fit``
  -------------------------
  centroids : ndarray, shape (k, n_features)
  clusters : dict
      Maps each cluster index ``0..k-1`` to an ndarray holding the rows
      assigned to it (shape ``(0, n_features)`` for an empty cluster).
  """

  def __init__(self, k, max_iter=100, tol=0.001, random_state=None):
    self.k = k
    self.max_iter = max_iter
    self.tol = tol
    self.random_state = random_state

  # calculate the distance using euclidean distance method
  def euclidean_dist(self,x1,x2):
    """Return the Euclidean (L2) distance between two points."""
    return np.linalg.norm(x1-x2)

  # randomly initialize the centroids
  def initialize_centroids(self, data):
    """Pick ``k`` distinct rows of ``data`` as starting centroids.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    data = np.asarray(data, dtype=float)
    n_samples = data.shape[0]
    if not 1 <= self.k <= n_samples:
      raise ValueError(
          "k must be between 1 and the number of samples (%d), got %r"
          % (n_samples, self.k))
    rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
    # Sample without replacement: drawing the same row twice would create two
    # identical centroids and leave one of the clusters permanently empty.
    idx = rng.choice(n_samples, size=self.k, replace=False)
    # Index the data that was passed in (not a notebook-level global); fancy
    # indexing returns a copy, so updating centroids never alters the data.
    return data[idx]

  def _assign(self, data):
    """Return, for every row of ``data``, the index of its nearest centroid."""
    # Column j holds the distance of every row to centroid j.
    dists = np.stack(
        [np.linalg.norm(data - centroid, axis=1) for centroid in self.centroids],
        axis=1)
    # argmin keeps the first minimum, i.e. ties go to the lowest cluster index.
    return dists.argmin(axis=1)

  def fit(self, data):
    """Cluster ``data`` (array-like, shape (n_samples, n_features)).

    Repeats "assign rows to nearest centroid, move each centroid to the mean
    of its rows" until the centroids move by at most ``tol`` in total or
    ``max_iter`` rounds were run. Prints the number of completed rounds and
    stores the result in ``self.centroids`` / ``self.clusters``.
    """
    # Float data guarantees that centroid means are not truncated to integers.
    data = np.asarray(data, dtype=float)
    self.clusters = {}
    self.centroids = self.initialize_centroids(data)

    labels = None
    n_iter = 0
    while n_iter < self.max_iter:
      labels = self._assign(data)

      # remember the previous centroids to measure how far they moved
      old_centroids = self.centroids.copy()

      for j in range(self.k):
        members = data[labels == j]
        # An empty cluster keeps its previous centroid instead of failing.
        if len(members):
          self.centroids[j] = members.mean(axis=0)

      # stop once the total centroid movement is within the tolerance
      if np.abs(self.centroids - old_centroids).sum() <= self.tol:
        break

      n_iter += 1

    if labels is None:
      # max_iter <= 0: no round was run, still provide an assignment.
      labels = self._assign(data)

    print("Iterations:", n_iter)
    for j in range(self.k):
      self.clusters[j] = data[labels == j]

  # to find the sum of square errors
  def inertia(self):
    """Return the SSE: sum of squared distances of all rows to their centroid."""
    return float(sum(
        np.sum((self.clusters[j] - self.centroids[j]) ** 2)
        for j in range(len(self.centroids))))

In [None]:
# to plot the elbow method
def plot_elbow(k_range, sse):
  """Draw the elbow curve: one SSE value per candidate number of clusters.

  k_range -- iterable of the cluster counts that were tried (x axis)
  sse     -- sequence of the matching error values (y axis)
  """
  line_style = dict(marker='o', markerfacecolor='orange', markersize=10, lw=5, color='blue')
  cluster_counts = list(k_range)

  plt.figure(figsize=(20,10))
  plt.plot(cluster_counts, sse, **line_style)
  for set_text, text in ((plt.title, 'Elbow Curve'),
                         (plt.xlabel, 'Number of Clusters'),
                         (plt.ylabel, 'SSE')):
    set_text(text)
  plt.show()

In [None]:
# to plot the cluster model
def plot_clusters(model, xl, yl):
  """Scatter every cluster of a fitted model in its own colour, plus centroids.

  model -- fitted clustering object exposing ``centroids`` (2-D array) and
           ``clusters`` (mapping cluster index -> 2-D array of points)
  xl    -- x-axis label
  yl    -- y-axis label
  """
  colors = ["red","green","blue","cyan","magenta","yellow","pink","orange","purple","brown"]
  plt.figure(figsize=(20,10))
  for k in range(len(model.centroids)):
    points = model.clusters[k]
    # Cycle through the palette and generate the label, so more than ten
    # clusters no longer raise IndexError.
    plt.scatter(points[:,0], points[:,1], c=colors[k % len(colors)],
                label='Cluster-%d' % (k + 1), s = 30, alpha=0.4)
  plt.scatter(model.centroids[:,0], model.centroids[:,1],c="black",label="Centroid", s = 150, marker = "X")
  plt.xlabel(xl)
  plt.ylabel(yl)
  plt.legend()
  # The call parentheses were missing: a bare ``plt.show`` only references
  # the function and never renders the figure explicitly.
  plt.show()

In [None]:
# form the model from the dataset with range k is 2-10
# Centroids are initialised at random, so seed NumPy's global generator first:
# without it the SSE curve, the detected elbow and the exported cluster labels
# change from run to run.
np.random.seed(42)

k_models = []
k_range = range(2,11)
for i in k_range:
    print("K =",i)
    model = KMeans(k=i,max_iter=100,tol=0.001)
    model.fit(X)
    k_models.append(model)

In [None]:
# Gather the inertia of every fitted model (same order as k_range) and draw
# the elbow curve from it.
sse = []
for fitted_model in k_models:
  sse.append(fitted_model.inertia())
plot_elbow(k_range,sse)

In [None]:
# find the elbow curve using kneelocator
# The SSE curve is treated as convex and decreasing in the number of clusters.
# NOTE(review): presumably kl.elbow is None when no knee is detected — confirm
# against the kneed documentation; the later `opt-2` indexing would then fail.
kl = KneeLocator(k_range,sse,curve="convex",direction="decreasing")
opt = kl.elbow
print("Optimal K:", opt)

In [None]:
# Axis captions, reused by the later cluster plot and as result column names.
xl, yl = 'PCA-0', 'PCA-1'

# Scatter the PCA-projected data before any clustering is applied.
first_component = X[:,0]
second_component = X[:,1]
plt.figure(figsize=(20,10))
plt.scatter(first_component, second_component, s = 30, alpha=0.4)
plt.xlabel(xl)
plt.ylabel(yl)
plt.show()

In [None]:
# plot the model with optimal k
# Fail with a clear message instead of a TypeError when no elbow was detected.
if opt is None:
  raise ValueError("No elbow was detected in the SSE curve; choose the number of clusters manually.")

# k_models is ordered like k_range, so look the position up there rather than
# hard-coding the offset of the range's first value.
opt_k = k_models[list(k_range).index(opt)]
plot_clusters(opt_k,xl,yl)
print(opt_k.centroids)

In [None]:
# Collecting the explored data in a dataframe
# One frame per cluster: its 2-D points plus a 1-based 'Cluster' label. The
# frames are stacked in the order of the model's cluster dict, so the rows of
# `result` are grouped by cluster and NOT in the original data order.
# (Iterates the dict directly; the previous version passed the dict to
# np.vstack only to enumerate its keys.)
frames = []
for cluster_id, points in opt_k.clusters.items():
  if len(points) == 0:
    # nothing to add for a cluster without members
    continue
  frame = pd.DataFrame(points, columns=[xl, yl])
  frame['Cluster'] = cluster_id + 1
  frames.append(frame)
result = pd.concat(frames, ignore_index=True)

result

In [None]:
# Bring the clustered rows back into the order of the PCA frame: the PCA-0
# value is used as lookup key and the rows are re-ordered to follow df_pre[0].
# NOTE(review): this assumes every PCA-0 value is unique and matches exactly;
# verify, since the lookup is done on floating-point values.
original_order = df_pre[0]
result = (
    result.set_index(xl)
          .reindex(index=original_order)
          .reset_index()
          .rename(columns = {0:xl})
)
result

In [None]:
# assign 'Cluster' label into default data
# `.values` drops the index of `result`, so labels are attached purely by row
# position; this relies on `result` having been re-ordered to the original
# row order in the previous cell.
df_default['Cluster'] = result['Cluster'].values
# store the labels as integers
df_default['Cluster'] = df_default['Cluster'].astype(int)
df_default

In [None]:
# Profile each cluster: group the encoded (unscaled) data by 'Cluster' and
# take the mean of every remaining column.
demographic = df_default.groupby('Cluster').mean()
demographic

In [None]:
# Export the per-cluster means; the index is written too because it holds the
# 'Cluster' id.
demographic.to_csv('export_demo_cluster_kendaraan.csv', index=True, header=True)

In [None]:
# Export the encoded data together with its 'Cluster' column to a CSV file
# (row index omitted).
df_default.to_csv('export_cluster_kendaraan.csv', index=False, header=True)