In [1]:
import pandas as pd
import random as rand
from math import sqrt

In [2]:
df = pd.read_csv('./fruit_data_with_colors _1_.csv')
df.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192.0,8.4,7.3,0.55
1,1,apple,granny_smith,180.0,8.0,6.8,0.59
2,1,apple,granny_smith,176.0,7.4,7.2,0.6
3,2,mandarin,mandarin,86.0,6.2,4.7,0.8
4,2,mandarin,mandarin,84.0,6.0,4.6,0.79


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fruit_label    60 non-null     int64  
 1   fruit_name     60 non-null     object 
 2   fruit_subtype  60 non-null     object 
 3   mass           51 non-null     float64
 4   width          60 non-null     float64
 5   height         51 non-null     float64
 6   color_score    60 non-null     float64
dtypes: float64(4), int64(1), object(2)
memory usage: 3.4+ KB


In [4]:
df_2 = df.drop(['fruit_name', 'fruit_subtype', 'fruit_label'], axis=1)  # Dropping categorical/text/object columns, axis=1 means columns
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mass         51 non-null     float64
 1   width        60 non-null     float64
 2   height       51 non-null     float64
 3   color_score  60 non-null     float64
dtypes: float64(4)
memory usage: 2.0 KB


In [5]:
df_2 = df_2.fillna(df_2.mean())  # Filling the missing values with mean value of that column
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mass         60 non-null     float64
 1   width        60 non-null     float64
 2   height       60 non-null     float64
 3   color_score  60 non-null     float64
dtypes: float64(4)
memory usage: 2.0 KB


In [6]:
# Calculate euclidean distance between two rows of pandas DataFrame
def euclidean_distance(row1, row2, columns):
    distance = 0.0
    
    for column in columns:  
        distance += (row1[column] - row2[column])**2
    
    return sqrt(distance)

# Find the closest centroid for this row and return a cluster number
def find_closest(train, row, centroids):
    closest = 0

    min_dist = euclidean_distance(row, centroids[closest], train.columns)  # Initializing min_dist with euclidean of row and first centroid
    for i, centroid in enumerate(centroids):    
        dist = euclidean_distance(row, centroid, train.columns)
        
        if dist < min_dist:  # If euclidean with this centroid is less 
            min_dist = dist
            closest = i+1   # We want to return a cluster number relevant to centroid, i+1 because we want cluster number to start from 1
            
    return closest   # We are returning cluster number here

# Update centroids 
def update_centroids(temp):
    avg = temp.groupby('cluster').mean()   # Calculating mean of features in same cluster
    new_centroids = []
    
    for i in range(0, len(avg)):
        new_centroids.append( avg.iloc[i] )  # average in same cluster would now become new centroid
    
    return new_centroids

# KMeans clustering: Stopping condition is max number of iterations
def KMeans(train, k, iterations=100):
    temp = train.copy()
    
    centroids = [train.iloc[rand.randrange(0, len(train))] for i in range(k)]  # Taking k random rows as initial centroids
    
    for _ in range(iterations):  # Iterate until max iterations reached
        
        temp['cluster'] = temp.apply(lambda row: find_closest(train, row, centroids), axis=1)   # Assigning cluster number to rows
        centroids = update_centroids(temp)  # updating centroids
    
    return temp


# Have centroids remain unchanged, if unchanged return True else False
def centroids_unchanged(old_centroids, current_centroids):
    if old_centroids is None:  # First iteration
        return False
    
    same_centroids = []
    for ind in range(len(old_centroids)):
        
        if old_centroids[ind].equals(current_centroids[ind]):  # If old centroid and updated centroid are same
            same_centroids.append(True)
        else:
            same_centroids.append(False)
    
    return all(same_centroids)

# KMeans clustering: Stopping condition is that centroids remain unchanged
def KMeans_2(train, k):
    temp = train.copy()
    
    old_centroids = None    
    centroids = [train.iloc[rand.randrange(0, len(train))] for i in range(k)]    # Taking k random rows as initial centroids
    
    while not centroids_unchanged(old_centroids, centroids):  # Iterate until centroids remain unchanged
        
        temp['cluster'] = temp.apply(lambda row: find_closest(train, row, centroids), axis=1)   # Assigning cluster number to rows
        old_centroids, centroids = centroids, update_centroids(temp)   
    
    return temp

In [7]:
kmeans1 = KMeans(df_2, 4, iterations=500)
kmeans2 = KMeans_2(df_2, 4)

In [10]:
for i in range(0, len(kmeans1)):
    print(df.iloc[i]['fruit_label'], int(kmeans1.iloc[i]['cluster']))
print()
for i in range(0, len(kmeans2)):
    print(df.iloc[i]['fruit_label'], int(kmeans2.iloc[i]['cluster']))

1 0
1 0
1 0
2 2
2 2
2 2
2 2
2 2
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
3 4
3 0
3 0
3 0
3 2
3 0
3 0
3 0
3 0
3 0
3 2
3 0
3 0
3 0
3 0
3 0
3 3
3 0
3 0
4 0
4 0
4 0
4 0
4 0
4 0
4 2
4 2
4 2
4 2
4 2
4 2
4 2
4 2
4 0
4 2

1 0
1 0
1 0
2 2
2 2
2 2
2 2
2 2
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
4 4
3 4
3 0
3 0
3 0
3 2
3 0
3 0
3 0
3 0
3 0
3 2
3 0
3 0
3 0
3 0
3 0
3 3
3 0
3 0
4 0
4 0
4 0
4 0
4 0
4 0
4 2
4 2
4 2
4 2
4 2
4 2
4 2
4 2
4 0
4 2
