In [3]:
import pandas as pd
import numpy as np
import numpy.matlib
import math 
from numpy import linalg as LA
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from time import sleep
import sys

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import sklearn.cluster
import sklearn.metrics
import matplotlib.cm as cm
import itertools
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture

In [5]:
# Mount Google Drive at /content/drive so later cells can read files from it
# by path. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading Datasets

In [None]:
# Load both datasets from the mounted Drive folder and convert them to NumPy arrays.
# NOTE(review): a later cell reloads the same two names from local files
# ('ToyDataSet.csv', 'bboxes.csv'); whichever cell runs last decides what
# `toy_data` / `bboxes_data` hold. Keep only one of the two loaders.
toy_data = pd.read_csv('/content/drive/My Drive/IS/Final/ToyDataSet.csv').to_numpy()
bboxes_data = pd.read_csv('/content/drive/My Drive/IS/Final/bboxes.csv').to_numpy()

In [5]:
def normalize(x):
    """Standardise each feature (column) of ``x`` to zero mean and unit std.

    Parameters
    ----------
    x : array-like
        Samples along axis 0, features along the remaining axes.

    Returns
    -------
    ``(x - mean) / std`` computed per feature along axis 0. A feature whose
    standard deviation is exactly 0 (constant column) is only centred, so it
    comes out as zeros instead of NaN/inf from a division by zero.
    """
    mean = np.mean(x, axis = 0)
    std = np.std(x, axis = 0)
    # Dividing by 1 leaves a constant (already centred) feature at 0.
    safe_std = np.where(std == 0, 1.0, std)
    return (x - mean) / safe_std

In [6]:
# Load both datasets from CSV files in the current working directory and
# convert them to NumPy arrays.
# NOTE(review): an earlier cell loads the same two names from the mounted
# Drive folder; this cell overwrites them when both are run.
toy_data = pd.read_csv('ToyDataSet.csv').to_numpy()
bboxes_data = pd.read_csv('bboxes.csv').to_numpy()

In [7]:
# Replace the raw toy data with its per-feature standardised version
# (the raw values are no longer available under this name afterwards).
toy_data = normalize( toy_data )

In [8]:
# Reduce every bounding box to two features: (col4 - col2) / col0 and
# (col5 - col3) / col1.
# NOTE(review): presumably cols 0/1 are the image width/height and cols 2..5
# the box corners, making these the relative box width/height — confirm
# against bboxes.csv.
norm_bboxes = np.zeros((bboxes_data.shape[0],2))
norm_bboxes[ :, 0 ] = (bboxes_data[:,4] - bboxes_data[:,2])/(bboxes_data[:,0])
norm_bboxes[ :, 1 ] = (bboxes_data[:,5] - bboxes_data[:,3])/(bboxes_data[:,1])
# Shuffle the rows (in place) so that taking the first N rows later is a
# random sample. The seed makes that sample identical on every
# Restart & Run All; without it the clustered subset changes per run.
SHUFFLE_SEED = 0
np.random.seed( SHUFFLE_SEED )
np.random.shuffle( norm_bboxes )
np.random.shuffle( toy_data )

In [9]:
class Agglomerative:
    """Bottom-up (agglomerative) hierarchical clustering written from scratch.

    Every sample starts as its own cluster and ``run`` repeatedly merges the
    two closest clusters until ``n_clusters`` remain. Clusters live in
    ``self.clusters``: a dict whose keys are always the contiguous integers
    ``0 .. k-1`` and whose values are arrays of row indices into the data.

    Parameters
    ----------
    n_clusters : int
        Number of clusters at which merging stops.
    affinity : {'euclidean', 'manhattan'}
        Point-to-point distance.
    linkage : {'single', 'complete', 'average'}
        How a cluster-to-cluster distance is derived from the pairwise point
        distances (minimum, maximum or mean respectively).

    Attributes
    ----------
    affinity_mat : ndarray of shape (n, n)
        Pairwise point distances, filled by ``cal_affinity``.
    assigned_labels_all : list of ndarray
        Label vectors recorded by ``run`` for the final merging steps, ordered
        from the fewest clusters to the most.
    avg_silhoutte_score : list of float
        Silhouette score for each entry of ``assigned_labels_all``.
    """

    # Labels/scores are recorded once fewer than this many clusters remain.
    _TRACK_BELOW = 41
    # `ord` argument passed to LA.norm for each supported affinity.
    _NORM_ORDERS = {'euclidean': None, 'manhattan': 1}
    # Reduction applied to the pairwise distances for each supported linkage.
    _LINKAGES = {'single': np.min, 'complete': np.max, 'average': np.mean}

    def __init__(self, n_clusters=1, affinity='euclidean', linkage='single'):
        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.clusters = {}
        self.affinity_mat = 0
        self.avg_silhoutte_score = []
        self.assigned_labels_all = []

    def _norm_order(self):
        """Return the ``ord`` value for ``LA.norm`` matching ``self.affinity``.

        Raises
        ------
        ValueError
            If ``self.affinity`` is not a supported name.
        """
        if self.affinity not in self._NORM_ORDERS:
            raise ValueError(
                "unknown affinity %r; expected one of %s"
                % (self.affinity, sorted(self._NORM_ORDERS))
            )
        return self._NORM_ORDERS[self.affinity]

    def cal_affinity(self, data):
        """Fill ``self.affinity_mat`` with the distance of every pair of rows.

        ``data`` is a 2-D array with one sample per row. The matrix is
        symmetric, so each row only computes distances to itself and the
        samples after it and mirrors them.
        """
        order = self._norm_order()
        data = np.asarray(data)
        n = data.shape[0]
        self.affinity_mat = np.zeros((n, n))
        for row in range(n):
            # One vectorised norm per row instead of one Python call per pair.
            dists = LA.norm(data[row:, :] - data[row, :], ord=order, axis=1)
            self.affinity_mat[row, row:] = dists
            self.affinity_mat[row:, row] = dists

    def affinity_dist(self, x, y):
        """Return the distance between two points under ``self.affinity``.

        Raises ``ValueError`` for an unsupported affinity (previously this
        silently returned ``None``).
        """
        return LA.norm(x - y, ord=self._norm_order())

    def cluster_distance(self, a, b):
        """Return the distance between clusters ``a`` and ``b``.

        ``a`` and ``b`` are keys of ``self.clusters``. The result is the
        minimum, maximum or mean of all point-to-point distances between the
        two clusters, depending on ``self.linkage``.

        Raises
        ------
        ValueError
            If ``self.linkage`` is not a supported name (previously this
            silently returned ``None``).
        """
        if self.linkage not in self._LINKAGES:
            raise ValueError(
                "unknown linkage %r; expected one of %s"
                % (self.linkage, sorted(self._LINKAGES))
            )
        reduce_fn = self._LINKAGES[self.linkage]
        # Sub-matrix of distances: rows = members of a, columns = members of b.
        pairwise = self.affinity_mat[np.ix_(self.clusters[a], self.clusters[b])]
        return reduce_fn(pairwise)

    def find_nearest_clusters(self):
        """Return the key pair ``(i, j)``, ``i < j``, of the two closest clusters.

        Ties keep the first pair found. ``(-1, -1)`` is returned when no pair
        has a distance below infinity (e.g. fewer than two clusters).
        """
        nearest_clusters = (-1, -1)
        best_dist = np.inf
        n_clusters = len(self.clusters)
        for number_1 in range(n_clusters - 1):
            for number_2 in range(number_1 + 1, n_clusters):
                temp_dist = self.cluster_distance(number_1, number_2)
                if temp_dist < best_dist:
                    best_dist = temp_dist
                    nearest_clusters = (number_1, number_2)
        return nearest_clusters

    def combine_nearest_clusters(self, nearest):
        """Merge cluster ``nearest[1]`` into ``nearest[0]`` and re-pack the keys.

        After the merge every cluster above the removed one is shifted down by
        one so the keys stay contiguous ``0 .. k-1``, which
        ``find_nearest_clusters`` relies on.
        """
        first_cluster, second_cluster = nearest
        last_key = len(self.clusters) - 1
        self.clusters[first_cluster] = np.concatenate(
            (self.clusters[first_cluster], self.clusters[second_cluster])
        )
        # Empty range when the merged-away cluster already was the last key.
        for number in range(second_cluster, last_key):
            self.clusters[number] = self.clusters[number + 1]
        del self.clusters[last_key]

    def assign_label(self, data):
        """Build ``self.predicted``: the current cluster key of every sample.

        A fresh array is created on every call; samples not contained in any
        cluster keep the value -1.
        """
        self.predicted = -1 * np.ones(data.shape[0])
        for key, members in self.clusters.items():
            self.predicted[members] = key

    def average_silhoutte(self, data):
        """Append the silhouette score of ``self.predicted`` to the score list.

        scikit-learn only defines the score for
        ``2 <= n_labels <= n_samples - 1`` and raises otherwise, which used to
        crash the final merge of a run with ``n_clusters=1``. In that case NaN
        is recorded instead so the score list stays aligned with
        ``assigned_labels_all``.

        NOTE(review): no ``metric`` is passed, so scikit-learn's default
        (euclidean) is used even when ``affinity='manhattan'`` — confirm this
        is intended.
        """
        n_labels = np.unique(self.predicted).size
        if 2 <= n_labels <= data.shape[0] - 1:
            score = sklearn.metrics.silhouette_score(data, self.predicted)
        else:
            score = np.nan
        self.avg_silhoutte_score.append(score)

    def run(self, data, n=np.inf):
        """Cluster ``data`` by merging until ``n_clusters`` clusters remain.

        Parameters
        ----------
        data : 2-D array, one sample per row.
        n : maximum number of merges to perform (default: unlimited).

        Side effects
        ------------
        Rebuilds ``affinity_mat`` and ``clusters``, prints a progress bar to
        stdout, and records labels plus silhouette score after every merge
        that leaves fewer than ``_TRACK_BELOW`` clusters. Both record lists
        are reset at the start (so repeated runs do not mix results) and
        reversed at the end, i.e. index 0 is the final, smallest clustering.
        """
        data = np.asarray(data)
        self.cal_affinity(data)
        size_data = data.shape[0]
        # Start from scratch: every sample is its own cluster.
        self.clusters = {i: np.array([i]) for i in range(size_data)}
        self.avg_silhoutte_score = []
        self.assigned_labels_all = []
        itter = 0
        while len(self.clusters) > self.n_clusters:
            if itter == n:
                break
            nearest_clusters = self.find_nearest_clusters()
            self.combine_nearest_clusters(nearest_clusters)
            # Progress bar: share of the required merges done so far.
            percent = ((size_data - len(self.clusters)) * 100) / (size_data - self.n_clusters)
            percent_5 = int(percent // 5)
            sys.stdout.write('\r')
            sys.stdout.write("[%-20s] %.5f%%" % ('=' * percent_5, percent))
            sys.stdout.flush()

            itter += 1
            if len(self.clusters) < self._TRACK_BELOW:
                self.assign_label(data)
                self.average_silhoutte(data)
                self.assigned_labels_all.append(self.predicted)
        self.assigned_labels_all.reverse()
        self.avg_silhoutte_score.reverse()

In [10]:
agg = Agglomerative( n_clusters = 2 , affinity = 'euclidean' , linkage = 'single' )
d = toy_data[0:1000,:]
agg.run( d )
df = pd.DataFrame( agg.assigned_labels_all )
df.to_csv('Toy_1000_all_assiged_labels.csv')
!cp Toy_1000_all_assiged_labels.csv "drive/My Drive/IS/Final/"
df_1 = pd.DataFrame( agg.avg_silhoutte_score )
df_1.to_csv('Toy_1000_avg_silhoutte_score.csv')
!cp Toy_1000_avg_silhoutte_score.csv "drive/My Drive/IS/Final/"
df = pd.DataFrame( d )
df.to_csv('Toy_trained_data.csv')
!cp Toy_trained_data.csv "drive/My Drive/IS/Final/"

cp: cannot create regular file 'drive/My Drive/IS/Final/': No such file or directory
cp: cannot create regular file 'drive/My Drive/IS/Final/': No such file or directory


In [11]:
# Reload the saved toy-data clustering results as NumPy arrays.
# The files were written with DataFrame.to_csv's default index, so column 0
# of every array is the saved row index and the real values start at column 1.
Toy_1000_all_assiged_labels = pd.read_csv('/content/drive/My Drive/IS/Final/Toy_1000_all_assiged_labels.csv').to_numpy()
Toy_1000_avg_silhoutte_score = pd.read_csv('/content/drive/My Drive/IS/Final/Toy_1000_avg_silhoutte_score.csv').to_numpy()
Toy_trained_data = pd.read_csv('/content/drive/My Drive/IS/Final/Toy_trained_data.csv').to_numpy()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/IS/Final/Toy_1000_all_assiged_labels.csv'

In [None]:
# Silhouette score per number of clusters k for the toy data.
# The run stopped at n_clusters = 2 and the score list was reversed before it
# was saved, so row 0 belongs to k = 2 (not k = 1): the k axis therefore starts
# at 2, exactly as in the BBoxes plot further down.
spec_silhouette_avgs = []  # not used in this cell; kept in case other cells expect the name
toy_scores = Toy_1000_avg_silhoutte_score[:,1]  # column 0 is the saved row index
toy_ks = range(2, len(toy_scores)+2)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(toy_ks, toy_scores, '-o', alpha = 0.5, color = 'mediumpurple')
ax.set(title='Silhouette Score Agglomerative Clustering in Toy Dataset', ylabel = 'score', xlabel='k');
# Dashed line marks the k with the highest score.
ax.axvline(x = np.argmax(toy_scores) + 2, color = 'mediumpurple', label = 'best silhouette score', linestyle  ='--')
ax.set_xticks(toy_ks)
ax.legend()
plt.show()

In [None]:
# Display column 0 of the reloaded toy data.
# NOTE(review): presumably this is the row-index column added by to_csv, not a
# feature — looks like a leftover sanity check; confirm or remove.
Toy_trained_data[:,0]

In [None]:
# Scatter plot of the toy samples coloured by one saved clustering.
# Row 9 of the label table is used (column 0 is the saved row index).
# NOTE(review): with rows starting at k = 2 this should be the 11-cluster
# solution — confirm that this is the clustering meant to be shown.
toy_labels = Toy_1000_all_assiged_labels[9,1:]
toy_points = Toy_trained_data[:,1:3]  # the two feature columns, index column skipped
toy_k = np.unique(toy_labels).shape[0]

fig, ax = plt.subplots(figsize=(10, 6))
# Spread the labels over the upper half [0.5, 1.0) of the colormap. The former
# `labels / k + 0.5` went above 1.0 for the upper half of the clusters, and
# the colormap clips such values, so those clusters all got the same shade.
colors = cm.Purples(0.5 + 0.5 * toy_labels.astype(float) / toy_k)
ax.scatter(toy_points[:, 0], toy_points[:, 1], marker = '.', s = 30, lw = 0, alpha = 0.7, c = colors)
# Cluster centres = mean of the member points; labels are 0 .. k-1.
centers = [np.mean(toy_points[toy_labels == i], axis = 0) for i in range(toy_k)]
# Draw every white disc first, then the cluster numbers on top of them.
for i, c in enumerate(centers): ax.scatter(c[0], c[1],  marker='o', c="white", alpha = 1, s = 200, edgecolor='k')
for i, c in enumerate(centers): ax.scatter(c[0], c[1], marker='$%d$' % i, alpha = 1, s =50, edgecolor='k')
ax.set_title("The visualization of the clustered data(from scratch)")
ax.set_xlabel("Feature space for the 1st feature")
ax.set_ylabel("Feature space for the 2nd feature")
plt.show()

In [None]:
np.random.shuffle( norm_bboxes )
agg_2 = Agglomerative( n_clusters = 2 , affinity = 'euclidean' , linkage = 'single' )
d_2 = norm_bboxes[0:1000,:]
agg_2.run( d_2 )
df = pd.DataFrame( agg_2.assigned_labels_all )
df.to_csv('BBoxes_1000_all_assiged_labels.csv')
!cp BBoxes_1000_all_assiged_labels.csv "drive/My Drive/IS/Final/"
df_1 = pd.DataFrame( agg_2.avg_silhoutte_score )
df_1.to_csv('BBoxes_1000_avg_silhoutte_score.csv')
!cp BBoxes_1000_avg_silhoutte_score.csv "drive/My Drive/IS/Final/"
df_3 = pd.DataFrame( d_2 )
df_3.to_csv('BBoxes_trained_data.csv')
!cp BBoxes_trained_data.csv "drive/My Drive/IS/Final/"

In [None]:
# Reload the saved BBoxes clustering results as NumPy arrays.
# The files were written with DataFrame.to_csv's default index, so column 0
# of every array is the saved row index and the real values start at column 1.
BBoxes_1000_all_assiged_labels = pd.read_csv('/content/drive/My Drive/IS/Final/BBoxes_1000_all_assiged_labels.csv').to_numpy()
BBoxes_1000_avg_silhoutte_score = pd.read_csv('/content/drive/My Drive/IS/Final/BBoxes_1000_avg_silhoutte_score.csv').to_numpy()
BBoxes_trained_data = pd.read_csv('/content/drive/My Drive/IS/Final/BBoxes_trained_data.csv').to_numpy()

In [None]:
# Silhouette score per number of clusters k for the BBoxes data.
# Row 0 of the saved scores belongs to k = 2 (the run stopped at
# n_clusters = 2 and the list was reversed before saving), hence the +2.
spec_silhouette_avgs = []  # not used in this cell; kept in case other cells expect the name
bboxes_scores = BBoxes_1000_avg_silhoutte_score[:,1]  # column 0 is the saved row index
bboxes_ks = range(2, len(bboxes_scores)+2)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(bboxes_ks, bboxes_scores, '-o', alpha = 0.5, color = 'mediumpurple')
# Title fixed: this figure shows the BBoxes data, not the toy data.
ax.set(title='Silhouette Score Agglomerative Clustering in BBoxes Dataset', ylabel = 'score', xlabel='k');
# Dashed line marks the k with the highest score.
ax.axvline(x = np.argmax(bboxes_scores) + 2, color = 'mediumpurple', label = 'best silhouette score', linestyle  ='--')
ax.set_xticks(bboxes_ks)
ax.legend()
plt.show()

In [None]:
# Scatter plot of the BBoxes samples coloured by one saved clustering.
# Row 4 of the label table is used (column 0 is the saved row index).
# NOTE(review): with rows starting at k = 2 this should be the 6-cluster
# solution — confirm that this is the clustering meant to be shown.
bboxes_labels = BBoxes_1000_all_assiged_labels[4,1:]
bboxes_points = BBoxes_trained_data[:,1:3]  # the two feature columns, index column skipped
bboxes_k = np.unique(bboxes_labels).shape[0]

fig, ax = plt.subplots(figsize=(10, 6))
# Spread the labels over the upper half [0.5, 1.0) of the colormap. The former
# `labels / k + 0.5` went above 1.0 for the upper half of the clusters, and
# the colormap clips such values, so those clusters all got the same shade.
colors = cm.Purples(0.5 + 0.5 * bboxes_labels.astype(float) / bboxes_k)
ax.scatter(bboxes_points[:, 0], bboxes_points[:, 1], marker = '.', s = 30, lw = 0, alpha = 0.7, c = colors)
# Cluster centres = mean of the member points; labels are 0 .. k-1.
centers = [np.mean(bboxes_points[bboxes_labels == i], axis = 0) for i in range(bboxes_k)]
# Draw every white disc first, then the cluster numbers on top of them.
for i, c in enumerate(centers): ax.scatter(c[0], c[1],  marker='o', c="white", alpha = 1, s = 200, edgecolor='k')
for i, c in enumerate(centers): ax.scatter(c[0], c[1], marker='$%d$' % i, alpha = 1, s =50, edgecolor='k')
ax.set_title("The visualization of the clustered data(from scratch)")
ax.set_xlabel("Feature space for the 1st feature")
ax.set_ylabel("Feature space for the 2nd feature")
plt.show()