In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import Birch
from sklearn.neighbors import NearestNeighbors

# PCA

In [12]:
# pca

def pca(data, n_components=300):
    """Project ``data`` onto its leading principal components.

    Parameters
    ----------
    data : 2-D array-like exposing ``.shape``
        Input matrix with one row per sample and one column per feature.
    n_components : int, default 300
        Upper bound on the number of components to keep. The value actually
        used is capped at ``min(n_samples, n_features)`` so that small inputs
        no longer make the PCA fit fail.

    Returns
    -------
    pandas.DataFrame
        The transformed data with columns ``pca1`` ... ``pcaK`` (K is the
        number of fitted components) and a default integer index.

    Side effects
    ------------
    Prints the total explained variance ratio of the kept components.
    """
    n_samples, n_features = data.shape
    # Cap the request: asking for more components than rows/columns is an
    # error for the estimator, and 300 was only ever an upper bound.
    n_keep = min(n_components, n_samples, n_features)

    model = PCA(n_components=n_keep)
    model.fit(data)
    data_pca = pd.DataFrame(model.transform(data))

    # rename the columns of the transformed set to pca1, pca2, ...
    data_pca.columns = ['pca%d' % i for i in range(1, model.n_components_ + 1)]

    # share of variance explained by the selected principal components
    print('explained_variance_ratio_  ')
    print()
    print("sum: %.2f" % float(np.sum(model.explained_variance_ratio_)))

    return data_pca


In [13]:
def tsne(data, file_name, perplexity=30, n_iter=300, random_state=None):
    """Embed ``data`` in two dimensions with t-SNE, plot it and save the plot.

    Parameters
    ----------
    data : array-like
        Input passed unchanged to ``TSNE.fit_transform``.
    file_name : str
        Prefix of the saved picture; the file is written to
        ``./<file_name>_tsne_perp<perplexity>.png``.
    perplexity : int or float, default 30
        t-SNE perplexity. Also embedded in the output file name.
    n_iter : int, default 300
        Number of optimisation iterations.
    random_state : int or None, default None
        Seed forwarded to the estimator; pass an int for a repeatable
        embedding.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``tsne_1`` and ``tsne_2``, one row per input sample.
    """
    # Newer scikit-learn releases call the iteration count ``max_iter`` and
    # reject ``n_iter``; older ones only know ``n_iter``. Try the new
    # spelling first and fall back to the old one.
    try:
        mdl = TSNE(perplexity=perplexity, max_iter=n_iter, random_state=random_state)
    except TypeError:
        mdl = TSNE(perplexity=perplexity, n_iter=n_iter, random_state=random_state)

    data_tsne = pd.DataFrame(mdl.fit_transform(data))
    data_tsne.columns = ['tsne_1', 'tsne_2']

    plt.scatter(data_tsne['tsne_1'], data_tsne['tsne_2'], s=10, marker='o')
    plt.xlabel("tsne0")
    plt.ylabel("tsne1")
    plt.title("file")
    plt.tight_layout()
    # Keep the file name in sync with the perplexity actually used
    # (the default still yields "..._tsne_perp30.png").
    pic_path = "./" + file_name + ("_tsne_perp%g.png" % perplexity)
    plt.savefig(pic_path, dpi=300)
    plt.show()

    return data_tsne


# Crtanje grafika

In [14]:
def grafik(data, data_pca_tsne, n_clusters):
    """Scatter-plot every cluster of a 2-D embedding in its own colour.

    Parameters
    ----------
    data : any
        Unused; kept so existing callers keep working.
    data_pca_tsne : pandas.DataFrame
        Must contain the columns ``tsne_1``, ``tsne_2`` and ``label``.
    n_clusters : int
        Clusters ``0 .. n_clusters - 1`` are drawn, one scatter call each.

    Draws on the current matplotlib figure and returns ``None``.
    """
    colors = ['darkcyan', 'magenta', 'gold', 'navy', 'orange']

    for j in range(n_clusters):
        # select the instances of the cluster being drawn
        cluster = data_pca_tsne.loc[data_pca_tsne['label'] == j, :]

        # Cycle through the palette so that more clusters than colours
        # reuse a colour instead of raising IndexError.
        plt.scatter(cluster['tsne_1'], cluster['tsne_2'],
                    color=colors[j % len(colors)], s=10, marker='o',
                    label="klaster %d" % j)

    # One legend after all clusters are drawn; skipped when nothing was drawn.
    if n_clusters > 0:
        plt.legend(loc='upper right', fontsize=6)
        

In [15]:
def knee(data, file_name):
    """Plot the sorted, max-normalised nearest-neighbour distances of ``data``.

    A two-neighbour model is fitted on ``data`` and queried with the same
    ``data``; the second distance column is sorted ascending, divided by its
    largest value, drawn as a line and saved to
    ``./<file_name>_kneeChart.png``. Nothing is returned.
    """
    model = NearestNeighbors(n_neighbors=2).fit(data)
    neighbour_dist, _ = model.kneighbors(data)

    # Only column 1 is used; presumably column 0 is each point's zero
    # distance to itself -- TODO confirm against the estimator docs.
    second_column = np.sort(neighbour_dist[:, 1])
    scaled = second_column / second_column.max()

    plt.plot(scaled)
    plt.savefig("./" + file_name + "_kneeChart.png", dpi=300)

# DBScan

In [21]:
# Funkcija koja pravi najbolji dbscan model

def dbscan(data, data_pca_tsne, eps, file_name):
    """Pick the best ``min_samples`` for DBSCAN by silhouette score and plot it.

    For each ``min_samples`` in ``[2, 10, 30, 50, 100]`` a DBSCAN model with
    the given ``eps`` is fitted on ``data``; the value with the highest
    silhouette score wins (5 is used when no candidate can be scored). The
    winning model's labels are written to ``data_pca_tsne['label']``, every
    label from -1 up to the largest one is scattered on the
    ``tsne_1``/``tsne_2`` columns, and the figure is saved to
    ``./<file_name>_DBScan.png``.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    data_pca_tsne : pandas.DataFrame
        2-D embedding with columns ``tsne_1`` and ``tsne_2``; gains or
        overwrites a ``label`` column (mutated in place).
    eps : float
        DBSCAN neighbourhood radius.
    file_name : str
        Prefix of the saved picture.

    Returns
    -------
    The ``labels_`` array of the final model.
    """

    def _silhouette_defined(labels):
        # silhouette_score needs at least two labels and fewer labels than
        # samples; outside that range it raises instead of returning a score.
        n_labels = len(np.unique(labels))
        return 1 < n_labels < len(labels)

    # search for the parameters of the best model
    best_min_samples = 5
    best_silhouette_score = None

    for min_samples in [2, 10, 30, 50, 100]:
        est = DBSCAN(eps=eps, min_samples=min_samples)
        est.fit(data)
        n_labels = len(np.unique(est.labels_))

        print("min_samples: ", min_samples)
        print("labels: ", est.labels_)
        print("slusters: ", n_labels)
        if _silhouette_defined(est.labels_):
            print("lables: ", n_labels)
            silhouette = silhouette_score(data, est.labels_)
            print("silhouette score: ", silhouette)
            if best_silhouette_score is None or silhouette > best_silhouette_score:
                best_silhouette_score = silhouette
                best_min_samples = min_samples

    # once the best parameters are known, apply that model
    print("best_min_samples: ", best_min_samples)
    est = DBSCAN(eps=eps, min_samples=best_min_samples)
    est.fit(data)
    labels = est.labels_
    data_pca_tsne['label'] = labels
    num_clusters = max(labels) + 1

    # Label -1 (legend entry 'sum') gets its own fixed colour instead of
    # relying on a negative list index; real clusters cycle through the rest
    # so any number of clusters can be drawn.
    noise_color = 'violet'
    cluster_colors = ['darkcyan', 'magenta', 'gold', 'blue', 'navy',
                      'green', 'red', 'orange', 'brown', 'yellow']

    # draw every cluster
    for j in range(-1, num_clusters):
        # select the instances of the cluster being drawn
        cluster = data_pca_tsne.loc[data_pca_tsne['label'] == j, :]

        # legend entry and colour for this cluster
        if j == -1:
            label = 'sum'
            color = noise_color
        else:
            label = "klaster %d" % j
            color = cluster_colors[j % len(cluster_colors)]

        plt.scatter(cluster['tsne_1'], cluster['tsne_2'], color=color, s=10, marker='o', label=label)

    # legend and title (eps, min_samples and, when defined, the silhouette)
    plt.legend(loc='lower right', fontsize=6)

    if _silhouette_defined(labels):
        # Reuse the score found during the search; it is only missing when
        # no candidate could be scored and the fallback min_samples was used.
        if best_silhouette_score is not None:
            final_score = best_silhouette_score
        else:
            final_score = silhouette_score(data, labels)
        plt.title('DBScan, eps:%.2f min_samples: %d, silhouette score: %.2f ' % (eps, best_min_samples, final_score), fontsize=10)
    else:
        plt.title('DBSCAN, eps:%.2f, min_samples: %d ' % (eps, best_min_samples), fontsize=10)

    # Lay the figure out before saving so the saved picture matches what is
    # shown (the other clustering functions in this notebook do the same).
    plt.tight_layout()

    pic_path = "./" + file_name + "_DBScan.png"
    plt.savefig(pic_path, dpi=300)
    plt.show()

    return labels


# KMeans

In [17]:
def kmeans(data, data_pca_tsne, file_name, random_state=None):
    """Grid-search KMeans by silhouette score, plot and return the best labels.

    Every combination of ``n_clusters`` in ``[2, 3, 4, 5]`` and ``init`` in
    ``['k-means++', 'random']`` is fitted on ``data`` and scored. The labels
    of the best-scoring fit are kept, so the clustering that is plotted and
    returned is exactly the one that was scored (no second, differently
    initialised fit).

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    data_pca_tsne : pandas.DataFrame
        2-D embedding with columns ``tsne_1`` and ``tsne_2``; gains or
        overwrites a ``label`` column (mutated in place).
    file_name : str
        Prefix of the saved picture ``./<file_name>_KMeans.png``.
    random_state : int or None, default None
        Seed forwarded to every KMeans fit; pass an int for repeatable runs.

    Returns
    -------
    The ``labels_`` array of the best-scoring model.

    Raises
    ------
    ValueError
        If no configuration yields a score above -1.
    """
    # search for the parameters of the best model
    best_init = ""
    best_n_clusters = 0
    best_silhouette_score = -1
    best_labels = None

    for n_clusters in [2, 3, 4, 5]:
        for init in ['k-means++', 'random']:
            est = KMeans(n_clusters=n_clusters, init=init, random_state=random_state)
            est.fit(data)

            silhouette = silhouette_score(data, est.labels_)
            print("num of clusters: ", n_clusters)
            print("init: ", init)
            print("silhouette: ", silhouette)
            if silhouette > best_silhouette_score:
                best_init = init
                best_n_clusters = n_clusters
                best_silhouette_score = silhouette
                # remember the labels that earned this score
                best_labels = est.labels_
            print("bestn_clusters: ", best_n_clusters)
            print("best init: ", best_init)
            print("--------------------------------------")

    if best_labels is None:
        raise ValueError("no KMeans configuration produced a usable silhouette score")

    # the best model has been found; use its labels
    print("best n_clusters: ", best_n_clusters)
    print("best init: ", best_init)
    data_pca_tsne['label'] = best_labels

    grafik(data, data_pca_tsne, best_n_clusters)
    plt.title('KMeans, init:' + best_init + ',  best_n_clusters: %d , silhouette score: %.2f' % (best_n_clusters, best_silhouette_score), fontsize=10)
    # lay the figure out before saving
    plt.tight_layout()

    pic_path = "./" + file_name + "_KMeans.png"
    plt.savefig(pic_path, dpi=300)

    plt.show()

    return best_labels
   
        

# Hijerarhijsko klasterovanje

In [18]:
def agglomerative(data, data_pca_tsne, file_name):
    """Pick the best cluster count for agglomerative clustering and plot it.

    ``AgglomerativeClustering`` is fitted on ``data`` for ``n_clusters`` in
    ``[2, 3, 4, 5]`` with the estimator's default settings; the fit with the
    highest silhouette score is kept, written to ``data_pca_tsne['label']``,
    plotted through ``grafik`` and saved to
    ``./<file_name>_Agglomerative.png``.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    data_pca_tsne : pandas.DataFrame
        2-D embedding with columns ``tsne_1`` and ``tsne_2``; gains or
        overwrites a ``label`` column (mutated in place).
    file_name : str
        Prefix of the saved picture.

    Returns
    -------
    The ``labels_`` array of the best-scoring model.

    Raises
    ------
    ValueError
        If no configuration yields a score above -1.
    """
    # search for the parameters of the best model
    best_n_clusters = 2
    best_silhouette_score = -1
    best_labels = None

    for n_clusters in [2, 3, 4, 5]:
        est = AgglomerativeClustering(n_clusters=n_clusters)
        est.fit(data)

        print("num of clusters: ", n_clusters)
        # The distance setting was never passed to the estimator; it is
        # printed only to keep the log format unchanged.
        print("affinity: ", 'euclidean')
        silhouette = silhouette_score(data, est.labels_)
        if silhouette > best_silhouette_score:
            best_n_clusters = n_clusters
            best_silhouette_score = silhouette
            # keep the scored labels instead of refitting afterwards
            best_labels = est.labels_
        print("silhouette: ", silhouette)
        print("best n_clusters: ", best_n_clusters)
        print("-------------------------------------")

    if best_labels is None:
        raise ValueError("no agglomerative configuration produced a usable silhouette score")

    # the best model has been found; use its labels
    data_pca_tsne['label'] = best_labels

    grafik(data, data_pca_tsne, best_n_clusters)
    plt.title('Agglomerative, silhouette score: %.2f' % best_silhouette_score, fontsize=10)
    # lay the figure out before saving
    plt.tight_layout()

    pic_path = "./" + file_name + "_Agglomerative.png"
    plt.savefig(pic_path, dpi=300)

    plt.show()

    return best_labels

# Spectral

In [19]:
def spectral(data, data_pca_tsne, file_name, random_state=None):
    """Grid-search spectral clustering by silhouette score and plot the winner.

    Every combination of ``n_clusters`` in ``[2, 3, 4, 5]`` and
    ``assign_labels`` in ``['kmeans', 'discretize']`` is fitted on ``data``
    and scored. The labels of the best-scoring fit are kept, so the result
    that is plotted and returned is the one that was actually scored rather
    than the outcome of a second fit.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    data_pca_tsne : pandas.DataFrame
        2-D embedding with columns ``tsne_1`` and ``tsne_2``; gains or
        overwrites a ``label`` column (mutated in place).
    file_name : str
        Prefix of the saved picture ``./<file_name>_Spectral.png``.
    random_state : int or None, default None
        Seed forwarded to every fit; pass an int for repeatable runs.

    Returns
    -------
    The ``labels_`` array of the best-scoring model.

    Raises
    ------
    ValueError
        If no configuration yields a score above -1.
    """
    # search for the parameters of the best model
    best_n_clusters = 0
    best_assign_labels = ""
    best_silhouette_score = -1
    best_labels = None

    for n_clusters in [2, 3, 4, 5]:
        for assign_labels in ['kmeans', 'discretize']:
            est = SpectralClustering(n_clusters=n_clusters,
                                     assign_labels=assign_labels,
                                     random_state=random_state)
            est.fit(data)

            print("num of clusters: ", n_clusters)
            print("assign labels: ", assign_labels)
            silhouette = silhouette_score(data, est.labels_)
            if silhouette > best_silhouette_score:
                best_n_clusters = n_clusters
                best_assign_labels = assign_labels
                best_silhouette_score = silhouette
                # remember the labels that earned this score
                best_labels = est.labels_
            print("silhouette: ", silhouette)
            print("best num of clusters: ", best_n_clusters)
            print("best assign labels: ", best_assign_labels)
            print("--------------------------------------")

    if best_labels is None:
        raise ValueError("no spectral configuration produced a usable silhouette score")

    # the best model has been found; use its labels
    data_pca_tsne['label'] = best_labels

    grafik(data, data_pca_tsne, best_n_clusters)
    plt.title('Spectral,silhouette score: %.2f' % best_silhouette_score, fontsize=10)
    # lay the figure out before saving
    plt.tight_layout()

    pic_path = "./" + file_name + "_Spectral.png"
    plt.savefig(pic_path, dpi=300)

    plt.show()

    return best_labels
   

# Birch

In [20]:
def birch(data, data_pca_tsne, file_name):
    """Grid-search Birch by silhouette score, plot and return the best labels.

    Every combination of ``n_clusters`` in ``[2, 3, 4, 5]`` and ``threshold``
    in ``[0.1, 0.3, 0.5]`` is fitted on ``data``. Fits whose labelling cannot
    be scored (a single label, or as many labels as samples) are skipped
    instead of aborting the whole search. The labels of the best-scoring fit
    are written to ``data_pca_tsne['label']``, plotted through ``grafik`` and
    saved to ``./<file_name>_Birch.png``.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    data_pca_tsne : pandas.DataFrame
        2-D embedding with columns ``tsne_1`` and ``tsne_2``; gains or
        overwrites a ``label`` column (mutated in place).
    file_name : str
        Prefix of the saved picture.

    Returns
    -------
    The ``labels_`` array of the best-scoring model.

    Raises
    ------
    ValueError
        If no configuration could be scored.
    """
    # search for the parameters of the best model
    best_n_clusters = 0
    best_threshold = 0
    best_silhouette_score = -1
    best_labels = None

    for n_clusters in [2, 3, 4, 5]:
        for threshold in [0.1, 0.3, 0.5]:
            est = Birch(n_clusters=n_clusters, threshold=threshold)
            est.fit(data)

            print("num of clusters: ", n_clusters)
            print("threshold: ", threshold)

            # silhouette_score raises unless 2 <= n_labels <= n_samples - 1,
            # and a fit may end up with fewer labels than requested.
            n_labels = len(np.unique(est.labels_))
            if not 1 < n_labels < len(est.labels_):
                print("silhouette: undefined for %d label(s), skipping" % n_labels)
                print("--------------------------------------")
                continue

            silhouette = silhouette_score(data, est.labels_)
            if silhouette > best_silhouette_score:
                best_n_clusters = n_clusters
                best_threshold = threshold
                best_silhouette_score = silhouette
                # keep the scored labels instead of refitting afterwards
                best_labels = est.labels_
            print("silhouette: ", silhouette)
            print("best num of clusters: ", best_n_clusters)
            print("best threshold: ", best_threshold)
            print("--------------------------------------")

    if best_labels is None:
        raise ValueError("no Birch configuration produced a usable silhouette score")

    # the best model has been found; use its labels
    data_pca_tsne['label'] = best_labels

    grafik(data, data_pca_tsne, best_n_clusters)
    # The title previously said "Spectral" (copied from the spectral plot).
    plt.title('Birch,  threshold: %.1f,  silhouette score: %.2f ' % (best_threshold, best_silhouette_score), fontsize=10)
    # lay the figure out before saving
    plt.tight_layout()

    pic_path = "./" + file_name + "_Birch.png"
    plt.savefig(pic_path, dpi=300)

    plt.show()

    return best_labels