<a href="https://colab.research.google.com/github/sahandv/science_science/blob/master/clustering_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Start from a clean checkout: remove any earlier clone before cloning again.
!rm -rf 'science_science'
# GitHub account that owns the repository; `#@param` marks it as a Colab form field.
username = "sahandv"#@param {type:"string"}
# password = ""#@param {type:"string"} 
# `$username` is substituted with the value of the Python variable set above.
!git clone https://github.com/$username/science_science.git
# List the working directory so the output shows whether the clone is present.
!ls

Cloning into 'science_science'...
remote: Enumerating objects: 1002, done.[K
remote: Total 1002 (delta 0), reused 0 (delta 0), pack-reused 1002[K
Receiving objects: 100% (1002/1002), 102.02 MiB | 36.91 MiB/s, done.
Resolving deltas: 100% (595/595), done.
sample_data  science_science


In [2]:
# Root folder of the data. The commented-out line is the path for a local run;
# the active line is the path used on Colab.
# datapath = '/mnt/6016589416586D52/Users/z5204044/GoogleDrive/GoogleDrive/Data/' # Local
# NOTE(review): the remote path is relative, so it presumably relies on the
# working directory being /content (where Drive is mounted below) — confirm.
datapath = 'drive/My Drive/Data/' # Remote
# Mount Google Drive at /content/drive/ so the data files can be read.
from google.colab import drive
drive.mount('/content/drive/')
# Install the packages listed by the cloned repository, then tensorflow and keras.
# No versions are pinned for tensorflow/keras, so whatever pip resolves is used.
!pip install -r 'science_science/requirements.txt'
!pip install tensorflow
!pip install keras

Mounted at /content/drive/
Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 11.0MB/s 
Collecting netgraph
[?25l  Downloading https://files.pythonhosted.org/packages/86/ed/1e163a923cc58feab143656f2eefd69e5a1d2e323423f62c08b5100a4cbe/netgraph-3.1.8.tar.gz (40kB)
[K     |████████████████████████████████| 40kB 7.0MB/s 
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/66/89/479de0afbbfb98d1c4b887936808764627300208bb771fcd823403645a36/funcy-1.15-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis, netgraph
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=22265005e9936c4de76e362c8445fdbbc73a9a3f85c51774f433d10162ac469c
  Stored in directory: /root/.cache/pip/wheels/98/71/24

In [3]:
import sys
import time
import gc
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from random import randint

from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn import metrics
from sklearn.metrics.cluster import silhouette_score,homogeneity_score,adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score,adjusted_mutual_info_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfTransformer , TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

from science_science.sciosci.assets import text_assets as ta
from science_science.DEC.DEC_keras import DEC_simple_run

# =============================================================================
# Read the embeddings and labels, then prepare both clustering tasks
# =============================================================================
data_address =  datapath+"Corpus/KPRIS/embeddings/deflemm/Doc2Vec patent_wos corpus"
label_address =  datapath+"Corpus/KPRIS/labels"

vectors = pd.read_csv(data_address)
# The label file is read as a single column named 'label' (no header row).
labels = pd.read_csv(label_address,names=['label'])

# Full task: all rows, with the text labels encoded as integer codes.
labels_f = pd.factorize(labels['label'])
X, Y = vectors.values, labels_f[0]
n_clusters = 5

# Task 1: only the rows labelled 'car' or 'memory'.
task_1_mask = labels['label'].isin(['car','memory'])
labels_task_1 = labels[task_1_mask]
# The surviving row labels are used as positions; the two coincide because
# read_csv gave `labels` a default 0..n-1 index.
vectors_task_1 = vectors.iloc[labels_task_1.index]
labels_task_1_f = pd.factorize(labels_task_1['label'])
X_task_1, Y_task_1 = vectors_task_1.values, labels_task_1_f[0]
n_clusters_task_1 = 2

# Empty score table; the benchmark sections add one row per run.
results = pd.DataFrame([],columns=['Method','parameter','Silhouette','Homogeneity','NMI','AMI','ARI'])
# =============================================================================
# Evaluation method
# =============================================================================
def evaluate(X,Y,predicted_labels):
    """Score one clustering result against the ground-truth labels.

    Parameters
    ----------
    X : array-like
        Feature matrix the clustering was computed on; only used for the
        silhouette score (euclidean metric).
    Y : array-like
        Ground-truth label of every row of ``X``.
    predicted_labels : array-like
        Cluster assignment of every row of ``X``.

    Returns
    -------
    list
        ``[Silhouette, Homogeneity, NMI, AMI, ARI]``. All five entries are 0
        when fewer than two distinct clusters were predicted.
    """
    n_samples = len(predicted_labels)
    n_found = len(np.unique(predicted_labels))

    # A single cluster (or no samples at all) carries no information to score.
    if n_found < 2:
        return [0,0,0,0,0]

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise; report 0 when every sample is its own cluster
    # so the label-based scores below are still returned.
    if n_found < n_samples:
        silhouette = silhouette_score(X, predicted_labels, metric='euclidean')
    else:
        silhouette = 0

    return [silhouette,
            homogeneity_score(Y, predicted_labels),
            normalized_mutual_info_score(Y, predicted_labels),
            adjusted_mutual_info_score(Y, predicted_labels),
            adjusted_rand_score(Y, predicted_labels)]

In [None]:

# =============================================================================
# Bookkeeping shared by every benchmark section in this cell
# =============================================================================
def append_row(results, row):
    """Return a new DataFrame made of ``results`` plus ``row`` as its last row.

    ``row`` is a list holding one value per column of ``results``.
    """
    row_df = pd.DataFrame([row], columns=results.columns)
    # With no rows yet there is nothing to concatenate with; returning the new
    # row directly keeps the dtypes pandas inferred for it.
    if results.empty:
        return row_df
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([results, row_df], ignore_index=True)


def summarize(results):
    """Print and return the mean of the score columns and the max of every column.

    Both statistics cover all rows currently in ``results``, i.e. every method
    benchmarked so far, not only the section that was just run.
    """
    # 'Method' and 'parameter' hold text, so only the score columns are averaged.
    scores = results.drop(columns=['Method','parameter']).astype(float)
    mean = scores.mean(axis=0)
    maxx = results.max(axis=0)
    print(mean)
    print(maxx)
    return mean, maxx

# =============================================================================
# K-means, first with init='random', then with init='k-means++'
# =============================================================================
for method, init in [('k-means random','random'),('k-means++','k-means++')]:
    print('\n- '+method+' -----------------------')
    for fold in tqdm(range(20)):
        # A fresh seed per run; it is stored in the 'parameter' column so the
        # run can be repeated.
        seed = randint(0,10**5)
        model = KMeans(n_clusters=n_clusters,n_init=20,init=init, random_state=seed).fit(X)
        predicted_labels = model.labels_
        results = append_row(results,[method,'seed '+str(seed)]+evaluate(X,Y,predicted_labels))
    mean, maxx = summarize(results)
# =============================================================================
# Agglomerative
# =============================================================================
print('\n- Agglomerative -----------------------')
# NOTE(review): nothing varies between these four runs, so they presumably
# produce four identical rows — confirm whether one run is enough.
for fold in tqdm(range(4)):
    model = AgglomerativeClustering(n_clusters=n_clusters,linkage='ward').fit(X)
    predicted_labels = model.labels_
    results = append_row(results,['Agglomerative','ward']+evaluate(X,Y,predicted_labels))
mean, maxx = summarize(results)
# =============================================================================
# DBSCAN
# =============================================================================
eps=0.000001
print('\n- DBSCAN -----------------------')
for fold in tqdm(range(19)):
    # Sweep eps upwards in steps of 0.05, starting just above 0.05.
    eps = eps+0.05
    model = DBSCAN(eps=eps, min_samples=10,n_jobs=15).fit(X)
    predicted_labels = model.labels_
    results = append_row(results,['DBSCAN','eps '+str(eps)]+evaluate(X,Y,predicted_labels))
mean, maxx = summarize(results)
# =============================================================================
# Deep: first without, then with min_max_scaling
# =============================================================================
# Layer sizes handed to DEC_simple_run as `architecture`; both passes use the same list.
archs = [[500, 500, 2000, 10],[500, 1000, 2000, 10],[500, 1000, 1000, 10],
         [500, 500, 2000, 100],[500, 1000, 2000, 100],[500, 1000, 1000, 100],
         [100, 300, 600, 10],[300, 500, 2000, 10],[700, 1000, 2000, 10],
         [200, 500, 10],[500, 1000, 10],[1000, 2000, 10],
         [200, 500, 100],[500, 1000, 100],[1000, 2000, 100],
         [1000, 500, 10],[500, 200, 10],[200, 100, 10],
         [1000, 1000, 2000, 10],[1000, 1500, 2000, 10],[1000, 1500, 1000, 10],
         [1000, 1000, 2000,500, 10],[1000, 1500, 2000,500, 10],[1000, 1500, 1000, 500, 10],
         [500, 500, 2000, 500, 10],[500, 1000, 2000, 500, 10],[500, 1000, 1000, 500, 10]]
for method, use_minmax in [('DEC',False),('DEC minmax scaler',True)]:
    print('\n- DEC -----------------------')
    for arch in tqdm(archs):
        predicted_labels = DEC_simple_run(X,minmax_scale_custom_data=use_minmax,n_clusters=n_clusters,architecture=arch,pretrain_epochs=300)
        results = append_row(results,[method,str(arch)]+evaluate(X,Y,predicted_labels))
    mean, maxx = summarize(results)


In [None]:
# =============================================================================
# Write the collected scores to disk
# =============================================================================
# The output file name is the embeddings file name plus a fixed suffix.
output_address = data_address+' clustering results _ new'
results_df = pd.DataFrame(results)
results_df.to_csv(output_address,index=False)

Mini test: DEC without min-max scaling on a short list of architectures

In [4]:
# =============================================================================
# Deep no min_max_scaling
# =============================================================================
# Layer sizes handed to DEC_simple_run as `architecture`.
# NOTE(review): [200, 500, 200, 10] is listed twice — confirm whether the
# repeated run is intentional.
archs = [[500, 1000,10],[500, 1500,10],[500, 500,10],[500, 1000,100],
         [200, 500, 500,10],[200, 500, 1000,10],[200, 500, 200,10],
         [100, 200,10],[200, 200,10],[200, 500, 200,10],]
print('\n- DEC -----------------------')
for arch in tqdm(archs):
    predicted_labels = DEC_simple_run(X,minmax_scale_custom_data=False,n_clusters=n_clusters,architecture=arch,pretrain_epochs=300)
    row_df = pd.DataFrame([['DEC',str(arch)]+evaluate(X,Y,predicted_labels)], columns=results.columns)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The first row is taken as-is so an empty table never enters the concat.
    results = row_df if results.empty else pd.concat([results, row_df], ignore_index=True)
# 'Method' and 'parameter' hold text, so only the score columns are averaged.
mean = results.drop(columns=['Method','parameter']).astype(float).mean(axis=0)
maxx = results.max(axis=0)
print(mean)
print(maxx)
# =============================================================================
# Save to disk
# =============================================================================
results_df = pd.DataFrame(results)
results_df.to_csv(data_address+' clustering results _ new',index=False)

  0%|          | 0/10 [00:00<?, ?it/s]


- DEC -----------------------
Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 500)               50500     
_________________________________________________________________
encoder_1 (Dense)            (None, 1000)              501000    
_________________________________________________________________
encoder_2 (Dense)            (None, 10)                10010     
_________________________________________________________________
decoder_2 (Dense)            (None, 1000)              11000     
_________________________________________________________________
decoder_1 (Dense)            (None, 500)               500500    
_________________________________________________________________
decoder_0 (Dense)            (Non

 10%|█         | 1/10 [02:54<26:09, 174.44s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 500)               50500     
_________________________________________________________________
encoder_1 (Dense)            (None, 1500)              751500    
_________________________________________________________________
encoder_2 (Dense)            (None, 10)                15010     
_________________________________________________________________
decoder_2 (Dense)            (None, 1500)              16500     
_________________________________________________________________
decoder_1 (Dense)            (None, 500)               750500    
_________________________________________________________________
decoder_0 (Dense)            (None, 100)               50100    

 20%|██        | 2/10 [05:35<22:42, 170.33s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 500)               50500     
_________________________________________________________________
encoder_1 (Dense)            (None, 500)               250500    
_________________________________________________________________
encoder_2 (Dense)            (None, 10)                5010      
_________________________________________________________________
decoder_2 (Dense)            (None, 500)               5500      
_________________________________________________________________
decoder_1 (Dense)            (None, 500)               250500    
_________________________________________________________________
decoder_0 (Dense)            (None, 100)               50100    

 30%|███       | 3/10 [08:12<19:24, 166.32s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 500)               50500     
_________________________________________________________________
encoder_1 (Dense)            (None, 1000)              501000    
_________________________________________________________________
encoder_2 (Dense)            (None, 100)               100100    
_________________________________________________________________
decoder_2 (Dense)            (None, 1000)              101000    
_________________________________________________________________
decoder_1 (Dense)            (None, 500)               500500    
_________________________________________________________________
decoder_0 (Dense)            (None, 100)               50100    

 40%|████      | 4/10 [11:29<17:33, 175.65s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 200)               20200     
_________________________________________________________________
encoder_1 (Dense)            (None, 500)               100500    
_________________________________________________________________
encoder_2 (Dense)            (None, 500)               250500    
_________________________________________________________________
encoder_3 (Dense)            (None, 10)                5010      
_________________________________________________________________
decoder_3 (Dense)            (None, 500)               5500      
_________________________________________________________________
decoder_2 (Dense)            (None, 500)               250500   

 50%|█████     | 5/10 [13:30<13:16, 159.34s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 200)               20200     
_________________________________________________________________
encoder_1 (Dense)            (None, 500)               100500    
_________________________________________________________________
encoder_2 (Dense)            (None, 1000)              501000    
_________________________________________________________________
encoder_3 (Dense)            (None, 10)                10010     
_________________________________________________________________
decoder_3 (Dense)            (None, 1000)              11000     
_________________________________________________________________
decoder_2 (Dense)            (None, 500)               500500   

 60%|██████    | 6/10 [15:38<09:58, 149.71s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 200)               20200     
_________________________________________________________________
encoder_1 (Dense)            (None, 500)               100500    
_________________________________________________________________
encoder_2 (Dense)            (None, 200)               100200    
_________________________________________________________________
encoder_3 (Dense)            (None, 10)                2010      
_________________________________________________________________
decoder_3 (Dense)            (None, 200)               2200      
_________________________________________________________________
decoder_2 (Dense)            (None, 500)               100500   

 70%|███████   | 7/10 [17:46<07:10, 143.45s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 100)               10100     
_________________________________________________________________
encoder_1 (Dense)            (None, 200)               20200     
_________________________________________________________________
encoder_2 (Dense)            (None, 10)                2010      
_________________________________________________________________
decoder_2 (Dense)            (None, 200)               2200      
_________________________________________________________________
decoder_1 (Dense)            (None, 100)               20100     
_________________________________________________________________
decoder_0 (Dense)            (None, 100)               10100    

 80%|████████  | 8/10 [19:57<04:39, 139.67s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 200)               20200     
_________________________________________________________________
encoder_1 (Dense)            (None, 200)               40200     
_________________________________________________________________
encoder_2 (Dense)            (None, 10)                2010      
_________________________________________________________________
decoder_2 (Dense)            (None, 200)               2200      
_________________________________________________________________
decoder_1 (Dense)            (None, 200)               40200     
_________________________________________________________________
decoder_0 (Dense)            (None, 100)               20100    

 90%|█████████ | 9/10 [22:12<02:18, 138.12s/it]

Model: "AE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 100)]             0         
_________________________________________________________________
encoder_0 (Dense)            (None, 200)               20200     
_________________________________________________________________
encoder_1 (Dense)            (None, 500)               100500    
_________________________________________________________________
encoder_2 (Dense)            (None, 200)               100200    
_________________________________________________________________
encoder_3 (Dense)            (None, 10)                2010      
_________________________________________________________________
decoder_3 (Dense)            (None, 200)               2200      
_________________________________________________________________
decoder_2 (Dense)            (None, 500)               100500   

100%|██████████| 10/10 [24:16<00:00, 145.68s/it]

Silhouette     0.021759
Homogeneity    0.641390
NMI            0.636840
AMI            0.636746
ARI            0.645999
dtype: float64
Method                    DEC
parameter      [500, 500, 10]
Silhouette          0.0283322
Homogeneity          0.705695
NMI                  0.701949
AMI                  0.701871
ARI                  0.719629
dtype: object



