In [None]:
import logging
import re
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
import hdbscan
import sklearn
from scipy.spatial.distance import cdist
from sklearn.preprocessing import normalize
import io
import numpy as np
import time

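References on HDBSCAN / DBSCAN clustering:
- HDBSCAN library documentation: https://hdbscan.readthedocs.io/en/latest/
- "Lightning Talk: Clustering with HDBSCAN" (Towards Data Science): https://towardsdatascience.com/lightning-talk-clustering-with-hdbscan-d47b83d1b03a
- Interactive visualization of DBSCAN clustering (Naftali Harris): https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/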

In [None]:
# Configure the root logger: INFO level, records formatted as
# "<timestamp> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

def preprocess(str):
    """Normalise one piece of raw text before tokenisation.

    Steps, in order:
      1. Remove every http/https URL, wherever it occurs in the text.
      2. Split common English contractions so the clitic becomes its own
         token ("it's" -> "it 's", "don't" -> "do n't").
      3. Pad ',', '!', '(', ')' and '?' with a space on both sides.
      4. Collapse every whitespace run (including CR/LF) into a single space
         and trim both ends, so words on adjacent lines are never glued
         together.
      5. Lower-case the result.

    Args:
        str: The raw text. The name shadows the built-in ``str``; it is kept
            only so the function signature stays unchanged for callers.

    Returns:
        The cleaned, lower-cased text ("" for empty or URL-only input).
    """
    # Work on a differently named local so the built-in is not rebound further.
    text = str

    # URLs are removed first, before punctuation padding can break them apart.
    # Replacing with a space keeps the neighbouring words separated.
    text = re.sub(r"https?://\S+", " ", text)

    # (pattern, replacement) pairs applied in this order.
    contraction_rules = (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
    )
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # Surround the selected punctuation marks with spaces.
    text = re.sub(r"([,!()?])", r" \1 ", text)

    # Any whitespace run, line breaks included, becomes exactly one space.
    text = re.sub(r"\s+", " ", text).strip()

    return text.lower()

In [None]:
class Documents(object):
    """Iterable wrapper that presents token lists as ``TaggedDocument`` items.

    Every document is tagged with its zero-based position in ``documents``.
    Each call to ``__iter__`` starts a new pass over ``self.documents``.
    """

    def __init__(self, documents):
        # Stored as given; nothing is copied or validated here.
        self.documents = documents

    def __iter__(self):
        position = 0
        for tokens in self.documents:
            yield TaggedDocument(words=tokens, tags=[position])
            position += 1
# Read the whole corpus into memory as plain text, one raw document per line.
file = r"some_file.csv"
with open(file, "r", encoding="utf-8") as corpus:
    # The context manager closes the handle as soon as the content is read.
    # Splitting on "\n" keeps a trailing empty entry when the file ends with
    # a newline, so `lines` may contain one more item than there are documents.
    lines = corpus.read().split("\n")
count = len(lines)
preprocessed = []

In [None]:
# Tokenise every raw line.
# `preprocessed` is rebuilt from scratch here, so re-running this cell cannot
# append the corpus a second time to a list left over from an earlier run.
duplicate_dict = {}  # left over from a disabled de-duplication step; not used
preprocessed = []
for raw_line in lines:
    cleaned = preprocess(raw_line)
    # Every character that is neither alphanumeric nor whitespace becomes a
    # space; the result is then split on whitespace into tokens.
    tokens = ''.join(ch if ch.isalnum() or ch.isspace() else " " for ch in cleaned).split()
    preprocessed.append(tokens)

documents = Documents(preprocessed)

In [None]:
# Train a Doc2Vec model on the tagged documents and persist it.
# `vector_size` / `epochs` replace the legacy `size` / `iter` keyword names,
# which gensim 4 no longer accepts.
# NOTE(review): per the gensim documentation, a fixed `seed` only gives fully
# reproducible vectors with workers=1 - confirm whether that matters here.
model = Doc2Vec(
    vector_size=200,
    dbow_words=1,
    dm=0,
    window=12,
    seed=17,
    min_count=1,
    workers=4,
    epochs=1000,
)

model.build_vocab(documents)

model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
# NOTE(review): the file is named "pvdm" although dm=0 is passed above; the
# name is kept because the reload cell reads the same path.
model.save(r'pvdm.model')

In [None]:
# Reload the saved model from disk; `model` is rebound to the loaded instance.
fname = r"pvdm.model"
model = Doc2Vec.load(fname)

In [None]:
# Display the shape of the document-vector matrix.
# gensim 4 exposes the matrix as `model.dv.vectors`; older releases use the
# legacy `model.docvecs.doctag_syn0`, which is kept as the fallback.
(model.dv.vectors if hasattr(model, "dv") else model.docvecs.doctag_syn0).shape

In [None]:
# Scale every document vector to unit L2 length before clustering.
# The matrix lives at `model.dv.vectors` in gensim 4; older releases use the
# legacy `model.docvecs.doctag_syn0`, which is kept as the fallback.
doc_vectors = model.dv.vectors if hasattr(model, "dv") else model.docvecs.doctag_syn0
norm_data = normalize(doc_vectors, norm='l2')

In [None]:
# First-level clustering: HDBSCAN on the normalised document vectors.
# The labels are written to NEW_labels_1.csv and the wall-clock duration of
# fit + save is printed at the end.
start_time = time.time()

level1_params = dict(
    min_cluster_size=8,
    min_samples=2,
    cluster_selection_method='leaf',
    core_dist_n_jobs=4,
)
clusterer1 = hdbscan.HDBSCAN(**level1_params)
db = clusterer1.fit(norm_data)

np.savetxt(r"NEW_labels_1.csv", clusterer1.labels_, delimiter=",")

elapsed_seconds = time.time() - start_time
print(elapsed_seconds)

In [None]:
# Attach the first-level cluster label and membership probability to each row.
#
# NOTE(review): `df1` is not created anywhere in the visible notebook, so a
# fresh "Restart & Run All" would stop here with a NameError. As a fallback it
# is built from the raw lines (one row per document, in document order). The
# column name "Text" is a placeholder - replace this with the original loading
# step if it is known. An existing `df1` is used unchanged.
if "df1" not in globals():
    df1 = pd.DataFrame({"Text": lines})

label = list(clusterer1.labels_)
probab = list(clusterer1.probabilities_)
# Building the Series on df1.index requires exactly one label per row of df1.
df1['ClusterID'] = pd.Series(label, index=df1.index)
df1['Probab'] = pd.Series(probab, index=df1.index)

In [None]:
# Save the labelled documents.
# Written as UTF-8 to match the encoding the corpus was read with;
# ISO-8859-1 cannot represent characters outside Latin-1 and raises
# UnicodeEncodeError on them.
df1.to_csv(r"Clustered_File.csv", encoding="utf-8")

In [None]:
# Number of rows per first-level ClusterID, as a one-column frame named 'Freq'
# that is indexed by the cluster label.
freq = df1['ClusterID'].value_counts().to_frame(name='Freq')


In [None]:
# Write the per-cluster frequency table to Frequency_File.csv (UTF-8, comma-separated).
freq.to_csv(r"Frequency_File.csv", encoding='utf-8', sep=",")

In [None]:
# Wrap the normalised vector matrix in a DataFrame so it can be joined to df1.
norm = pd.DataFrame(norm_data)

In [None]:
# Place the vector columns next to the labelled rows. Column-wise concat
# aligns on the index; a row present in only one frame gets NaN in the other's columns.
df4 = pd.concat([df1,norm], axis=1)

In [None]:
# Reduce df4 to one representative row per first-level cluster.
# NOTE: this cell rebinds df4 and is not idempotent - re-running it without
# first re-running the concat cell fails because 'Probab' is already gone.

# Drop the last row (presumably the empty entry produced by a trailing
# newline in the corpus file - TODO confirm).
df4 = df4.drop(df4.index[-1])
# Keep, per ClusterID, the first non-null value of every column.
df4 = df4.groupby('ClusterID').first().reset_index()
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
df4 = df4.drop(columns='Probab')

In [None]:
# Save the per-cluster representative rows to Level2.csv (UTF-8, comma-separated).
df4.to_csv(r"Level2.csv",encoding='utf-8', sep=",")

In [None]:
# Input matrix for the second clustering level.
# `.ix` was removed in pandas 1.0; `.iloc` selects the same cells by position.
#   rows 1:       skip the first row (presumably the noise cluster -1, which
#                 sorts first after the groupby - TODO confirm noise exists)
#   columns 2:203 skip the two leading non-vector columns; assumes df1
#                 contributes exactly one column besides ClusterID - TODO confirm
df5 = np.asarray(df4.iloc[1:, 2:203])

In [None]:
# Second-level clustering: HDBSCAN on the per-cluster representative vectors.
# The timer is not restarted here, so a later `time.time() - start_time`
# still measures from the point where `start_time` was last assigned.
level2_params = dict(
    min_cluster_size=2,
    min_samples=None,
    cluster_selection_method='eom',
    core_dist_n_jobs=4,
)
clusterer2 = hdbscan.HDBSCAN(**level2_params)
db2 = clusterer2.fit(df5)

In [None]:
# First two columns of the representative rows, skipping the first row.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent here.
# `.copy()` makes df_new an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
df_new = df4.iloc[1:, 0:2].copy()

In [None]:
# Attach the second-level labels and probabilities, then save both the
# labelled rows and the per-cluster frequency table.
label_2 = list(clusterer2.labels_)
probab_2 = list(clusterer2.probabilities_)
df_new['ClusterID_2'] = label_2
df_new['Probab'] = probab_2
# Written as UTF-8 like the other CSV outputs; ISO-8859-1 cannot represent
# characters outside Latin-1 and raises UnicodeEncodeError on them.
df_new.to_csv(r"Level2_Clustered_File.csv", encoding="utf-8")

# Number of rows per second-level cluster, as a one-column 'Freq' frame.
freq2 = df_new['ClusterID_2'].value_counts()
freq2 = freq2.to_frame()
freq2.columns = ['Freq']
freq2.to_csv(r"Level2_Frequency_File.csv", encoding='utf-8', sep=",")

# Seconds elapsed since `start_time` was last assigned.
print(time.time() - start_time)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit a 2-cluster KMeans with a fixed seed on the level-2 input matrix.
# `kmeans` is reassigned by later cells.
kmeans = KMeans(n_clusters=2, random_state=0).fit(df5)

In [None]:
import matplotlib
from sklearn.metrics import silhouette_score

In [None]:
%matplotlib inline

In [None]:
# Sweep the number of KMeans clusters and print the silhouette coefficient for
# each value, to help pick a cluster count.
# silhouette_score needs 2 <= number of labels <= n_samples - 1, so the upper
# end of the sweep is capped by the number of rows in df5.
max_clusters = min(50, len(df5))
for n_cluster in range(2, max_clusters):
    # A fixed random_state makes the sweep reproducible between runs.
    kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(df5)
    label = kmeans.labels_
    sil_coeff = silhouette_score(df5, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

In [None]:
# Final KMeans fit with 10 clusters on the level-2 input matrix.
# random_state is fixed, as in the 2-cluster fit, so the saved labels are
# reproducible between runs.
kmeans = KMeans(n_clusters=10, max_iter=1000, random_state=0).fit(df5)

In [None]:
# Save the KMeans labels, then show them. A notebook displays only the last
# expression of a cell, so the bare `kmeans.labels_` has to come last.
np.savetxt(r"Kmeans.csv", kmeans.labels_, delimiter=",")
kmeans.labels_