In [61]:
# coding:utf-8 
import sys
import gensim
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
from sklearn.cluster import KMeans 
from gensim.parsing.preprocessing import strip_punctuation, remove_stopwords
TaggededDocument = gensim.models.doc2vec.TaggedDocument
from gensim.parsing.preprocessing import STOPWORDS
my_stop_words = STOPWORDS.union(set(['a','A', 'Survey']))

In [69]:
def get_dataset():
    titles = pd.read_csv('top100.csv')
    title_list = titles[['ACMid', 'paper_name']]
    clean_title_list = []
    for i in title_list['paper_name']:
        tmp = strip_punctuation(i)
        tmp = remove_stopwords(tmp)
        # tmp = tmp.split(' ')
        clean_title_list.append(tmp)
    x_train = []
    for i, text in enumerate(clean_title_list):
        word_list = text.split(' ')
        l = len(word_list)
        word_list[l-1] = word_list[l-1].strip()
        document = TaggededDocument(word_list, tags=[i])
        x_train.append(document)
    return x_train
 
def train(x_train, size=200, epoch_num=1):
    model_dm = Doc2Vec(x_train,min_count=1, window = 3, size = size, sample=1e-3, negative=5, workers=4)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100)
    model_dm.save('model_dm')
    return model_dm
 
def cluster(x_train):
    infered_vectors_list = []
    print("load doc2vec model...")
    model_dm = Doc2Vec.load("model_dm")
    print("load train vectors...")
    i = 0
    for text, label in x_train:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)
        i += 1
    
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.cluster import SpectralClustering
    sim_matrix = cosine_similarity(infered_vectors_list)
    # labels = SpectralClustering(n_clusters=5).fit_predict(sim_matrix)
    
    from sklearn import metrics
    # print("Calinski-Harabasz Score", metrics.calinski_harabaz_score(X, y_pred)) 
    for index, gamma in enumerate((0.01,0.1,1, 10)):
        for index, k in enumerate((6,7,8,9,10)):
            y_pred = SpectralClustering(n_clusters=k, gamma=gamma).fit_predict(sim_matrix)
            print("Calinski-Harabasz Score with gamma=", gamma, "n_clusters=", k,"score:", metrics.calinski_harabaz_score(sim_matrix, y_pred))
    '''
    print("train kmean model...")
    kmean_model = KMeans(n_clusters=5)
    kmean_model.fit(infered_vectors_list)
    labels= kmean_model.predict(infered_vectors_list[0:100])
    cluster_centers = kmean_model.cluster_centers_

    with open("own_classify.txt", 'w') as wf:
        for i in range(100):
            string = ""
            text = x_train[i][0]
            for word in text:
                string = string + word + ' '
            string = string + '\t'
            string = string + str(labels[i])
            string = string + '\n'
            wf.write(string)
    return cluster_centers
    '''

if __name__ == '__main__':
    x_train = get_dataset()
    model_dm = train(x_train)
    cluster(x_train)



load doc2vec model...
load train vectors...




Calinski-Harabasz Score with gamma= 0.01 n_clusters= 6 score: 45.37566308616295
Calinski-Harabasz Score with gamma= 0.01 n_clusters= 7 score: 49.94188474627233
Calinski-Harabasz Score with gamma= 0.01 n_clusters= 8 score: 43.554411439009826
Calinski-Harabasz Score with gamma= 0.01 n_clusters= 9 score: 31.34981890180145
Calinski-Harabasz Score with gamma= 0.01 n_clusters= 10 score: 20.116166562554525




Calinski-Harabasz Score with gamma= 0.1 n_clusters= 6 score: 44.66383645959434
Calinski-Harabasz Score with gamma= 0.1 n_clusters= 7 score: 41.20214926617884
Calinski-Harabasz Score with gamma= 0.1 n_clusters= 8 score: 42.665980785277526
Calinski-Harabasz Score with gamma= 0.1 n_clusters= 9 score: 37.999925053089754




Calinski-Harabasz Score with gamma= 0.1 n_clusters= 10 score: 34.66628763243217
Calinski-Harabasz Score with gamma= 1 n_clusters= 6 score: 44.791201850864326
Calinski-Harabasz Score with gamma= 1 n_clusters= 7 score: 43.228996101755165
Calinski-Harabasz Score with gamma= 1 n_clusters= 8 score: 41.042636351104285
Calinski-Harabasz Score with gamma= 1 n_clusters= 9 score: 38.32792453561996




Calinski-Harabasz Score with gamma= 1 n_clusters= 10 score: 26.94279771740298
Calinski-Harabasz Score with gamma= 10 n_clusters= 6 score: 44.791201850864326
Calinski-Harabasz Score with gamma= 10 n_clusters= 7 score: 41.71185464118535
Calinski-Harabasz Score with gamma= 10 n_clusters= 8 score: 45.201575526937965
Calinski-Harabasz Score with gamma= 10 n_clusters= 9 score: 37.806466736450474
Calinski-Harabasz Score with gamma= 10 n_clusters= 10 score: 31.428412405746183




In [58]:
cluster_centers

array([4, 1, 1, 3, 3, 3, 1, 2, 1, 1, 3, 1, 3, 1, 4, 1, 1, 3, 3, 1, 3, 3,
       3, 2, 1, 1, 1, 1, 1, 1, 1, 4, 4, 1, 1, 4, 3, 4, 1, 1, 0, 1, 1, 1,
       1, 1, 2, 0, 0, 1, 1, 1, 1, 4, 1, 2, 1, 1, 1, 1, 1, 2, 1, 3, 3, 1,
       3, 2, 1, 1, 1, 1, 2, 3, 1, 1, 2, 0, 1, 1, 1, 4, 3, 4, 1, 1, 1, 4,
       1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [47]:
sim_matrix

array([[[[ 1.,  1.,  1., ...,  1., -1., -1.],
         [ 1.,  1.,  1., ...,  1., -1., -1.],
         [ 1.,  1.,  1., ...,  1., -1., -1.],
         ...,
         [ 1.,  1.,  1., ...,  1., -1., -1.],
         [-1., -1., -1., ..., -1.,  1.,  1.],
         [-1., -1., -1., ..., -1.,  1.,  1.]],

        [[ 1.,  1.,  1., ..., -1., -1., -1.],
         [ 1.,  1.,  1., ..., -1., -1., -1.],
         [ 1.,  1.,  1., ..., -1., -1., -1.],
         ...,
         [ 1.,  1.,  1., ..., -1., -1., -1.],
         [-1., -1., -1., ...,  1.,  1.,  1.],
         [-1., -1., -1., ...,  1.,  1.,  1.]],

        [[ 1.,  1.,  1., ..., -1., -1., -1.],
         [ 1.,  1.,  1., ..., -1., -1., -1.],
         [ 1.,  1.,  1., ..., -1., -1., -1.],
         ...,
         [ 1.,  1.,  1., ..., -1., -1., -1.],
         [-1., -1., -1., ...,  1.,  1.,  1.],
         [-1., -1., -1., ...,  1.,  1.,  1.]],

        ...,

        [[ 1.,  1.,  1., ..., -1., -1., -1.],
         [ 1.,  1.,  1., ..., -1., -1., -1.],
         [ 1.,  1.

In [63]:
clustered_result = pd.read_csv('own_classify.txt', sep='\t', header=None)
clustered_result.sort_values(1,inplace=True)
clustered_result.to_csv('title_clusters.csv', index=False)

In [35]:
clustered_result

Unnamed: 0,0,1
76,The Feet Human Computer Interaction A Survey F...,0
24,Is Multimedia Multisensorial A Review Mulsemed...,0
4,Trust Evaluation Cross Cloud Federation Survey...,0
73,A Survey Hypervisor Based Monitoring Approache...,0
46,Geomagnetism Smartphone Based Indoor Localizat...,0
...,...,...
55,A Survey Wireless Indoor Localization Device P...,9
66,droid Assessment Evaluation Android Applicatio...,9
58,Performance Security Improvements Tor A Survey,9
13,Recent Advancements Event Processing,9
