# Baseline for the Competing Clusterers Scenario

This notebook produces a spreadsheet, InternetResearchAgency_tweets.clustered.csv.  It takes an existing CSV of tweets and adds columns for 12 clustering algorithms from scikit-learn and NLTK.  For each algorithm, one column is added that holds that algorithm's cluster assignment, and another shows the closeness of the tweet to the centroid of its cluster.  This similarity is measured in the vector space that is made by the gensim neural-network doc2vec algorithm.  Sorting the spreadsheet on cluster, then on similarity to the centroid, gives an impression of the quality of each cluster, the tweets closest to the centroid being the most representative of the cluster.


The notebook is useful for a baseline for the competing clusterers scenario of the Singularity Net Simulation.  The user can run the data through this notebook to see how the clusterers might normally perform, so as to have a context by which to judge the quality of the solutions that the simulation agents have created.  

In [3]:
#parameters
# input_root: path of the input CSV without its '.csv' extension.
# output_root: name fragment used to build the output CSV, pickle-cache and model file names.
input_root = 'data/tweets'
output_root = 'InternetResearchAgency'

In [4]:
import numpy as np
import pandas as pd
# Both file names are derived from the values set in the parameter cell.
raw_data_path = '{}.csv'.format(input_root)
raw_data_path_clustered = '{}_{}.clustered.csv'.format(input_root, output_root)

# The file is decoded as ISO-8859-1; a UTF-8 read was tried earlier and left disabled.
# NOTE(review): the mojibake visible in the previewed tweets suggests the file may
# really be UTF-8 with a few bad bytes -- confirm before changing the encoding.
raw_data = pd.read_csv(raw_data_path, encoding="ISO-8859-1")
print('Number of instances: {}'.format(len(raw_data)))

Number of instances: 203451


In [12]:
# Preview the first rows to check that the CSV parsed as expected.
raw_data.head()


Unnamed: 0,user_id,user_key,created_at,created_str,retweet_count,retweeted,favorite_count,text,tweet_id,source,hashtags,expanded_urls,posted,mentions,retweeted_status_id,in_reply_to_status_id
0,2532612000.0,kathiemrr,1488207000000.0,2017-02-27 14:54:00,,,,#ThingsDoneByMistake kissing auntie in the lips,8.362279e+17,,"[""ThingsDoneByMistake""]",[],POSTED,[],,
1,2531160000.0,traceyhappymom,1471273000000.0,2016-08-15 14:50:20,,,,RT @mc_derpin: #TheOlderWeGet the more pessimistic we are https://t.co/zS3jHZJl8P,7.651989e+17,,"[""TheOlderWeGet""]",[],POSTED,[],,
2,,evewebster373,1435701000000.0,2015-06-30 21:56:09,,,,RT @dmataconis: Ready To Feel Like A Failure? Joan Of Arc Was Only 19 When She Was Burned At The Stake http://t.co/S2j1IFm4y9,6.160023e+17,,[],[],POSTED,[],,
3,4840552000.0,blacktolive,1474013000000.0,2016-09-16 08:04:48,18.0,False,17.0,Amen! #blacklivesmatter https://t.co/wGffaOqgzl,7.766933e+17,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>","[""Blacklivesmatter""]",[],POSTED,[],,
4,1694026000.0,jacquelinisbest,1474228000000.0,2016-09-18 19:46:25,0.0,False,0.0,RT @NahBabyNah: Twitchy: Chuck Todd caught out there shilling for Hillary Clinton\r\nThe post BUSTED: Adam Baldwi... https://t.co/ay28pMpDw6 #â¦,7.775946e+17,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>","[""WakeUpAmerica""]","[""http://ln.is/twitchy.com/loriz-31/3yafU""]",POSTED,"[""nahbabynah""]",7.775915e+17,


In [13]:
import gensim
import re


def createTrainingSet(data, raw_data_path):
    #create the training set and add a unique id to the output file for each comment
    data['SingularityNetID'] = -1
    doc2VecTrain = []
    with open('stopwords.txt','r') as f:
        stopwords = {word.lower().strip() for word in f.readlines()}
    for i in range (len(data)):
        if data.loc[i,'text']:
            comment = data.loc[i,'text']
            try:
                comment = re.sub(r"\s?http\S*", "", comment)
            except:
                pass
            #strList =gensim.utils.simple_preprocess(str(comment))
            tokenList =gensim.utils.lemmatize(str(comment),stopwords=frozenset(stopwords))
            strList =[x.decode('utf-8') for x in tokenList]
            if strList:
                data.loc[i, "SingularityNetID"] = i
                taggedDoc = gensim.models.doc2vec.TaggedDocument(strList,[i])
                doc2VecTrain.append(taggedDoc)
                
    data.to_csv(raw_data_path_clustered)
    import random
    %time random.shuffle(doc2VecTrain)
    print (len(doc2VecTrain))
    for i in range(10):
        print (doc2VecTrain[i])
    #debug = open ("debug.txt","w")
    #debug.write(str(doc2VecTrain))  
    #debug.close()

    return(doc2VecTrain)

In [14]:
import pickle
from pathlib import Path

doc2Vec_pickle = output_root + "_doc2VecTrain.p"
my_file = Path(output_root + doc2Vec_pickle)
if my_file.exists():
    %time doc2VecTrain = pickle.load ( open (doc2Vec_pickle, "rb"))
else:
    %time doc2VecTrain = createTrainingSet(raw_data, raw_data_path)
    pickle.dump (doc2VecTrain, open (doc2Vec_pickle, "wb"))

CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 129 ms
202778
TaggedDocument(['rt/NN', 'orconservative/JJ', 'gregjkrieg/NN', 'obama/NN', 'christian/JJ', 'follow/VB', 'bible/JJ', 'everyone/NN', 'else/RB', 'read/VB'], [77062])
TaggedDocument(['livin/NN', 'easy/JJ', 'smart/JJ'], [179052])
TaggedDocument(['rt/NN', 'financement/NN', 'du/NN', 'fn/NN', 'de/JJ', 'saint/NN', 'just/RB', 'va/JJ', 'porter/NN', 'plainte/NN', 'contre/JJ', 'etat/NN'], [30478])
TaggedDocument(['obama/JJ', 'administration/NN', 'confirm/VB', 'double/JJ', 'digit/NN', 'insurance/NN', 'premium/NN', 'hike/NN'], [21384])
TaggedDocument(['liberal/NN', 'least/JJ', 'tran/NN', 'youth/NN', 'die/VB', 'suicide/NN', 'trumpâ/NN', 'win/VB', 'none/NN', 'report/VB', 'death/NN', 'confirmedâ/VB'], [81591])
TaggedDocument(['rt/NN', 'jc/NN', 'nothing/NN'], [91807])
TaggedDocument(['sharium/NN', 'law/NN', 'ban/VB', 'united/NN', 'states/NN', 'sheriff/NN', 'david/VB', 'clarke/NN'], [72158])
TaggedDocument(['merkel/NN', 'verstehet/N

In [15]:
from pathlib import Path
# Train the doc2vec model once and reuse the saved copy on later runs.
modelPath = './' + output_root + '_facism_model'
size = 200          # vector size passed to Doc2Vec; also used later to reshape document vectors
min_count = 5       # min_count passed to Doc2Vec
totalIters = 1000   # passed to Doc2Vec as iter and reused as the epoch count in train()

if Path(modelPath).is_file():
    # A saved model exists: load it instead of retraining.
    model = gensim.models.doc2vec.Doc2Vec.load(modelPath)
else:
    model = gensim.models.doc2vec.Doc2Vec(size=size, min_count=min_count, iter=totalIters, dm=0)
    model.build_vocab(doc2VecTrain)
    print (model.corpus_count)
    print (model.iter)
    %time model.train(doc2VecTrain, total_examples=model.corpus_count, epochs=model.iter)
    # Drop training-only state but keep the document vectors and the ability to infer new ones.
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model.save(modelPath)
    # Reload from disk so both branches end with a model that came from the saved file.
    model = gensim.models.doc2vec.Doc2Vec.load(modelPath)

In [16]:
# Report how many distinct tokens ended up in the model vocabulary.
print("len(model.wv.vocab)")
print(str(len(model.wv.vocab)))

len(model.wv.vocab)
27163


## Test the Semantic Space
In order to test the quality of the semantic space created by Doc2Vec, the following two cells may be run in sequence multiple times.  The first cell picks a random comment, and the second cell finds the five comments across the space that are the most similar.  A comment appearing as most similar to itself is a sign of a high quality semantic space. 

In [17]:
import random

# random.randint includes its upper bound, so randint(0, len(x)) can return an
# index one past the end; randrange(len(x)) stays inside the list.
randindex = random.randrange(len(doc2VecTrain))
print(randindex)
# Each training item is (words, tags); the first tag is the tweet's row id.
realIndex = doc2VecTrain[randindex][1][0]
print("index:" + str(realIndex))
print (doc2VecTrain[randindex][0])
# Re-infer a vector for the sampled tweet and look up its nearest trained documents.
themeVector = model.infer_vector(doc2VecTrain[randindex][0])
mostSim =  model.docvecs.most_similar([themeVector], topn=5)
print(themeVector)

29209
index:4561
['bernie/NN', 'ready/JJ', 'invite/VB', 'many/JJ', 'refugee/NN', 'sufficient/JJ', 'make/VB', 'president/NN', 'demdebatemiami/NN']
[ 0.1944351  -0.03905831  0.02347476 -0.03977409  0.09997911 -0.05085656
 -0.0557252   0.14759946 -0.5069165   0.17577572  0.135265    0.283784
 -0.15946224 -0.15062726 -0.14205624 -0.3112278   0.16221574 -0.3910323
  0.13420735  0.41706243 -0.3207433   0.0245885   0.10386068 -0.19277473
 -0.3574071  -0.20579587 -0.15555997 -0.17086482  0.08112323 -0.10985532
 -0.21693078  0.03936861  0.00165979  0.03749555  0.16271776  0.18455416
  0.05773365 -0.2956466  -0.14464307 -0.32535958  0.05686299  0.0686315
 -0.03857032 -0.01076763  0.4270765   0.23020366  0.11758833 -0.22938876
 -0.3263684  -0.2894638   0.02797775 -0.08155999 -0.167802    0.25930944
  0.10285083  0.38919833  0.24527839  0.04784041  0.07479761 -0.24977522
  0.36882174 -0.37429723 -0.20537248  0.00811404 -0.16692631 -0.17888643
 -0.33774918 -0.18723671  0.12338962 -0.24120092  0.377

In [18]:
# Show full tweet texts instead of truncating them.
pd.options.display.max_colwidth = 10000
mostSim = model.docvecs.most_similar([themeVector], topn=5)


def _show(label, value):
    """Print a value and its shape under the given label."""
    print (label)
    print (value)
    print ('np.shape(' + label + ')')
    print (np.shape(value))


# Trained vector of the probe tweet, then the trained vectors of its neighbours.
v1 = model.docvecs[realIndex].reshape(size,)
v2s = []
for docTag, score in mostSim[:5]:
    print ("index:" + str(docTag))
    print ('similarity: ' + str(score))
    # Similarity between the two trained vectors, as opposed to the inferred one.
    print ('just_similarity: ' + str(model.docvecs.similarity(realIndex, docTag)))
    v2s.append(model.docvecs[docTag])
    print(raw_data.loc[docTag, 'text'])

_show('v1', v1)
_show('v2s', v2s)
vsim = model.wv.cosine_similarities(v1, v2s)
_show('vsim', vsim)

index:4561
similarity: 0.8208450078964233
just_similarity: 1.0
Bernie is ready to invite as many refugees as it would be sufficient to make him the president #DemDebateMiami
index:59611
similarity: 0.5527162551879883
just_similarity: 0.6269402504467396
Bernie Sanders: letâs make America socialist again #DemDebate #DemDebateMiami
index:159664
similarity: 0.5360962748527527
just_similarity: 0.5738347131573727
Are you ready?
index:79112
similarity: 0.535906195640564
just_similarity: 0.6332480444032913
They both are too old to be our president #DemDebateMiami
index:114610
similarity: 0.5307534337043762
just_similarity: 0.573454870361426
âI`m ready for selfieâ #WhatClintonWrites
v1
[ 0.32570013 -0.09336285  0.2437899  -0.33785182  0.24543191 -0.06085902
  0.15546627  0.10887211 -0.691765    0.2954488   0.02804246  0.43864205
 -0.12526427 -0.00530394 -0.22727738 -0.44537798  0.23981842 -0.4353826
 -0.19952579  0.5622258  -0.25527537 -0.10378052 -0.09796141  0.03189933
 -0.49690706 -0.3

In [19]:
import numpy as np
# Map each tweet id (the document's first tag) to a freshly inferred vector,
# leaving out any vector that contains NaN or infinite components.
cmtId2Vec = {}
for taggedDoc in doc2VecTrain:
    candidate = model.infer_vector(taggedDoc.words)
    if np.isfinite(candidate).all():
        cmtId2Vec[taggedDoc.tags[0]] = candidate


In [20]:
# Number of tweets for which a usable vector was inferred.
print (len(cmtId2Vec))

202778


In [21]:

#emojis = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])',flags=re.UNICODE)
#emojis

In [22]:
def write_each_char(string,out):
    """Write ``string`` to ``out`` one character at a time.

    A character whose write raises ``TypeError`` is skipped silently; one that
    raises ``UnicodeEncodeError`` is reported on stdout and skipped.  All other
    characters are still written, so one bad character does not lose the rest.
    """
    for position, _ in enumerate(string):
        char = string[position]
        try:
            out.write(char)
        except TypeError:
            continue
        except UnicodeEncodeError as err:
            print ("write failed for char: "+char + str(err))
    

In [23]:
def printRandClosest(data, model, trainingSet, closestNcommentsFile, topn, n):
    """Write ``n`` randomly chosen training documents with their nearest tweets.

    Each output line has the form
    ``<query tokens>,<similarity>:<tweet>,...,<similarity>:<tweet>`` with ``topn``
    neighbour fields.  Neighbour tweets are lower-cased, stripped of punctuation,
    have emoji replaced by `` emoji `` and tabs/newlines replaced by spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Tweets with a ``text`` column, addressable by document tag via ``.loc``.
    model :
        Object providing ``infer_vector(words)`` and ``docvecs.most_similar``.
    trainingSet : sequence
        Items of the form ``(words, tags)``.
    closestNcommentsFile :
        Writable text file object.
    topn : int
        Number of neighbours per line.
    n : int
        Number of random documents to sample (with replacement).

    A line is assembled completely before it is written, so a failure for one
    sample is reported on stdout and never leaves a half-written row.
    """
    import random
    import re
    import string

    if not trainingSet:
        return

    translator = str.maketrans('', '', string.punctuation)
    try:
        # UCS-4
        emojis = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])', flags=re.UNICODE)
    except re.error:
        # UCS-2
        emojis = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])',flags=re.UNICODE)

    non_blank_whitespace = re.compile("[\t\n\r\f\v]")

    for _ in range (n):
        # randrange excludes the upper bound; randint(0, len) could index past the end.
        randindex = random.randrange(len(trainingSet))
        try:
            words = trainingSet[randindex][0]
            fields = [' '.join(filter(None, words))]
            themeVector = model.infer_vector(words)
            mostSim =  model.docvecs.most_similar([themeVector], topn=topn)
            for docTag, dist in mostSim[:topn]:
                text = data.loc[docTag, 'text']
                if isinstance(text, str) and text:
                    text = emojis.sub(" emoji ", text)
                    text = non_blank_whitespace.sub(" ", text)
                    text = text.lower().translate(translator)
                else:
                    # Missing text (NaN) keeps its similarity with an empty tweet,
                    # so every row still has topn fields and a line terminator.
                    text = ''
                fields.append(str(dist) + ':' + text)
            write_each_char(','.join(fields) + '\n', closestNcommentsFile)
        except Exception as err:
            # Best effort: report the sample that failed and carry on with the next.
            print(str(randindex) + ": " + str(err))

In [24]:
from pathlib import Path

def testNumIters(data, model, size, doc2VecTrain, baseIters, epochs,root, n=1000,topn = 10 ):
    for i in range (baseIters,epochs*baseIters,baseIters):   
        %time model.train(doc2VecTrain, total_examples=model.corpus_count, epochs = model.iter)
        fileName  = root + ".size" + str(size)+ ".iter" + str(i) + ".csv"
        closestNcommentsFile = open (fileName,"w")
        printRandClosest(data, model, doc2VecTrain, closestNcommentsFile, topn, n)
        closestNcommentsFile.close()
    
    
def testVectorSizeAndIters (data, doc2VecTrain, root, min_count = 10, baseIters= 100, epochs = 10):
    """Sweep doc2vec vector sizes, running ``testNumIters`` for each one.

    For every size in a fixed list a new Doc2Vec model is created with
    ``iter=baseIters`` and ``dm=0``, its vocabulary is built from
    ``doc2VecTrain`` and the model is handed to ``testNumIters`` together with
    ``root``, the prefix of the CSV files that function writes.
    """
    sizes = [25,50,100,150,200, 250,300]
    for size in sizes:
        model = gensim.models.doc2vec.Doc2Vec(size=size, min_count=min_count, iter=baseIters, dm=0)
        try:
            model.build_vocab(doc2VecTrain)
        except Exception as err:
            # A model without a vocabulary cannot be trained: report it and move
            # on to the next size instead of failing later inside train().
            print( "model.build_vocab(doc2VecTrain) failed " + str(err))
            continue
        testNumIters(data, model, size, doc2VecTrain,baseIters, epochs, root)
        
# File-name prefix for the CSVs that the size/iteration sweep writes.
randClosest = output_root + "_rand_closest"
# The sweep itself is left disabled; uncomment the call to run it.
#testVectorSizeAndIters(raw_data, doc2VecTrain, randClosest)   

In [27]:
def printClosest(model, trainingSet, closestNcommentsFile, topn):
    """Write every training document with its ``topn`` nearest comments as CSV.

    For each ``(words, tags)`` item the joined tokens are written first, then
    one ``<similarity>:<comment>`` field per neighbour returned by
    ``model.docvecs.most_similar``.

    NOTE(review): the body reads a name ``data`` that is neither a parameter nor
    defined in the visible cells, and looks comments up through ``CmtID`` and
    ``Comment`` columns that the tweet CSV shown here does not have.  This looks
    like a leftover from a different dataset; the only call is commented out.
    Confirm before re-enabling it.
    """
    import string
    # Translation table that deletes all ASCII punctuation.
    translator = str.maketrans('', '', string.punctuation)
    for words, tags in trainingSet:
        cmtID = tags[0]
        comment = ' '.join(filter(None, words))
        closestNcommentsFile.write(comment + ",")
        themeVector = model.infer_vector(words)
        mostSim =  model.docvecs.most_similar([themeVector], topn=topn)
        count = 0
        for id,dist in mostSim[:topn]:
            # count advances for every neighbour, including ones with no matching comment.
            count += 1
            closestNcommentsFile.write(str(dist) + ':')
            # NOTE(review): `data` is not defined in this function -- see docstring.
            comment = data.loc[data['CmtID'] == id]['Comment']
            if len(comment) > 0:
                comment = str(comment.item())
                comment = comment.lower().translate(translator)
                # The last neighbour ends the line; earlier ones are comma-separated.
                if count < topn:
                    closestNcommentsFile.write(comment +",")
                else:
                    closestNcommentsFile.write(comment +"\n")

In [30]:
from pathlib import Path
closestComments = output_root + "_closest_comments.csv"

my_file = Path(closestComments)
if not my_file.exists():
    topn = 10
    n=1000
    closestNcommentsFile = open (closestComments,"w")
    #printClosest(model, doc2VecTrain, closestNcommentsFile, topn) 
    %time printRandClosest(raw_data, model, doc2VecTrain, closestNcommentsFile, topn, n)
    closestNcommentsFile.close()

In [31]:
def createClusterAlgorithms( raw_data, cmtVectors):
    """Run thirteen clusterers over ``cmtVectors`` and return their label arrays.

    For every algorithm the function prints a UTC timestamp, the elapsed CPU
    time, the cosine silhouette score and the Calinski-Harabasz score (both
    scores are always computed on the standardised vectors).

    Parameters
    ----------
    raw_data :
        Not used by this function; kept so existing callers keep working.
    cmtVectors : sequence of 1-D arrays
        One document vector per tweet to cluster.

    Returns
    -------
    dict
        Algorithm code -> cluster label per vector, in the order of
        ``cmtVectors``.  Codes: ``SM`` MeanShift, ``SW`` Ward agglomerative,
        ``SS`` Spectral, ``SD`` DBSCAN, ``SAG`` average-linkage agglomerative,
        ``SAP`` AffinityPropagation, ``SB`` Birch, ``SG``/``SGN`` GaussianMixture
        on raw/standardised vectors, ``SK``/``SKN`` MiniBatchKMeans on
        raw/standardised vectors, ``NK`` NLTK k-means, ``NG`` NLTK GAAC.
        ``SS`` is ``None`` when the spectral fit raised ``ValueError``.
    """
    import datetime
    import time

    import nltk
    from nltk.cluster.gaac import GAAClusterer
    from nltk.cluster.kmeans import KMeansClusterer
    from sklearn import cluster, mixture
    from sklearn import metrics
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.neighbors import kneighbors_graph
    from sklearn.preprocessing import StandardScaler

    # NOTE(review): the original unconditionally ran `import metrics` here, which
    # replaces sklearn.metrics with a top-level module of that name.  It is kept,
    # but made optional, so an existing local override still wins while a missing
    # one no longer aborts the run -- confirm whether such a module is intended.
    try:
        import metrics
    except ImportError:
        pass

    # time.clock was removed in Python 3.8; fall back to its CPU-time successor.
    clock = getattr(time, "clock", None) or time.process_time
    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score.
    calinskiScore = getattr(metrics, "calinski_harabasz_score", None) or metrics.calinski_harabaz_score

    X = StandardScaler().fit_transform(cmtVectors)

    params = {'quantile': .3,
            'eps': .3,
            'damping': .9,
            'preference': -200,
            'n_neighbors': 10,
            'n_clusters': 20}

    def report(title, assignments, started):
        """Print timestamp, elapsed time and both quality scores for one algorithm."""
        print(datetime.datetime.utcnow())
        print ("Time " + title + str(clock() - started))
        if assignments is None:
            print ("No labels for " + title + "; scores skipped")
            return
        try:
            print ("Silhouette " + title + str(metrics.silhouette_score(X, assignments, metric = 'cosine')))
            print ("Calinski Harabaz " + title + str(calinskiScore(X, assignments)))
        except ValueError as e:
            # e.g. an algorithm that put every point into a single cluster; keep
            # the labels and go on rather than losing all the work done so far.
            print ("Could not score " + title + ": " + str(e))

    labels = {}

    # ============
    # Create cluster objects
    # ============

    started = clock()
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X, n_neighbors=params['n_neighbors'], include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    print ("Time connectivity" + str(clock() - started))

    started = clock()
    dbscan = cluster.DBSCAN(eps=params['eps']).fit(X)
    labels['SD'] = dbscan.labels_.astype(int)
    report("Dbscan", labels['SD'], started)

    started = clock()
    affinity_propagation = cluster.AffinityPropagation(damping=params['damping'], preference=params['preference']).fit(X)
    labels['SAP'] = affinity_propagation.predict(X)
    report("Affinity Propagation", labels['SAP'], started)

    started = clock()
    average_linkage = cluster.AgglomerativeClustering(linkage="average",
        affinity="cosine",n_clusters=params['n_clusters'], connectivity=connectivity).fit(X)
    labels['SAG'] = average_linkage.labels_.astype(int)
    # Scored with its own labels (these scores used to be computed from the
    # Affinity Propagation labels by mistake).
    report("Agglomerative", labels['SAG'], started)

    started = clock()
    birch = cluster.Birch(n_clusters=params['n_clusters']).fit(X)
    labels['SB'] = birch.predict(X)
    report("Birch", labels['SB'], started)

    started = clock()
    clusterAlgSGN = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full').fit(X)
    labels['SGN'] = clusterAlgSGN.predict(X)
    report("Gaussian Mixture Normalized", labels['SGN'], started)

    started = clock()
    clusterAlgSG = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full').fit(cmtVectors)
    labels['SG'] = clusterAlgSG.predict(cmtVectors)
    report("Gaussian Mixture", labels['SG'], started)

    started = clock()
    clusterAlgSKN = MiniBatchKMeans(n_clusters=params['n_clusters']).fit(X)
    labels['SKN'] = clusterAlgSKN.predict(X)
    report("Mini Batch Kmeans Normalized", labels['SKN'], started)

    started = clock()
    clusterAlgSK = MiniBatchKMeans(n_clusters=params['n_clusters']).fit(cmtVectors)
    labels['SK'] = clusterAlgSK.predict(cmtVectors)
    report("Mini Batch Kmeans", labels['SK'], started)

    started = clock()
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
    print ("Time Mean Shift bandwidth" + str(clock() - started))

    started = clock()
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
    labels['SM'] = ms.predict(X)
    report("Mean Shift", labels['SM'], started)

    started = clock()
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward',connectivity=connectivity).fit(X)
    labels['SW'] = ward.labels_.astype(int)
    report("Agglomerative Ward", labels['SW'], started)

    started = clock()
    spectral = cluster.SpectralClustering(
        n_clusters=params['n_clusters'], eigen_solver='arpack',
        affinity="cosine")
    try:
        labels['SS'] = spectral.fit(X).labels_.astype(int)
    except ValueError as e:
        print("error: "+ str(e))
        print(X)
        # No labels for this algorithm; report() skips the scores for it.
        labels['SS'] = None
    report("Spectral", labels['SS'], started)

    started = clock()
    clusterAlgNK = KMeansClusterer(params['n_clusters'], distance=nltk.cluster.util.cosine_distance, repeats=25, avoid_empty_clusters=True)
    labels['NK'] = clusterAlgNK.cluster(cmtVectors, assign_clusters=True)
    report("NLTK Kmeans", labels['NK'], started)

    started = clock()
    clusterAlgNG = GAAClusterer(num_clusters=params['n_clusters'], normalise=True, svd_dimensions=None)
    labels['NG'] = clusterAlgNG.cluster(cmtVectors, assign_clusters=True)
    report("NLTK GAA", labels['NG'], started)

    # Returned in a fixed key order, which is the order later cells iterate in.
    keyOrder = ('SM', 'SW', 'SS', 'SD', 'SAG', 'SAP', 'SB',
                'SG', 'SGN', 'SK', 'SKN', 'NK', 'NG')
    algorithms = {key: labels[key] for key in keyOrder}
    return (algorithms)


In [32]:
import pickle
from pathlib import Path

#limit the number to cluster because the order of alg is n^2 or more
n = 20000
m = min (len(raw_data),n)

print(m)

randIndicies = [random.randint(0,len (raw_data)) for i in range(m)]

#cmtIds = [raw_data.loc[i,'SingularityNetID'] for i in range(len(raw_data)) ]
cmtIds = [raw_data.loc[i,'SingularityNetID'] for i in randIndicies ]
cmtVectors = [np.array(cmtId2Vec[cmtId]) for cmtId in cmtIds if cmtId in cmtId2Vec]


algorithms_pickle = output_root + "_algorithms.p"
my_file = Path(algorithms_pickle)
if my_file.exists():
    %time algorithms = pickle.load ( open (algorithms_pickle, "rb"))
else:
    %time algorithms = createClusterAlgorithms(raw_data, cmtVectors)
    pickle.dump (algorithms, open (algorithms_pickle, "wb"))


20000
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 24 ms


In [33]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

  
# For every algorithm: record each sampled tweet's cluster, compute one centroid
# per cluster, measure each member's cosine similarity to its centroid, print the
# five closest tweets per cluster and store everything in raw_data.

allvectors = {}       # algorithm -> cluster -> inferred vectors of its members
cluster2cmtId = {}    # algorithm -> cluster -> SingularityNetIDs of its members
rowIds = raw_data["SingularityNetID"]

for name, clusterLabels in algorithms.items():
    clusterName = 'Cluster'+name
    centroidName = 'Centroid'+name
    raw_data[clusterName]= -1
    raw_data[centroidName]= -1
    if clusterLabels is None:  #and len(set(clusterLabels)) < max_clusters
        continue

    labelById = {}
    vectors = {}
    members = {}
    for cmtId, vector, label in zip(cmtIds, cmtVectors, clusterLabels):
        labelById[cmtId] = label
        vectors.setdefault(label, []).append(vector)
        if cmtId > -1:
            members.setdefault(label, []).append(cmtId)
    allvectors[name] = vectors
    if members:
        cluster2cmtId[name] = members
    # One lookup per row replaces a full-column comparison per sampled tweet;
    # rows that were not sampled keep the -1 placeholder.
    raw_data[clusterName] = rowIds.map(labelById).fillna(-1).astype(int)

allCentroids = {}     # algorithm -> cluster -> centroid vector
for name, vectors in allvectors.items():
    thisAlgCentroids = {}
    allCentroids[name] = thisAlgCentroids
    for label, clustervecs in vectors.items():
        centroid = np.mean (clustervecs, axis = 0) if len(clustervecs) >1 else clustervecs[0]
        if centroid.size:
            thisAlgCentroids[label] = centroid

cluster2similarities = {}   # algorithm -> cluster -> similarity of each member to the centroid
for name, clusterDict in cluster2cmtId.items():
    cluster2similarities[name] = {}
    for label, cmtList in clusterDict.items():
        # Members are represented by their trained document vectors here.
        vecs = [model.docvecs[cmtId] for cmtId in cmtList]
        cluster2similarities[name][label] = model.wv.cosine_similarities(allCentroids[name][label],vecs)

for name, clusterDict in cluster2similarities.items():
    print("\n\nAlgorithm " + name)
    simById = {}
    for label, simvecArray in clusterDict.items():
        simTuples = sorted(zip(cluster2cmtId[name][label],simvecArray),
                           key=lambda tup: tup[1], reverse=True)
        # Deliberately not called `size`: that name holds the vector size that an
        # earlier cell uses when it is re-run.
        numDocs = len(simTuples)
        print ('\ncluster '+ str(label) + ' has '+ str(numDocs)+ ' docs like:\n' )
        for rank, (cmtId, similarity) in enumerate(simTuples[:5], start=1):
            print(str(rank)+". similarity: " + str(similarity))
            print(raw_data.loc[cmtId,['text']])
        simById.update(simTuples)
    raw_data['Centroid'+name] = rowIds.map(simById).fillna(-1)

raw_data.to_csv(raw_data_path_clustered)




Algorithm SM

cluster 0 has 19839 docs like:

1. similarity: 0.8008956
text    Almost...ð #funnysports https://t.co/vuUVNAJ7fD
Name: 20616, dtype: object
2. similarity: 0.8008956
text    Almost...ð #funnysports https://t.co/vuUVNAJ7fD
Name: 20616, dtype: object
3. similarity: 0.7976871
text    To me, you are perfect!
Name: 155521, dtype: object
4. similarity: 0.79274845
text    https://t.co/ITMZj5yGZj it must hurt!
Name: 143949, dtype: object
5. similarity: 0.791909
text    @ne14phish Nice one!
Name: 88103, dtype: object


Algorithm SW

cluster 1 has 329 docs like:

1. similarity: 0.7508233
text    #imvotingbecause I'm over corruption
Name: 49915, dtype: object
2. similarity: 0.72016823
text    Great! I liked it! http://t.co/pZxpxusKfA
Name: 59404, dtype: object
3. similarity: 0.71610045
text    RT @deray: what? https://t.co/btzrkGfxSX
Name: 171837, dtype: object
4. similarity: 0.70064354
text    RT @bon98021883: "Jeff Sessions" https://t.co/z7OpiBgfB1
Name: 180889, dtype: obje