In [1]:
import os
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import numpy as np
from nltk.tokenize import RegexpTokenizer
import nltk.data


In [12]:
# Train a Word2Vec model on the earnings-call corpus, starting from pre-trained vectors.
# Paths are assembled with os.path.join so the cell depends neither on Windows
# backslash separators nor on "\w" happening not to be a string escape sequence.
corpus_path = os.path.join("datasets", "MSFT_Earning_Calls", "all_msft_earning_calls_punct")
pretrained_path = os.path.join("datasets", "word2vec", "GoogleNews-vectors-negative300.bin")

model = gensim.models.Word2Vec(size=300, window=5, min_count=2, negative=20, workers=3)
sentences = gensim.models.word2vec.LineSentence(corpus_path)
model.build_vocab(sentences)
# Initialize with a pre-trained word2vec model (C binary format) before training on the corpus.
model.intersect_word2vec_format(pretrained_path, binary=True)
model.train(sentences)
           

203965

In [13]:
#model.accuracy('datasets\\word2vec\\questions-words.txt')
#model.most_similar(positive=['Xbox', 'Dynamics'], negative=['CRM'])

# Word clustering: group the learned word vectors with k-means and time the run.
from sklearn.cluster import KMeans
import time

start = time.time()  # Start time

# Set "k" (num_clusters) to 1/10th of the vocabulary size, i.e. an average of
# 10 words per cluster. Floor division keeps k an integer regardless of the
# interpreter's "/" semantics, and max(1, ...) avoids k == 0 when the
# vocabulary has fewer than 10 words.
word_vectors = model.syn0
num_clusters = max(1, word_vectors.shape[0] // 10)

# Initialize a k-means object and use it to assign each word vector to a cluster.
# A fixed random_state makes the assignment repeatable for identical input vectors.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=0)
idx = kmeans_clustering.fit_predict(word_vectors)

# Get the end time and print how long the process took.
end = time.time()
elapsed = end - start
# Single formatted argument: prints the same text as the original statement
# whether print is a statement or a function.
print("Time taken for K Means clustering:  %s seconds." % elapsed)

Time taken for K Means clustering:  16.882999897 seconds.


In [14]:
# Create a word -> cluster-number dictionary by pairing each vocabulary word
# with the cluster index assigned to its vector.
word_centroid_map = dict(zip(model.index2word, idx))

# Print the members of the first 10 clusters.
for cluster in range(10):
    print("\nCluster %d" % cluster)
    # One pass over the (word, cluster) pairs; the original rebuilt the
    # values()/keys() lists on every inner-loop iteration.
    words = [word for word, cluster_id in word_centroid_map.items() if cluster_id == cluster]
    print(words)



Cluster 0
[u'equation']

Cluster 1
[u'little', u'plenty', u'lots', u'much', u'no', u'lack', u'many', u'lot', u'bit', u'ton', u'some', u'nothing', u'bunch']

Cluster 2
[u'above', u'reducing', u'shrink', u'reduced', u'moderate', u'low', u'peak', u'zero', u'lowering', u'reduce', u'higher', u'cut', u'lower', u'stabilized', u'high', u'measured', u'unchanged', u'lowered', u'below', u'normal']

Cluster 3
[u'principles', u'notion', u'theme', u'approach', u'logic', u'concept', u'values', u'views', u'formula', u'view', u'ideas', u'vision', u'style', u'philosophically', u'fundamental', u'framework', u'idea', u'belief', u'script', u'assumption', u'proposition', u'premise', u'structure', u'mindset', u'philosophy', u'fundamentally', u'culture']

Cluster 4
[u'Renewal', u'Home', u'Group', u'Book', u'Sway', u'Word', u'Tier', u'Band', u'Big', u'Translator', u'Quarter', u'V1', u'Mango', u'Fine', u'Crest', u'Personal', u'High', u'Bye', u'Tech', u'cetera', u'MOSKOWITZ', u'Best', u'Direction', u'Hub', u'Li

In [72]:
# Word-analogy spot checks: each entry is (positive words, negative words);
# the top-ranked word returned by most_similar is printed for each query.
analogy_queries = [
    (['iOS', 'Dell'], ['Windows']),           # Dell - Windows + iOS = Apple
    (['Windows', 'Apple'], ['iOS']),          # Apple - iOS + Windows = Microsoft
    (['iPad', 'Microsoft'], ['Surface']),     # Microsoft - Surface + iPad = Apple
    (['Windows', 'iPad'], ['iOS']),           # iPad - iOS + Windows = PCs
    (['Android', 'Microsoft'], ['Windows']),  # Microsoft - Windows + Android = Google
]
for plus_words, minus_words in analogy_queries:
    ranked = model.most_similar(positive=plus_words, negative=minus_words)
    best_word = ranked[0][0]  # first element of the top (word, score) pair
    print(best_word)

Apple
Microsoft
Apple
PCs
Google


In [59]:
from gensim.models.doc2vec import TaggedLineDocument

# Train a Doc2Vec model on the same corpus, starting from pre-trained word vectors.
# Paths are assembled with os.path.join so the cell depends neither on Windows
# backslash separators nor on "\w" happening not to be a string escape sequence.
corpus_path = os.path.join("datasets", "MSFT_Earning_Calls", "all_msft_earning_calls_punct")
pretrained_path = os.path.join("datasets", "word2vec", "GoogleNews-vectors-negative300.bin")

sentences = TaggedLineDocument(corpus_path)
model = gensim.models.Doc2Vec(
          dm=1,        # Distributed memory model of Paragraph Vector (PV-DM)
          size=300,    # dimensionality of the feature vectors
          dm_mean=1,   # use the mean of the context word vectors
          hs=0,        # no hierarchical softmax (negative sampling is used instead)
          window=10,   # max distance between the predicted word and context words within a document
          negative=20, # negative sampling: how many "noise words" should be drawn
          min_count=2, # ignore all words with total frequency lower than this
          workers=3    # worker threads to train the model
        )
model.build_vocab(sentences)
# Initialize with a pre-trained word2vec model (C binary format) before training on the corpus.
model.intersect_word2vec_format(pretrained_path, binary=True)
model.train(sentences)

213652

In [73]:
# Pick a random document and show its most, median and least similar documents.
doc_id = np.random.randint(model.docvecs.count)  # pick random doc, re-run cell for more examples

sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents

# Read the corpus once and index each document's tokens by its first tag.
# The original re-scanned the whole corpus for every lookup. setdefault keeps
# the first document seen for a tag, matching the original "[...][0]" selection.
words_by_doc_id = {}
for doc in sentences:
    words_by_doc_id.setdefault(doc[1][0], doc[0])

print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(words_by_doc_id[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# sims is ordered from most to least similar, so positions 0, middle and last
# give the three reference points.
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    similar_id, score = sims[index][0], sims[index][1]
    print(u'%s %d: «%s» [score = %.3f]\n' % (label,
                                             similar_id,
                                             ' '.join(words_by_doc_id[similar_id]),
                                             score))

#print(model.infer_vector(words_by_doc_id[sims[-1][0]]))  # inferred vector for the LEAST similar doc
print(len(model.syn0))

TARGET (4268): «RICK SHERLUND , Nomura : On the Windows and Windows Live Division , there was no real guidance for the year .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n20,w10,mc2,t3):

MOST 5196: «In the Microsoft Business Division , we expect business revenue to account for approximately 85 % of the division 's total , while consumer revenue should account for the remaining 15 % .» [score = 0.941]

MEDIAN 5215: «Our OEM partners have started capitalizing on the new opportunities , delivering a wide range of new Windows hardware û from phones , to tablets , to new PCs .» [score = 0.845]

LEAST 5435: «With that backdrop , let 's move on to Windows .» [score = -0.843]

3957


In [74]:
# Optional diagnostics, left disabled: enable INFO logging and run the
# analogy benchmark against questions-words.txt.
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#model.accuracy('datasets\\word2vec\\questions-words.txt')
# Display the words most similar to 'Toyota' as (word, similarity score) pairs.
# NOTE(review): `model` is whichever model was assigned last (Word2Vec or Doc2Vec,
# depending on cell execution order) -- confirm which one is intended here.
model.most_similar('Toyota')

[(u'Volkswagen', 0.5725679993629456),
 (u'Japan', 0.3600500524044037),
 (u'Samsung', 0.35791876912117004),
 (u'Japanese', 0.3435463607311249),
 (u'recall', 0.33637383580207825),
 (u'accelerator', 0.33497750759124756),
 (u'Dell', 0.3327538073062897),
 (u'Boeing', 0.31118008494377136),
 (u'Sprint', 0.30647531151771545),
 (u'hybrid', 0.30044567584991455)]

In [None]:
# Report every document whose nearest neighbour is very similar to it, together
# with its most, median and least similar documents.
MIN_TOP_SCORE = 0.9  # only report documents whose best match scores above this

# Read the corpus once and index each document's tokens by its first tag.
# The original re-scanned the whole corpus for every printed document.
# setdefault keeps the first document seen for a tag, matching the original
# "[...][0]" selection.
words_by_doc_id = {}
for doc in sentences:
    words_by_doc_id.setdefault(doc[1][0], doc[0])

for doc_id in range(model.docvecs.count):
    sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents

    # Skip when there are no neighbours at all (avoids indexing an empty list)
    # or when the best match is not similar enough.
    if sims and sims[0][1] > MIN_TOP_SCORE:

        print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(words_by_doc_id[doc_id])))
        print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

        for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
            similar_id, score = sims[index][0], sims[index][1]
            print(u'%s %d: «%s» [score = %.3f]\n' % (label,
                                                     similar_id,
                                                     ' '.join(words_by_doc_id[similar_id]),
                                                     score))