# Gensim Basic

In [4]:
import gensim
from gensim import corpora


# How to create a dictionary from a list of sentences?
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in documents]

# Create dictionary
dictionary = corpora.Dictionary(texts)

# Get information about the dictionary
print(dictionary)
#> Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [5]:
dictionary.token2id

{'Saudis': 0,
 'The': 1,
 'a': 2,
 'acknowledge': 3,
 'are': 4,
 'preparing': 5,
 'report': 6,
 'that': 7,
 'will': 8,
 'Jamal': 9,
 "Khashoggi's": 10,
 'Saudi': 11,
 'an': 12,
 'death': 13,
 'journalist': 14,
 'of': 15,
 'result': 16,
 'the': 17,
 'was': 18,
 'intended': 19,
 'interrogation': 20,
 'lead': 21,
 'one': 22,
 'to': 23,
 'went': 24,
 'wrong,': 25,
 'Turkey,': 26,
 'abduction': 27,
 'according': 28,
 'from': 29,
 'his': 30,
 'sources.': 31,
 'two': 32}

In [6]:
documents_2 = ["The intersection graph of paths in trees",
               "Graph minors IV Widths of trees and well quasi ordering",
               "Graph minors A survey"]

texts_2 = [[text for text in doc.split()] for doc in documents_2]

dictionary.add_documents(texts_2)


# If you check now, the dictionary should have been updated with the new words (tokens).
print(dictionary)
#> Dictionary(45 unique tokens: ['Human', 'abc', 'applications', 'computer', 'for']...)

print(dictionary.token2id)

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)
{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'graph': 33, 'in': 34, 'intersection': 35, 'paths': 36, 'trees': 37, 'Graph': 38, 'IV': 39, 'Widths': 40, 'and': 41, 'minors': 42, 'ordering': 43, 'quasi': 44, 'well': 45, 'A': 46, 'survey': 47}


# Bag of word

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
bards_words =["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

# CountVectorizer(stop_words="english")
vect = CountVectorizer()
vect.fit(bards_words)


print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))

bag_of_words = vect.transform(bards_words)
print("Features name:\n{}".format(vect.get_feature_names()))
print("Dense representation of bag_of_words:\n{}".format(bag_of_words.toarray()))
 

Vocabulary size: 13
Vocabulary content:
 {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}
Features name:
['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']
Dense representation of bag_of_words:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [58]:

text =[
      "One Cent, Two Cents, Old Cent, New Cent: All About Money (Cat in the Hat's Learning Library",
      "Inside Your Outside: All About the Human Body (Cat in the Hat's Learning Library)",
      "Oh, The Things You Can Do That Are Good for You: All About Staying Healthy (Cat in the Hat's Learning Library)",
      "On Beyond Bugs: All About Insects (Cat in the Hat's Learning Library)",
      "There's No Place Like Space: All About Our Solar System (Cat in the Hat's Learning Library)" 
     ]

model1 = CountVectorizer(text,stop_words=["all","in","the","is","and"])
result1_vector = model1.fit_transform(text)
print('result1_vector shape: {}'.format(result1_vector.shape))

model2 = CountVectorizer(text,stop_words="english")
result2_vector = model2.fit_transform(text)
print('result2_vector shape: {}'.format(result2_vector.shape))

# use proportion here. Ignore terms that occurred in less than 25% of the documents
#model3 = CountVectorizer(text,min_df=0.25)
# ignore terms that appeared in less than n documents (can be proportion or absolute counts)
model3 = CountVectorizer(text,min_df=2)
result3_vector = model3.fit_transform(text)
print('result3_vector shape: {}'.format(result3_vector.shape))

# ignore terms that appeared in more than n documents (can be proportion or absolute counts)
# use proportion here
model4 = CountVectorizer(text,max_df=0.50)
result4_vector = model4.fit_transform(text)
print('result4_vector shape: {}'.format(result4_vector.shape))


result1_vector shape: (5, 40)
result2_vector shape: (5, 24)
result3_vector shape: (5, 8)
result4_vector shape: (5, 35)


# TF-IDF

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
 
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus)

word = vectorizer.get_feature_names()
print(word)

print(X.toarray())
 

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
tfidf_weight = tfidf.toarray() 
print(tfidf_weight)


for i in range(len(tfidf_weight)):
    print("-------output {}-th document tf-idf weight------".format(i))
    for j in range(len(word)):
        print(word[j],tfidf_weight[i][j])


['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]
-------output 0-th document tf-idf weight------
and 0.0
document 0.4387767428592343
first 0.5419765697264572
is 0.4387767428592343
one 0.0
second 0.0
the 0.35872873824808993
third 0.0
this 0.4387767428592343
-------output 1-th document tf-idf weight------
and 0.0
document 0.2723014675233404
first 0.0
is 0.2723014675233404
one 0.0
second 0.8532257361452786
the 0.22262429232510395
third 0.0
this 0.2723014675233404
-------output 2-th document tf

# N-Gram

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
bards_words =["The fool doth think he is wise",
              "but the wise man knows himself to be a fool"]

vect1 = CountVectorizer(ngram_range=(1, 1)).fit(bards_words)
print("Vocabulary size: {}".format(len(vect1.vocabulary_)))
print("Vocabulary:\n{}".format(vect1.get_feature_names()))

vect2 = CountVectorizer(ngram_range=(2, 2)).fit(bards_words)
print("Vocabulary size: {}".format(len(vect2.vocabulary_)))
print("Vocabulary:\n{}".format(vect2.get_feature_names()))
print("Transformed data (dense):\n{}".format(vect2.transform(bards_words).toarray()))

Vocabulary size: 13
Vocabulary:
['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']
Vocabulary size: 14
Vocabulary:
['be fool', 'but the', 'doth think', 'fool doth', 'he is', 'himself to', 'is wise', 'knows himself', 'man knows', 'the fool', 'the wise', 'think he', 'to be', 'wise man']
Transformed data (dense):
[[0 0 1 1 1 0 1 0 0 1 0 1 0 0]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1]]


In [13]:
#https://medium.com/%E6%89%8B%E5%AF%AB%E7%AD%86%E8%A8%98/%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E8%99%95%E7%90%86-%E4%BD%BF%E7%94%A8-n-gram-%E5%AF%A6%E7%8F%BE%E8%BC%B8%E5%85%A5%E6%96%87%E5%AD%97%E9%A0%90%E6%B8%AC-10ac622aab7a

from collections import Counter, namedtuple
import json
import re

DATASET_DIR = 'dataset/WebNews.json'
with open(DATASET_DIR, encoding = 'utf8') as f:
    dataset = json.load(f)
    
seg_list = list(map(lambda d: d['detailcontent'], dataset))
rule = re.compile(r"[^\u4e00-\u9fa5]")
seg_list = [rule.sub('', seg) for seg in seg_list]
print(seg_list[0])

大年初六桃園八德大溪參香祈福祈求台灣平安桃園建設大步向前桃園市長鄭文燦今日上午前往桃園區清水巖下午前往八德區廣行宮大溪區中庄福德宮永安宮內柵仁安宮溪洲福山巖慈聖宮龍山寺參香並發送桃園福御守福袋給大年初六走春參香的市民朋友鄭市長表示大年初六是清水祖師聖誕也是開工的日子祈求清水祖師庇佑台灣平安健康武漢肺炎疫情不要蔓延到台灣也祈求桃園建設持續大步向前祝福所有鄉親信眾鼠來運轉今年的願望都能努力打拚實現鄭市長也呼籲市府將以高標準進行武漢肺炎防疫工作請市民朋友勤加洗手戴口罩量體溫如需前往人潮較多的地方記得要做好清潔消毒工作此外應避免聽信網路謠言造成恐慌亦可透過衛福部疾病管制署的疾管家獲知最新防疫資訊保護自身及周遭親友的健康安全鄭市長在中庄福德宮表示市府致力推動中庄地區發展中庄不只有調整池攔河堰中庄運動公園即將動工大漢溪邊也將興建堤防及防汛道路市民朋友無論在交通或觀光休憩都將更加便利另外國道號增設大鶯豐德交流道可行性研究已獲得交通部審議通過並陸續辦理相關建設計畫府會也將攜手合作讓交流道順利推動完成今日包括立法委員趙正宇市議員朱珍瑤呂林小鳳李柏坊陳治文黃家齊蔡永芳桃園工策會總幹事陳家濬市府民政局副局長林香美警察局督察長吳坤旭桃園區長陳玉明八德區長邱瑞朝大溪區長陳嘉聰桃園果菜市場公司董事長邱素芬大嵙崁文教基金會執行長李世明清水巖主委邱顯來廣行宮主委李秀明中庄福德宮主委沈琳容永安宮主委林繼雄內柵仁安宮主委簡子嚴溪洲福山巖主委楊賴傳慈聖宮主委蔡水木龍山寺董事長陳有盛等均一同參香


In [14]:
def ngram(documents, N=2):
    ngram_prediction = dict()
    total_grams = list()
    words = list()
    Word = namedtuple('Word', ['word', 'prob'])

    for doc in documents:
        split_words = ['<s>'] + list(doc) + ['</s>']
        # 計算分子
        [total_grams.append(tuple(split_words[i:i+N])) for i in range(len(split_words)-N+1)]
        # 計算分母
        [words.append(tuple(split_words[i:i+N-1])) for i in range(len(split_words)-N+2)]
        
    total_word_counter = Counter(total_grams)
    word_counter = Counter(words)
    
    for key in total_word_counter:
        word = ''.join(key[:N-1])
        if word not in ngram_prediction:
            ngram_prediction.update({word: set()})
            
        next_word_prob = total_word_counter[key]/word_counter[key[:N-1]]
        w = Word(key[-1], '{:.3g}'.format(next_word_prob))
        ngram_prediction[word].add(w)
        
    return ngram_prediction

In [15]:
tri_prediction = ngram(seg_list, N=3)
print(tri_prediction)
for word, ng in tri_prediction.items():
    tri_prediction[word] = sorted(ng, key=lambda x: x.prob, reverse=True)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
text = '韓國'
next_words = list(tri_prediction[text])[:5]
for next_word in next_words:
    print('next word: {}, probability: {}'.format(next_word.word, next_word.prob))

next word: 隊, probability: 0.2
next word: 首, probability: 0.143
next word: 日, probability: 0.0571
next word: 及, probability: 0.0571
next word: 代, probability: 0.0571


# CBOW & Skip-gram  
yield explain: https://pyzh.readthedocs.io/en/latest/the-python-yield-keyword-explained.html#id8

In [17]:
import gzip
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

data_file="./dataset/reviews_data.txt.gz"

with gzip.open ('./dataset/reviews_data.txt.gz', 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [18]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""
    
    print("reading file {0}...this may take a while".format(input_file))
    
    with gzip.open (input_file, 'rb') as f:
        for i, line in enumerate (f): 

            if (i%10000==0):
                print("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list (read_input (data_file))
print("Done reading data file")

reading file ./dataset/reviews_data.txt.gz...this may take a while
read 0 reviews
read 10000 reviews
read 20000 reviews
read 30000 reviews
read 40000 reviews
read 50000 reviews
read 60000 reviews
read 70000 reviews
read 80000 reviews
read 90000 reviews
read 100000 reviews
read 110000 reviews
read 120000 reviews
read 130000 reviews
read 140000 reviews
read 150000 reviews
read 160000 reviews
read 170000 reviews
read 180000 reviews
read 190000 reviews
read 200000 reviews
read 210000 reviews
read 220000 reviews
read 230000 reviews
read 240000 reviews
read 250000 reviews
Done reading data file


In [19]:
documents[0]

['oct',
 'nice',
 'trendy',
 'hotel',
 'location',
 'not',
 'too',
 'bad',
 'stayed',
 'in',
 'this',
 'hotel',
 'for',
 'one',
 'night',
 'as',
 'this',
 'is',
 'fairly',
 'new',
 'place',
 'some',
 'of',
 'the',
 'taxi',
 'drivers',
 'did',
 'not',
 'know',
 'where',
 'it',
 'was',
 'and',
 'or',
 'did',
 'not',
 'want',
 'to',
 'drive',
 'there',
 'once',
 'have',
 'eventually',
 'arrived',
 'at',
 'the',
 'hotel',
 'was',
 'very',
 'pleasantly',
 'surprised',
 'with',
 'the',
 'decor',
 'of',
 'the',
 'lobby',
 'ground',
 'floor',
 'area',
 'it',
 'was',
 'very',
 'stylish',
 'and',
 'modern',
 'found',
 'the',
 'reception',
 'staff',
 'geeting',
 'me',
 'with',
 'aloha',
 'bit',
 'out',
 'of',
 'place',
 'but',
 'guess',
 'they',
 'are',
 'briefed',
 'to',
 'say',
 'that',
 'to',
 'keep',
 'up',
 'the',
 'coroporate',
 'image',
 'as',
 'have',
 'starwood',
 'preferred',
 'guest',
 'member',
 'was',
 'given',
 'small',
 'gift',
 'upon',
 'check',
 'in',
 'it',
 'was',
 'only',
 'co

In [20]:
'''
Word2Vec model parameters

size:
The size of the dense vector to represent each token or word. If you have very limited data, then size should be a much smaller value. If you have lots of data, its good to experiment with various sizes. A value of 100-150 has worked well for me.

window:
The maximum distance between the target word and its neighboring word. If your neighbor's position is greater than the maximum window width to the left and the right, then, some neighbors are not considered as being related to the target word. In theory, a smaller window should give you terms that are more related. If you have lots of data, then the window size should not matter too much, as long as its a decent sized window.

min_count:
Minimium frequency count of words. The model would ignore words that do not statisfy the min_count. Extremely infrequent words are usually unimportant, so its best to get rid of those. Unless your dataset is really tiny, this does not really affect the model.

workers:
How many threads to use behind the scenes?

sg: sg=1 means skip-gram and sg=0 menascbow
'''
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10, sg=0)
model.train(documents,total_examples=len(documents),epochs=10)

2020-01-31 09:25:19,451 : INFO : collecting all words and their counts
2020-01-31 09:25:19,455 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-31 09:25:19,891 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2020-01-31 09:25:20,276 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2020-01-31 09:25:20,741 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2020-01-31 09:25:21,130 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2020-01-31 09:25:21,667 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2020-01-31 09:25:22,075 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2020-01-31 09:25:22,496 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2020-01-31 09:25:22,789 : INFO : PROG

2020-01-31 09:26:26,973 : INFO : EPOCH 1 - PROGRESS: at 78.62% examples, 639974 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:26:27,986 : INFO : EPOCH 1 - PROGRESS: at 80.76% examples, 640196 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:26:28,996 : INFO : EPOCH 1 - PROGRESS: at 83.00% examples, 640610 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:26:29,993 : INFO : EPOCH 1 - PROGRESS: at 85.20% examples, 641991 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:26:31,012 : INFO : EPOCH 1 - PROGRESS: at 87.47% examples, 641734 words/s, in_qsize 17, out_qsize 4
2020-01-31 09:26:32,004 : INFO : EPOCH 1 - PROGRESS: at 89.91% examples, 642687 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:26:33,019 : INFO : EPOCH 1 - PROGRESS: at 92.19% examples, 642631 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:26:34,039 : INFO : EPOCH 1 - PROGRESS: at 94.38% examples, 642694 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:26:35,050 : INFO : EPOCH 1 - PROGRESS: at 96.49% examples, 641913 words/s,

2020-01-31 09:27:24,637 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-31 09:27:24,657 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-31 09:27:24,661 : INFO : EPOCH - 2 : training on 41519355 raw words (30350930 effective words) took 48.1s, 631173 effective words/s
2020-01-31 09:27:25,676 : INFO : EPOCH 3 - PROGRESS: at 2.04% examples, 638932 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:27:26,680 : INFO : EPOCH 3 - PROGRESS: at 3.75% examples, 577840 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:27:27,696 : INFO : EPOCH 3 - PROGRESS: at 5.82% examples, 593775 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:27:28,728 : INFO : EPOCH 3 - PROGRESS: at 7.54% examples, 575736 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:27:29,732 : INFO : EPOCH 3 - PROGRESS: at 8.99% examples, 553858 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:27:30,763 : INFO : EPOCH 3 - PROGRESS: at 10.28% examples, 540451 words/s, in_qsize 19, out_qsize 0

2020-01-31 09:28:26,744 : INFO : EPOCH 4 - PROGRESS: at 23.07% examples, 648108 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:28:27,761 : INFO : EPOCH 4 - PROGRESS: at 24.73% examples, 646360 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:28:28,833 : INFO : EPOCH 4 - PROGRESS: at 27.30% examples, 645760 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:28:29,873 : INFO : EPOCH 4 - PROGRESS: at 29.43% examples, 640464 words/s, in_qsize 20, out_qsize 1
2020-01-31 09:28:30,875 : INFO : EPOCH 4 - PROGRESS: at 32.03% examples, 644472 words/s, in_qsize 17, out_qsize 2
2020-01-31 09:28:31,884 : INFO : EPOCH 4 - PROGRESS: at 34.17% examples, 643884 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:28:32,915 : INFO : EPOCH 4 - PROGRESS: at 36.60% examples, 644809 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:28:33,949 : INFO : EPOCH 4 - PROGRESS: at 38.87% examples, 643789 words/s, in_qsize 20, out_qsize 1
2020-01-31 09:28:34,937 : INFO : EPOCH 4 - PROGRESS: at 41.29% examples, 645120 words/s,

2020-01-31 09:29:31,746 : INFO : EPOCH 5 - PROGRESS: at 69.12% examples, 673030 words/s, in_qsize 20, out_qsize 1
2020-01-31 09:29:32,759 : INFO : EPOCH 5 - PROGRESS: at 71.33% examples, 673171 words/s, in_qsize 17, out_qsize 2
2020-01-31 09:29:33,785 : INFO : EPOCH 5 - PROGRESS: at 73.85% examples, 674022 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:29:34,777 : INFO : EPOCH 5 - PROGRESS: at 76.05% examples, 674479 words/s, in_qsize 16, out_qsize 3
2020-01-31 09:29:35,794 : INFO : EPOCH 5 - PROGRESS: at 78.07% examples, 673690 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:29:36,819 : INFO : EPOCH 5 - PROGRESS: at 80.29% examples, 673231 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:29:37,852 : INFO : EPOCH 5 - PROGRESS: at 82.54% examples, 672718 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:29:38,851 : INFO : EPOCH 5 - PROGRESS: at 84.67% examples, 672587 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:29:39,872 : INFO : EPOCH 5 - PROGRESS: at 86.93% examples, 671732 words/s,

2020-01-31 09:30:29,874 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-01-31 09:30:29,878 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-01-31 09:30:29,882 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-01-31 09:30:29,886 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-01-31 09:30:29,894 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-01-31 09:30:29,898 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-31 09:30:29,910 : INFO : EPOCH 1 - PROGRESS: at 100.00% examples, 680126 words/s, in_qsize 0, out_qsize 1
2020-01-31 09:30:29,910 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-31 09:30:29,910 : INFO : EPOCH - 1 : training on 41519355 raw words (30350658 effective words) took 44.6s, 680079 effective words/s
2020-01-31 09:30:30,909 : INFO : EPOCH 2 - PROGRESS: at 2.17% examples, 684860 words/s, in_qsize 19, out_qsiz

2020-01-31 09:31:27,931 : INFO : EPOCH 3 - PROGRESS: at 22.35% examples, 619582 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:31:28,931 : INFO : EPOCH 3 - PROGRESS: at 23.90% examples, 618951 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:31:29,935 : INFO : EPOCH 3 - PROGRESS: at 25.82% examples, 617248 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:31:30,990 : INFO : EPOCH 3 - PROGRESS: at 28.40% examples, 619403 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:31:32,014 : INFO : EPOCH 3 - PROGRESS: at 30.79% examples, 621861 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:31:33,072 : INFO : EPOCH 3 - PROGRESS: at 33.32% examples, 624240 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:31:34,082 : INFO : EPOCH 3 - PROGRESS: at 35.74% examples, 629194 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:31:35,096 : INFO : EPOCH 3 - PROGRESS: at 38.11% examples, 630561 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:31:36,106 : INFO : EPOCH 3 - PROGRESS: at 40.49% examples, 632805 words/s,

2020-01-31 09:32:32,138 : INFO : EPOCH 4 - PROGRESS: at 67.98% examples, 681725 words/s, in_qsize 13, out_qsize 6
2020-01-31 09:32:33,155 : INFO : EPOCH 4 - PROGRESS: at 69.96% examples, 679521 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:32:34,157 : INFO : EPOCH 4 - PROGRESS: at 72.10% examples, 678520 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:32:35,194 : INFO : EPOCH 4 - PROGRESS: at 74.47% examples, 677535 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:32:36,196 : INFO : EPOCH 4 - PROGRESS: at 76.36% examples, 675557 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:32:37,196 : INFO : EPOCH 4 - PROGRESS: at 78.56% examples, 675833 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:32:38,200 : INFO : EPOCH 4 - PROGRESS: at 80.78% examples, 675949 words/s, in_qsize 17, out_qsize 2
2020-01-31 09:32:39,249 : INFO : EPOCH 4 - PROGRESS: at 83.03% examples, 675030 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:32:40,248 : INFO : EPOCH 4 - PROGRESS: at 85.22% examples, 675292 words/s,

2020-01-31 09:33:31,434 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-01-31 09:33:31,446 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-01-31 09:33:31,454 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-01-31 09:33:31,458 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-31 09:33:31,458 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-31 09:33:31,462 : INFO : EPOCH - 5 : training on 41519355 raw words (30346305 effective words) took 44.9s, 676045 effective words/s
2020-01-31 09:33:32,471 : INFO : EPOCH 6 - PROGRESS: at 2.09% examples, 654000 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:33:33,472 : INFO : EPOCH 6 - PROGRESS: at 4.27% examples, 655317 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:33:34,507 : INFO : EPOCH 6 - PROGRESS: at 6.60% examples, 669048 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:33:35,539 : INFO : EPOCH 6 - PROGRESS: at 8.81% exam

2020-01-31 09:34:32,320 : INFO : EPOCH 7 - PROGRESS: at 33.66% examples, 675713 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:34:33,350 : INFO : EPOCH 7 - PROGRESS: at 36.05% examples, 675797 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:34:34,342 : INFO : EPOCH 7 - PROGRESS: at 38.50% examples, 676408 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:34:35,355 : INFO : EPOCH 7 - PROGRESS: at 41.11% examples, 678562 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:34:36,366 : INFO : EPOCH 7 - PROGRESS: at 43.61% examples, 679366 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:34:37,415 : INFO : EPOCH 7 - PROGRESS: at 46.19% examples, 678109 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:34:38,429 : INFO : EPOCH 7 - PROGRESS: at 48.56% examples, 678845 words/s, in_qsize 19, out_qsize 2
2020-01-31 09:34:39,431 : INFO : EPOCH 7 - PROGRESS: at 51.04% examples, 679950 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:34:40,453 : INFO : EPOCH 7 - PROGRESS: at 53.15% examples, 678725 words/s,

2020-01-31 09:35:36,643 : INFO : EPOCH 8 - PROGRESS: at 79.02% examples, 675748 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:35:37,648 : INFO : EPOCH 8 - PROGRESS: at 81.27% examples, 675918 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:35:38,649 : INFO : EPOCH 8 - PROGRESS: at 83.54% examples, 675896 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:35:39,677 : INFO : EPOCH 8 - PROGRESS: at 85.65% examples, 675366 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:35:40,711 : INFO : EPOCH 8 - PROGRESS: at 88.35% examples, 676500 words/s, in_qsize 20, out_qsize 1
2020-01-31 09:35:41,702 : INFO : EPOCH 8 - PROGRESS: at 90.74% examples, 676565 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:35:42,750 : INFO : EPOCH 8 - PROGRESS: at 93.24% examples, 677517 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:35:43,755 : INFO : EPOCH 8 - PROGRESS: at 95.64% examples, 677495 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:35:44,763 : INFO : EPOCH 8 - PROGRESS: at 97.89% examples, 676927 words/s,

2020-01-31 09:36:32,584 : INFO : EPOCH 10 - PROGRESS: at 4.12% examples, 633354 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:36:33,618 : INFO : EPOCH 10 - PROGRESS: at 6.22% examples, 631241 words/s, in_qsize 17, out_qsize 2
2020-01-31 09:36:34,650 : INFO : EPOCH 10 - PROGRESS: at 8.33% examples, 633411 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:36:35,678 : INFO : EPOCH 10 - PROGRESS: at 10.09% examples, 633599 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:36:36,695 : INFO : EPOCH 10 - PROGRESS: at 11.93% examples, 640056 words/s, in_qsize 20, out_qsize 0
2020-01-31 09:36:37,690 : INFO : EPOCH 10 - PROGRESS: at 14.05% examples, 652835 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:36:38,711 : INFO : EPOCH 10 - PROGRESS: at 16.03% examples, 654194 words/s, in_qsize 19, out_qsize 0
2020-01-31 09:36:39,715 : INFO : EPOCH 10 - PROGRESS: at 17.79% examples, 654172 words/s, in_qsize 18, out_qsize 1
2020-01-31 09:36:40,740 : INFO : EPOCH 10 - PROGRESS: at 19.65% examples, 657071 wo

(303486803, 415193550)

In [21]:
w1 = "dirty"
model.wv.most_similar (positive=w1)

2020-01-31 09:37:28,222 : INFO : precomputing L2-norms of word weight vectors


[('filthy', 0.8649767637252808),
 ('unclean', 0.7816728949546814),
 ('stained', 0.7680723667144775),
 ('grubby', 0.7676006555557251),
 ('dusty', 0.7526466846466064),
 ('smelly', 0.7517306208610535),
 ('dingy', 0.7335317134857178),
 ('gross', 0.7160970568656921),
 ('disgusting', 0.7145689129829407),
 ('mouldy', 0.7115984559059143)]

In [22]:
# look up top 6 words similar to 'polite'
w1 = ["polite"]
model.wv.most_similar (positive=w1,topn=6)

[('courteous', 0.9213926196098328),
 ('friendly', 0.8371918201446533),
 ('cordial', 0.8164287209510803),
 ('professional', 0.7834046483039856),
 ('curteous', 0.7721035480499268),
 ('attentive', 0.7692217826843262)]

In [23]:
# look up top 6 words similar to 'france'
w1 = ["france"]
model.wv.most_similar (positive=w1,topn=6)

[('canada', 0.6766113042831421),
 ('germany', 0.6655616164207458),
 ('spain', 0.6482937335968018),
 ('mexico', 0.6131438612937927),
 ('hawaii', 0.611734926700592),
 ('england', 0.6073362827301025)]

In [24]:
# get everything related to stuff on the bed
w1 = ["bed",'sheet','pillow']
w2 = ['couch']
model.wv.most_similar (positive=w1,negative=w2,topn=10)

[('duvet', 0.720068633556366),
 ('blanket', 0.7177280187606812),
 ('mattress', 0.711969256401062),
 ('quilt', 0.6905918717384338),
 ('matress', 0.6876316070556641),
 ('pillows', 0.6443690657615662),
 ('pillowcase', 0.6401588916778564),
 ('sheets', 0.6301946043968201),
 ('comforter', 0.6158015727996826),
 ('foam', 0.6149501800537109)]

In [25]:
# similarity between two different words
model.wv.similarity(w1="dirty",w2="smelly")

0.7517306

In [26]:
# Which one is the odd one out in this list?
model.wv.doesnt_match(["cat","dog","france"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'france'

In [27]:
# print word vector
model.wv['dirty']

array([ 1.2328315 ,  2.3876312 , -3.145297  ,  2.4398775 , -2.596409  ,
       -2.5369923 , -0.3779383 ,  3.4048283 , -0.36356768, -1.6974839 ,
        0.21198772, -1.1418074 ,  1.4528807 , -0.67290956,  0.88678986,
       -2.4212637 ,  5.3189297 ,  1.0757921 , -2.4973161 ,  0.21883395,
       -2.4325886 , -1.4251771 ,  3.550189  ,  0.36202204,  3.1418197 ,
        1.9262061 ,  1.0360358 ,  2.9860694 , -1.8334814 ,  1.6458569 ,
        1.7575328 ,  2.3585575 , -3.864418  , -2.1340988 , -1.0610147 ,
       -1.7900633 ,  3.1246374 ,  0.31782976,  1.6041994 ,  0.49191818,
        1.6365526 , -2.7526255 ,  3.4192998 , -1.1165683 , -0.84559005,
        0.9656256 , -3.1016958 ,  0.8639137 , -0.59646815,  2.7678509 ,
        2.8266351 , -2.735863  ,  2.1288145 ,  1.3084606 , -1.9494946 ,
       -0.96154636, -0.5961025 , -2.9837945 ,  0.7618791 ,  2.1486118 ,
       -3.4807298 ,  0.33803144, -0.95271575,  2.6181896 ,  3.8802657 ,
       -0.8704985 ,  1.5160646 ,  0.15534614,  0.55781347, -1.93

# Glove  
ref: https://github.com/maciejkula/glove-python 

In [31]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# download the model and return as object ready for use
model_glove_twitter = api.load("glove-twitter-25")

model_glove_twitter.wv.most_similar("pelosi",topn=10)

2020-01-31 11:51:01,086 : INFO : loading projection weights from C:\Users\isaac/gensim-data\glove-twitter-25\glove-twitter-25.gz
2020-01-31 11:52:13,857 : INFO : loaded (1193514, 25) matrix from C:\Users\isaac/gensim-data\glove-twitter-25\glove-twitter-25.gz
  import sys
2020-01-31 11:52:13,872 : INFO : precomputing L2-norms of word weight vectors


[('clegg', 0.9653650522232056),
 ('miliband', 0.9515050649642944),
 ('bachmann', 0.9484401345252991),
 ('mcconnell', 0.9416399002075195),
 ('carney', 0.9340257048606873),
 ('coulter', 0.9311323761940002),
 ('boehner', 0.9286301732063293),
 ('santorum', 0.9269058704376221),
 ('farage', 0.919365406036377),
 ('mourdock', 0.9186689853668213)]

In [32]:
model_glove_twitter.wv['dirty']

  """Entry point for launching an IPython kernel.


array([-4.0317e-01,  6.1409e-03,  3.8262e-01,  3.9230e-01,  5.8592e-01,
        6.5074e-01,  1.0542e+00, -6.0706e-01, -4.1578e-01,  2.8592e-01,
       -6.4108e-01,  4.4421e-01, -3.6479e+00, -1.0700e+00,  3.5746e-03,
        9.2253e-02,  6.6907e-01, -7.1866e-01, -2.7309e-02,  6.2390e-01,
        4.4713e-01,  8.7072e-01,  9.5165e-01, -1.0244e+00,  1.2813e-01],
      dtype=float32)

In [33]:
model_glove_twitter.wv.most_similar("policies",topn=10)

  """Entry point for launching an IPython kernel.


[('policy', 0.9484813213348389),
 ('reforms', 0.9403933882713318),
 ('laws', 0.94012051820755),
 ('government', 0.9230710864067078),
 ('regulations', 0.9168934226036072),
 ('economy', 0.9110006093978882),
 ('immigration', 0.9105910062789917),
 ('legislation', 0.9089650511741638),
 ('govt', 0.9054746627807617),
 ('regulation', 0.9050779342651367)]

In [34]:
model_glove_twitter.wv.doesnt_match(["trump","bernie","obama","pelosi","orange"])

  """Entry point for launching an IPython kernel.


'orange'

In [35]:
import gensim.downloader as api
#again, download and load the model
model_gigaword = api.load("glove-wiki-gigaword-100")



2020-01-31 11:54:51,687 : INFO : glove-wiki-gigaword-100 downloaded
2020-01-31 11:54:51,717 : INFO : loading projection weights from C:\Users\isaac/gensim-data\glove-wiki-gigaword-100\glove-wiki-gigaword-100.gz
2020-01-31 11:55:59,469 : INFO : loaded (400000, 100) matrix from C:\Users\isaac/gensim-data\glove-wiki-gigaword-100\glove-wiki-gigaword-100.gz


In [36]:
# find similarity
model_gigaword.wv.most_similar(positive=['dirty','grimy'],topn=10)

  
2020-01-31 11:57:21,180 : INFO : precomputing L2-norms of word weight vectors


[('filthy', 0.7690386772155762),
 ('smelly', 0.7392697334289551),
 ('shabby', 0.7025482058525085),
 ('dingy', 0.7022336721420288),
 ('grubby', 0.6754513382911682),
 ('grungy', 0.6414023041725159),
 ('dank', 0.626369833946228),
 ('sweaty', 0.622745156288147),
 ('dreary', 0.6216243505477905),
 ('gritty', 0.621574878692627)]

# Doc2Vect

In [1]:
#python example to train doc2vec model (with or without pre-trained word embeddings)

import gensim.models as g
import logging

#doc2vec parameters
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0 #0 = dbow; 1 = dmpv
worker_count = 1 #number of parallel processes

#pretrained word embeddings
pretrained_emb = "./dataset/toy_data/pretrained_word_embeddings.txt" #None if use without pretrained embeddings

#input corpus
train_corpus = "./dataset/toy_data/train_docs.txt"

#output model
saved_path = "./dataset/toy_data/model.bin"

#enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#train doc2vec model
docs = g.doc2vec.TaggedLineDocument(train_corpus)
model = g.Doc2Vec(docs, size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold, workers=worker_count, hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1, pretrained_emb=pretrained_emb, iter=train_epoch)

#save model
model.save(saved_path)


2020-01-29 19:38:25,965 : INFO : collecting all words and their counts
2020-01-29 19:38:25,994 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-01-29 19:38:26,033 : INFO : collected 11097 word types and 1000 unique tags from a corpus of 1000 examples and 84408 words
2020-01-29 19:38:26,034 : INFO : Loading a fresh vocabulary
2020-01-29 19:38:26,063 : INFO : effective_min_count=1 retains 11097 unique words (100% of original 11097, drops 0)
2020-01-29 19:38:26,064 : INFO : effective_min_count=1 leaves 84408 word corpus (100% of original 84408, drops 0)
2020-01-29 19:38:26,141 : INFO : deleting the raw counts dictionary of 11097 items
2020-01-29 19:38:26,143 : INFO : sample=1e-05 downsamples 3599 most-common words
2020-01-29 19:38:26,145 : INFO : downsampling leaves estimated 22704 word corpus (26.9% of prior 84408)
2020-01-29 19:38:26,176 : INFO : estimated required memory for 11097 words and 300 dimensions: 33381300 bytes
2020-01-29 19:38:26,177 : INF

2020-01-29 19:38:53,556 : INFO : EPOCH - 29 : training on 84408 raw words (23552 effective words) took 0.9s, 27079 effective words/s
2020-01-29 19:38:54,391 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:38:54,392 : INFO : EPOCH - 30 : training on 84408 raw words (23625 effective words) took 0.8s, 28326 effective words/s
2020-01-29 19:38:55,193 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:38:55,193 : INFO : EPOCH - 31 : training on 84408 raw words (23690 effective words) took 0.8s, 29215 effective words/s
2020-01-29 19:38:55,984 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:38:55,985 : INFO : EPOCH - 32 : training on 84408 raw words (23772 effective words) took 0.8s, 30588 effective words/s
2020-01-29 19:38:56,773 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:38:56,775 : INFO : EPOCH - 33 : training on 84408 raw words (23742 effective words) took 0

2020-01-29 19:39:22,417 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:22,418 : INFO : EPOCH - 59 : training on 84408 raw words (23693 effective words) took 1.0s, 22852 effective words/s
2020-01-29 19:39:23,321 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:23,322 : INFO : EPOCH - 60 : training on 84408 raw words (23766 effective words) took 0.9s, 26339 effective words/s
2020-01-29 19:39:24,210 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:24,211 : INFO : EPOCH - 61 : training on 84408 raw words (23851 effective words) took 0.9s, 26917 effective words/s
2020-01-29 19:39:25,068 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:25,068 : INFO : EPOCH - 62 : training on 84408 raw words (23707 effective words) took 0.9s, 27426 effective words/s
2020-01-29 19:39:25,970 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19

2020-01-29 19:39:53,088 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:53,089 : INFO : EPOCH - 96 : training on 84408 raw words (23672 effective words) took 0.8s, 29428 effective words/s
2020-01-29 19:39:54,096 : INFO : EPOCH 97 - PROGRESS: at 82.60% examples, 19470 words/s, in_qsize 2, out_qsize 0
2020-01-29 19:39:54,258 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:54,260 : INFO : EPOCH - 97 : training on 84408 raw words (23762 effective words) took 1.2s, 20323 effective words/s
2020-01-29 19:39:55,062 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:55,064 : INFO : EPOCH - 98 : training on 84408 raw words (23675 effective words) took 0.8s, 29595 effective words/s
2020-01-29 19:39:56,033 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-29 19:39:56,034 : INFO : EPOCH - 99 : training on 84408 raw words (23771 effective words) took 1.0s, 24539 effective

In [3]:
#python example to infer document vectors from trained doc2vec model
import gensim.models as g
import codecs

#parameters
model="./dataset/toy_data/model.bin"
test_docs="./dataset/toy_data/test_docs.txt"
output_file="./dataset/toy_data/test_vectors.txt"

#inference hyper-parameters
start_alpha=0.01
infer_epoch=1000

#load model
m = g.Doc2Vec.load(model)
test_docs = [ x.strip().split() for x in codecs.open(test_docs, "r", "utf-8").readlines() ]

print('test docs:\n{}'.format(test_docs))
#infer test vectors
output = open(output_file, "w")
for d in test_docs:
    output.write( " ".join([str(x) for x in m.infer_vector(d, alpha=start_alpha, steps=infer_epoch)]) + "\n" )
output.flush()
output.close()


2020-01-29 19:42:55,761 : INFO : loading Doc2Vec object from ./dataset/toy_data/model.bin
2020-01-29 19:42:56,008 : INFO : loading vocabulary recursively from ./dataset/toy_data/model.bin.vocabulary.* with mmap=None
2020-01-29 19:42:56,008 : INFO : loading trainables recursively from ./dataset/toy_data/model.bin.trainables.* with mmap=None
2020-01-29 19:42:56,008 : INFO : loading wv recursively from ./dataset/toy_data/model.bin.wv.* with mmap=None
2020-01-29 19:42:56,008 : INFO : loading docvecs recursively from ./dataset/toy_data/model.bin.docvecs.* with mmap=None
2020-01-29 19:42:56,008 : INFO : loaded ./dataset/toy_data/model.bin


test docs:
[['the', 'cardigan', 'welsh', 'corgi', 'is', 'one', 'of', 'two', 'separate', 'dog', 'breeds', 'known', 'as', 'welsh', 'corgis', 'that', 'originated', 'in', 'wales', ',', 'the', 'other', 'being', 'the', 'pembroke', 'welsh', 'corgi', '.', 'it', 'is', 'one', 'of', 'the', 'oldest', 'herding', 'breeds', '.'], ['cardigan', 'welsh', 'corgis', 'can', 'be', 'extremely', 'loyal', 'family', 'dogs', '.', 'they', 'are', 'able', 'to', 'live', 'in', 'a', 'variety', 'of', 'settings', ',', 'from', 'apartments', 'to', 'farms', '.', 'for', 'their', 'size', ',', 'however', ',', 'they', 'need', 'a', 'surprising', 'amount', 'of', 'daily', 'physical', 'and', 'mental', 'stimulation', '.', 'cardigans', 'are', 'a', 'very', 'versatile', 'breed', 'and', 'a', 'wonderful', 'family', 'companion', '.'], ['pembrokes', 'and', 'cardigans', 'first', 'appeared', 'together', 'in', 'dog', 'shows', 'in', '1925', 'when', 'they', 'were', 'shown', 'under', 'the', 'rules', 'of', 'the', 'kennel', 'club', 'in', 'britain