# Gensim Basic

In [1]:
import gensim
from gensim import corpora


# Build a gensim Dictionary (token -> integer id) from a list of sentences.
documents = [
    "The Saudis are preparing a report that will acknowledge that",
    "Saudi journalist Jamal Khashoggi's death was the result of an",
    "interrogation that went wrong, one that was intended to lead",
    "to his abduction from Turkey, according to two sources.",
]

documents_2 = [
    "One source says the report will likely conclude that",
    "the operation was carried out without clearance and",
    "transparency and that those involved will be held",
    "responsible. One of the sources acknowledged that the",
    "report is still being prepared and cautioned that",
    "things could change.",
]

# Whitespace tokenization: punctuation stays attached to its word ('wrong,').
texts = [doc.split() for doc in documents]

# One integer id per distinct token.
dictionary = corpora.Dictionary(texts)

# Summary of the dictionary
print(dictionary)
#> Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [2]:
# Show the token -> integer id mapping held by the dictionary.
dictionary.token2id

{'Saudis': 0,
 'The': 1,
 'a': 2,
 'acknowledge': 3,
 'are': 4,
 'preparing': 5,
 'report': 6,
 'that': 7,
 'will': 8,
 'Jamal': 9,
 "Khashoggi's": 10,
 'Saudi': 11,
 'an': 12,
 'death': 13,
 'journalist': 14,
 'of': 15,
 'result': 16,
 'the': 17,
 'was': 18,
 'intended': 19,
 'interrogation': 20,
 'lead': 21,
 'one': 22,
 'to': 23,
 'went': 24,
 'wrong,': 25,
 'Turkey,': 26,
 'abduction': 27,
 'according': 28,
 'from': 29,
 'his': 30,
 'sources.': 31,
 'two': 32}

In [3]:
# Extend the existing dictionary with the tokens of a second batch of sentences.
documents_2 = [
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

texts_2 = [doc.split() for doc in documents_2]

# Updates `dictionary` in place; ids of already-known tokens are kept.
dictionary.add_documents(texts_2)

# The dictionary now also contains the new tokens.
print(dictionary)
#> Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)

print(dictionary.token2id)

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)
{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'graph': 33, 'in': 34, 'intersection': 35, 'paths': 36, 'trees': 37, 'Graph': 38, 'IV': 39, 'Widths': 40, 'and': 41, 'minors': 42, 'ordering': 43, 'quasi': 44, 'well': 45, 'A': 46, 'survey': 47}


# Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

# Alternative: CountVectorizer(stop_words="english")
# fit() learns the vocabulary and returns the vectorizer itself.
vect = CountVectorizer().fit(bards_words)

print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))

# One row per sentence, one column per vocabulary term (sparse matrix).
bag_of_words = vect.transform(bards_words)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# newer versions need get_feature_names_out() instead.
print("Features name:\n{}".format(vect.get_feature_names()))
print("Dense representation of bag_of_words:\n{}".format(bag_of_words.toarray()))
 

Vocabulary size: 13
Vocabulary content:
 {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}
Features name:
['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']
Dense representation of bag_of_words:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [5]:

# Book titles used to compare how vocabulary-pruning options change the
# number of features (columns) produced by CountVectorizer.
text = [
    "One Cent, Two Cents, Old Cent, New Cent: All About Money (Cat in the Hat's Learning Library",
    "Inside Your Outside: All About the Human Body (Cat in the Hat's Learning Library)",
    "Oh, The Things You Can Do That Are Good for You: All About Staying Healthy (Cat in the Hat's Learning Library)",
    "On Beyond Bugs: All About Insects (Cat in the Hat's Learning Library)",
    "There's No Place Like Space: All About Our Solar System (Cat in the Hat's Learning Library)",
]

# The corpus belongs in fit_transform(), not in the constructor: the
# constructor's first parameter is `input` ('content' / 'file' / 'filename'),
# and newer scikit-learn releases make constructor arguments keyword-only,
# so CountVectorizer(text, ...) fails there.

# Custom stop-word list.
model1 = CountVectorizer(stop_words=["all", "in", "the", "is", "and"])
result1_vector = model1.fit_transform(text)
print('result1_vector shape: {}'.format(result1_vector.shape))

# Built-in English stop-word list.
model2 = CountVectorizer(stop_words="english")
result2_vector = model2.fit_transform(text)
print('result2_vector shape: {}'.format(result2_vector.shape))

# min_df: ignore terms that appear in fewer than n documents.
# An int is an absolute count; a float is a proportion of the documents,
# e.g. CountVectorizer(min_df=0.25) ignores terms found in less than 25% of them.
model3 = CountVectorizer(min_df=2)
result3_vector = model3.fit_transform(text)
print('result3_vector shape: {}'.format(result3_vector.shape))

# max_df: ignore terms that appear in more than n documents
# (proportion used here: more than 50% of the documents).
model4 = CountVectorizer(max_df=0.50)
result4_vector = model4.fit_transform(text)
print('result4_vector shape: {}'.format(result4_vector.shape))


result1_vector shape: (5, 40)
result2_vector shape: (5, 24)
result3_vector shape: (5, 8)
result4_vector shape: (5, 35)




# TF-IDF

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
 
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Step 1: turn the corpus into a bag-of-words count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# newer versions need get_feature_names_out() instead.
word = vectorizer.get_feature_names()
print(word)

print(X.toarray())

# Step 2: re-weight the raw counts with TF-IDF.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
tfidf_weight = tfidf.toarray()
print(tfidf_weight)

# Step 3: list every term with its weight, document by document.
for i, weights in enumerate(tfidf_weight):
    print("-------output {}-th document tf-idf weight------".format(i))
    for term, weight in zip(word, weights):
        print(term, weight)


['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]
-------output 0-th document tf-idf weight------
and 0.0
document 0.4387767428592343
first 0.5419765697264572
is 0.4387767428592343
one 0.0
second 0.0
the 0.35872873824808993
third 0.0
this 0.4387767428592343
-------output 1-th document tf-idf weight------
and 0.0
document 0.2723014675233404
first 0.0
is 0.2723014675233404
one 0.0
second 0.8532257361452786
the 0.22262429232510395
third 0.0
this 0.2723014675233404
-------output 2-th document tf

# N-Gram

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
bards_words = [
    "The fool doth think he is wise",
    "but the wise man knows himself to be a fool",
]

# ngram_range=(1, 1): vocabulary of single words only.
vect1 = CountVectorizer(ngram_range=(1, 1))
vect1.fit(bards_words)
print("Vocabulary size: {}".format(len(vect1.vocabulary_)))
print("Vocabulary:\n{}".format(vect1.get_feature_names()))

# ngram_range=(2, 2): vocabulary of adjacent word pairs only.
vect2 = CountVectorizer(ngram_range=(2, 2))
vect2.fit(bards_words)
print("Vocabulary size: {}".format(len(vect2.vocabulary_)))
print("Vocabulary:\n{}".format(vect2.get_feature_names()))
print("Transformed data (dense):\n{}".format(vect2.transform(bards_words).toarray()))

Vocabulary size: 13
Vocabulary:
['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']
Vocabulary size: 14
Vocabulary:
['be fool', 'but the', 'doth think', 'fool doth', 'he is', 'himself to', 'is wise', 'knows himself', 'man knows', 'the fool', 'the wise', 'think he', 'to be', 'wise man']
Transformed data (dense):
[[0 0 1 1 1 0 1 0 0 1 0 1 0 0]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1]]


In [8]:
#https://medium.com/%E6%89%8B%E5%AF%AB%E7%AD%86%E8%A8%98/%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E8%99%95%E7%90%86-%E4%BD%BF%E7%94%A8-n-gram-%E5%AF%A6%E7%8F%BE%E8%BC%B8%E5%85%A5%E6%96%87%E5%AD%97%E9%A0%90%E6%B8%AC-10ac622aab7a

from collections import Counter, namedtuple
import json
import re

# Path of the JSON news dataset (a file path, despite the *_DIR name).
DATASET_DIR = 'dataset/WebNews.json'
with open(DATASET_DIR, encoding = 'utf8') as f:
    dataset = json.load(f)

# Keep only characters in the range U+4E00..U+9FA5 of every record's
# 'detailcontent' field; everything else is removed.
rule = re.compile(r"[^\u4e00-\u9fa5]")
seg_list = [rule.sub('', record['detailcontent']) for record in dataset]
print(seg_list[0])

大年初六桃園八德大溪參香祈福祈求台灣平安桃園建設大步向前桃園市長鄭文燦今日上午前往桃園區清水巖下午前往八德區廣行宮大溪區中庄福德宮永安宮內柵仁安宮溪洲福山巖慈聖宮龍山寺參香並發送桃園福御守福袋給大年初六走春參香的市民朋友鄭市長表示大年初六是清水祖師聖誕也是開工的日子祈求清水祖師庇佑台灣平安健康武漢肺炎疫情不要蔓延到台灣也祈求桃園建設持續大步向前祝福所有鄉親信眾鼠來運轉今年的願望都能努力打拚實現鄭市長也呼籲市府將以高標準進行武漢肺炎防疫工作請市民朋友勤加洗手戴口罩量體溫如需前往人潮較多的地方記得要做好清潔消毒工作此外應避免聽信網路謠言造成恐慌亦可透過衛福部疾病管制署的疾管家獲知最新防疫資訊保護自身及周遭親友的健康安全鄭市長在中庄福德宮表示市府致力推動中庄地區發展中庄不只有調整池攔河堰中庄運動公園即將動工大漢溪邊也將興建堤防及防汛道路市民朋友無論在交通或觀光休憩都將更加便利另外國道號增設大鶯豐德交流道可行性研究已獲得交通部審議通過並陸續辦理相關建設計畫府會也將攜手合作讓交流道順利推動完成今日包括立法委員趙正宇市議員朱珍瑤呂林小鳳李柏坊陳治文黃家齊蔡永芳桃園工策會總幹事陳家濬市府民政局副局長林香美警察局督察長吳坤旭桃園區長陳玉明八德區長邱瑞朝大溪區長陳嘉聰桃園果菜市場公司董事長邱素芬大嵙崁文教基金會執行長李世明清水巖主委邱顯來廣行宮主委李秀明中庄福德宮主委沈琳容永安宮主委林繼雄內柵仁安宮主委簡子嚴溪洲福山巖主委楊賴傳慈聖宮主委蔡水木龍山寺董事長陳有盛等均一同參香


In [24]:
def ngram(documents, N=2):
    """Build a character-level N-gram table for next-token prediction.

    Every document is split into single characters and wrapped in the
    '<s>' / '</s>' boundary markers. For each N-gram seen, the probability of
    its last token given the preceding N-1 tokens is estimated as
    count(N-gram) / count(context), where count(context) is the number of
    N-grams that start with that context.

    Args:
        documents: Iterable of documents; ``list(doc)`` defines the tokens,
            so a string is split into characters.
        N: Order of the model, must be >= 1. With N=1 all tokens are stored
            under the empty context ''.

    Returns:
        dict mapping each context (its N-1 tokens joined into one string) to
        a set of ``Word(word, prob)`` namedtuples. ``prob`` is a string
        formatted with '{:.3g}'; convert it with ``float`` before comparing
        or sorting.

    Raises:
        ValueError: If N is smaller than 1.
    """
    if N < 1:
        raise ValueError('N must be >= 1, got {!r}'.format(N))

    Word = namedtuple('Word', ['word', 'prob'])

    # Numerator: occurrences of every full N-gram.
    gram_counts = Counter()
    for doc in documents:
        tokens = ['<s>'] + list(doc) + ['</s>']
        gram_counts.update(
            tuple(tokens[i:i + N]) for i in range(len(tokens) - N + 1)
        )

    # Denominator: how often each (N-1)-token context is followed by a token.
    # Deriving it from the N-gram counts leaves out the trailing context of
    # each document, which has no successor; counting it made the N=1
    # probabilities sum to less than 1.
    context_counts = Counter()
    for gram, count in gram_counts.items():
        context_counts[gram[:-1]] += count

    ngram_prediction = dict()
    for gram, count in gram_counts.items():
        context = gram[:-1]
        prob = '{:.3g}'.format(count / context_counts[context])
        ngram_prediction.setdefault(''.join(context), set()).add(Word(gram[-1], prob))

    return ngram_prediction

In [10]:
tri_prediction = ngram(seg_list, N=3)
# Preview the first five contexts (their candidates are still unordered sets).
print(dict(list(tri_prediction.items())[0:5]))
# Replace every candidate set with a list ordered from most to least probable.
# `prob` is a formatted string, so it is compared numerically: plain string
# order would rank e.g. '1e-05' above '0.5'.
for word, ng in tri_prediction.items():
    tri_prediction[word] = sorted(ng, key=lambda x: float(x.prob), reverse=True)

{'<s>大': {Word(word='嵙', prob='0.0323'), Word(word='腳', prob='0.0323'), Word(word='量', prob='0.0323'), Word(word='有', prob='0.0323'), Word(word='年', prob='0.419'), Word(word='園', prob='0.0968'), Word(word='溪', prob='0.355')}, '大年': {Word(word='初', prob='1')}, '年初': {Word(word='四', prob='0.109'), Word(word='第', prob='0.0182'), Word(word='期', prob='0.0182'), Word(word='五', prob='0.127'), Word(word='開', prob='0.0364'), Word(word='正', prob='0.0364'), Word(word='送', prob='0.0182'), Word(word='動', prob='0.0909'), Word(word='一', prob='0.0909'), Word(word='評', prob='0.0182'), Word(word='即', prob='0.0182'), Word(word='六', prob='0.0727'), Word(word='發', prob='0.0182'), Word(word='三', prob='0.0364'), Word(word='市', prob='0.0182'), Word(word='二', prob='0.0727'), Word(word='通', prob='0.0182'), Word(word='地', prob='0.0182'), Word(word='會', prob='0.0182'), Word(word='完', prob='0.109'), Word(word='將', prob='0.0182'), Word(word='與', prob='0.0182')}, '初六': {Word(word='是', prob='0.143'), Word(word='桃', p

{}

In [13]:
# Show up to ten next-character candidates stored for the context '韓國'.
text = '韓國'
next_words = list(tri_prediction[text])[:10]
for candidate in next_words:
    # candidate is a Word(word, prob) namedtuple, unpacked in field order.
    print('next word: {}, probability: {}'.format(*candidate))

next word: 隊, probability: 0.2
next word: 首, probability: 0.143
next word: 日, probability: 0.0571
next word: 及, probability: 0.0571
next word: 明, probability: 0.0571
next word: 代, probability: 0.0571
next word: 許, probability: 0.0286
next word: 職, probability: 0.0286
next word: 朴, probability: 0.0286
next word: 語, probability: 0.0286


# CBOW & Skip-gram  
Explanation of Python's `yield` keyword: https://pyzh.readthedocs.io/en/latest/the-python-yield-keyword-explained.html#id8

In [None]:
import gzip
import gensim
import logging
# Configure root logging at INFO level with timestamped messages.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Gzip-compressed review corpus used by this and the following cells.
data_file="./dataset/reviews_data.txt.gz"

# Peek at the first raw line (bytes, because the file is opened in 'rb' mode).
# Uses data_file instead of repeating the path literal.
with gzip.open (data_file, 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break

In [None]:
def read_input(input_file):
    """Lazily yield the preprocessed tokens of each line of a gzip file.

    Args:
        input_file: Path of the gzip-compressed text file, one review per line.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` for every line, in
        file order (a list of words per review).
    """
    print("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as handle:
        for line_no, raw_line in enumerate(handle):
            # Progress message every 10,000 lines, starting with line 0.
            if line_no % 10000 == 0:
                print("read {0} reviews".format(line_no))
            yield gensim.utils.simple_preprocess(raw_line)

# Read the tokenized reviews into a list.
# Each review becomes a series of words,
# so the result is a list of lists.
# NOTE: this rebinds `documents`, replacing the sentence list from the first cell.
documents = list (read_input (data_file))
print("Done reading data file")

In [None]:
# Inspect the tokens of the first review.
documents[0]

In [None]:
'''
Word2Vec model parameters

size:
The size of the dense vector used to represent each token or word. If you have very limited data, then size should be a much smaller value. If you have lots of data, it's good to experiment with various sizes. A value of 100-150 has worked well for me.

window:
The maximum distance between the target word and its neighboring word. If your neighbor's position is greater than the maximum window width to the left and the right, then some neighbors are not considered as being related to the target word. In theory, a smaller window should give you terms that are more related. If you have lots of data, then the window size should not matter too much, as long as it's a decent-sized window.

min_count:
Minimum frequency count of words. The model ignores words that do not satisfy the min_count. Extremely infrequent words are usually unimportant, so it's best to get rid of those. Unless your dataset is really tiny, this does not really affect the model.

workers:
How many threads to use behind the scenes?

sg: sg=1 means skip-gram and sg=0 means CBOW
'''
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to `vector_size`.
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10, sg=0)
# NOTE(review): passing `documents` to the constructor presumably already trains
# the model, so this call would be an additional 10 epochs - confirm that is intended.
model.train(documents,total_examples=len(documents),epochs=10)

In [None]:
# look up the words most similar to 'dirty' (default number of results)
w1 = "dirty"
model.wv.most_similar (positive=w1)

In [None]:
# Look up the 6 words most similar to 'polite' (topn=6).
w1 = ["polite"]
model.wv.most_similar (positive=w1,topn=6)

In [None]:
# Look up the 6 words most similar to 'france' (topn=6).
w1 = ["france"]
model.wv.most_similar (positive=w1,topn=6)

In [None]:
# Get the 10 words most related to things on a bed: 'bed', 'sheet' and 'pillow'
# are passed as positive examples, 'couch' as a negative example.
w1 = ["bed",'sheet','pillow']
w2 = ['couch']
model.wv.most_similar (positive=w1,negative=w2,topn=10)

In [None]:
# Similarity score between the two words 'dirty' and 'smelly'.
model.wv.similarity(w1="dirty",w2="smelly")

In [None]:
# Which word is the odd one out in this list?
model.wv.doesnt_match(["cat","dog","france"])

In [None]:
# Show the vector stored for the word 'dirty'.
model.wv['dirty']

# GloVe  
ref: https://github.com/maciejkula/glove-python 

In [None]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# download the model and return as object ready for use
model_glove_twitter = api.load("glove-twitter-25")

# api.load() returns the KeyedVectors object itself for this dataset, so it is
# queried directly; its `.wv` alias is deprecated in gensim 3.x and removed in 4.x.
model_glove_twitter.most_similar("pelosi",topn=10)

In [None]:
# Vector stored for 'dirty'. model_glove_twitter (from api.load) is already a
# KeyedVectors object, so it is indexed directly instead of through `.wv`.
model_glove_twitter['dirty']

In [None]:
# 10 words most similar to 'policies'. model_glove_twitter (from api.load) is
# already a KeyedVectors object, so it is queried directly instead of through `.wv`.
model_glove_twitter.most_similar("policies",topn=10)

In [None]:
# Odd one out among the listed words. model_glove_twitter (from api.load) is
# already a KeyedVectors object, so it is queried directly instead of through `.wv`.
model_glove_twitter.doesnt_match(["trump","bernie","obama","pelosi","orange"])

In [None]:
import gensim.downloader as api
# Again, download and load a pretrained model, this time "glove-wiki-gigaword-100".
model_gigaword = api.load("glove-wiki-gigaword-100")

In [None]:
# Find the 10 words most similar to 'dirty' and 'grimy' combined.
# model_gigaword (from api.load) is already a KeyedVectors object, so it is
# queried directly; its `.wv` alias is deprecated in gensim 3.x and removed in 4.x.
model_gigaword.most_similar(positive=['dirty','grimy'],topn=10)

# Doc2Vec

In [None]:
#python example to train doc2vec model (with or without pre-trained word embeddings)

import gensim.models as g
import logging

#doc2vec parameters (all passed to g.Doc2Vec below)
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0 #0 = dbow; 1 = dmpv
worker_count = 1 #passed as `workers`

#pretrained word embeddings
pretrained_emb = "./dataset/toy_data/pretrained_word_embeddings.txt" #None if use without pretrained embeddings

#input corpus (read through TaggedLineDocument)
train_corpus = "./dataset/toy_data/train_docs.txt"

#output model
saved_path = "./dataset/toy_data/model.bin"

#enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#train doc2vec model
# NOTE(review): `pretrained_emb` does not appear to be a keyword of upstream
# gensim's Doc2Vec; this call presumably targets a patched gensim build - confirm.
# `size` and `iter` are gensim 3.x names (4.x uses `vector_size` and `epochs`).
# NOTE: this rebinds `model`, which earlier cells use for the Word2Vec model.
docs = g.doc2vec.TaggedLineDocument(train_corpus)
model = g.Doc2Vec(docs, size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold, workers=worker_count, hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1, pretrained_emb=pretrained_emb, iter=train_epoch)

#save model
model.save(saved_path)


In [None]:
#python example to infer document vectors from trained doc2vec model
import gensim.models as g
import codecs

#parameters
# NOTE: `model` is rebound to a path string here, shadowing any model object
# created under that name in earlier cells.
model="./dataset/toy_data/model.bin"
test_docs="./dataset/toy_data/test_docs.txt"
output_file="./dataset/toy_data/test_vectors.txt"

#inference hyper-parameters
start_alpha=0.01
infer_epoch=1000

#load model
m = g.Doc2Vec.load(model)

# Read the test documents, one whitespace-tokenized document per line.
# `test_docs` is the path on the right-hand side and the token lists afterwards.
# The context manager closes the file handle, which was previously leaked.
with codecs.open(test_docs, "r", "utf-8") as docs_file:
    test_docs = [ x.strip().split() for x in docs_file.readlines() ]

print('test docs:\n{}'.format(test_docs))

#infer test vectors: one line of space-separated vector components per document
# `with` guarantees the output file is flushed and closed even if inference fails.
# NOTE(review): `steps` is the older keyword name; gensim 4.x expects `epochs`.
with open(output_file, "w") as output:
    for d in test_docs:
        output.write( " ".join([str(x) for x in m.infer_vector(d, alpha=start_alpha, steps=infer_epoch)]) + "\n" )
