# Environment setup

In [3]:
import codecs
import os
import warnings

import jieba
import numpy as np
import pandas as pd
from gensim.models import FastText
from gensim.models import word2vec
from glove import Glove, Corpus

# Suppress all Python warnings so cell output stays readable.
warnings.filterwarnings("ignore")

# Load the stop-word list: one entry per line, surrounding whitespace removed.
# The context manager closes the file handle once the list has been built.
with codecs.open(r'./stopwords.txt', 'r', 'utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file.readlines()]

# Prepare text

In [4]:
# Load the raw news training set; the path is relative to the notebook's
# working directory.
text = pd.read_csv(r'../../Data/TRAINSET_NEWS.csv')
# Preview the first rows.
text.head()

Unnamed: 0,id,date,title,content
0,20140414_00,20140414,习近平在空军机关调研时强调 加快建设一支空天一体攻防兼备的强大人民空军 为实现中国梦强军梦提...,中共中央总书记、国家主席、中央军委主席习近平14日专程到空军机关就空军建设和军事斗争准备进行...
1,20140414_01,20140414,利比亚临时政府总理辞职,本月8号刚刚被正式任命为利比亚临时政府总理的阿卜杜拉·萨尼13号发表声明说，12号晚他和家人...
2,20140414_02,20140414,关注乌克兰局势,代行乌克兰总统职责的乌克兰议长图尔奇诺夫13号发表讲话，要求占领东部地方政府建筑的抗议者，在...
3,20140414_03,20140414,国内联播快讯,低碳中国行活动正式启动由国家发展改革委等部门共同发起的低碳中国行活动今天正式启动，活动以引导...
4,20140414_04,20140414,刘汉等36人涉黑案继续开庭审理,刘汉、刘维等36人涉嫌犯组织、领导、参加黑社会性质组织罪以及故意杀人罪、包庇、纵容黑社会性质...


# Word segmentation

In [None]:
# Keep only the columns needed for segmentation. The explicit copy makes
# text_data independent of `text`, so any in-place column assignment on
# text_data writes to its own data rather than to a slice of `text` (the
# situation pandas reports as SettingWithCopyWarning).
text_data = text[['date', 'title', 'content']].copy()
text_data.head()

In [None]:
def seg_func(x):
    """Segment a string with jieba and drop unwanted tokens.

    Args:
        x: Text to segment.

    Returns:
        list: Tokens yielded by ``jieba.cut(x, cut_all=False)``, in order,
        excluding tokens found in the module-level ``stopwords`` and tokens
        equal to a single space.
    """
    return [
        token
        for token in jieba.cut(x, cut_all=False)
        if token != ' ' and token not in stopwords
    ]

In [None]:
# Tokenize both text columns. Missing values are replaced with an empty string
# first: astype(str) alone turns NaN into the literal text 'nan', which would
# then be segmented and end up in the training corpus as a spurious token.
# assign() builds a new frame instead of writing into text_data in place, so
# the cell does not depend on chained assignment into a slice of `text`.
text_data = text_data.assign(
    title=text_data['title'].fillna('').astype(str).apply(seg_func),
    content=text_data['content'].fillna('').astype(str).apply(seg_func),
)
text_data.head()

In [None]:
# Persist the segmented corpus. The output directory is created on demand so
# the notebook also runs on a fresh checkout where it does not exist yet.
# NOTE: to_csv writes each token list as text, so reading this file back
# yields strings rather than lists.
os.makedirs('./Word_Embedding_Model', exist_ok=True)
text_data.to_csv(r'./Word_Embedding_Model/seg_words.csv',index=False)

# Train word vectors

In [None]:
# Flatten the two token-list columns row by row into one list of sentences:
# title_0, content_0, title_1, content_1, ...
text_train_vec = list(text_data[['title', 'content']].values.ravel())

## Word2vec

### Word2vec Train

In [None]:
# Train a skip-gram model (sg=1) with negative sampling (hs=0, negative=5).
# workers must be a positive thread count: gensim starts `workers` training
# threads, so the previous value of -1 started none and the saved vectors
# were never actually trained. 4 matches the thread count used for GloVe.
w2vmodel = word2vec.Word2Vec(text_train_vec, size=100,
                             window=5, min_count=0, workers=4, sg=1, hs=0, negative=5)
w2vmodel.save('./Word_Embedding_Model/w2v.model')

### Word2vec Update

In [None]:
# Update word2vec Model
def w2v_retrain(data_file, old_model_file, new_model_file):
    """Continue training a saved Word2Vec model on additional sentences.

    Args:
        data_file: Despite the name, this value is handed to gensim as the
            corpus itself (an iterable of tokenized sentences), not a path.
            It is iterated twice (vocabulary scan, then training), so it must
            be re-iterable rather than a one-shot generator.
        old_model_file: Path of the existing model to load.
        new_model_file: Path where the updated model is saved.
    """
    sents = data_file
    model = word2vec.Word2Vec.load(old_model_file)
    # update=True extends the existing vocabulary with words from sents.
    model.build_vocab(sents, update=True)
    # model.epochs replaces the deprecated model.iter alias and matches the
    # attribute used by ftt_retrain.
    model.train(sents, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(new_model_file)

## GloVe

### GloVe train

In [None]:
# Build the word co-occurrence matrix that GloVe is trained on, using a
# context window of 5 tokens, and save it for reuse.
corpus_model = Corpus()
corpus_model.fit(text_train_vec, window=5)
corpus_model.save('./Word_Embedding_Model/glov_corpus.model')
# Report the vocabulary size and the number of non-zero co-occurrence entries.
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

In [None]:
# Train 100-dimensional GloVe vectors on the co-occurrence matrix held by
# corpus_model.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10,
          no_threads=4, verbose=True)
# Attach the word -> index mapping so a word's row in glove.word_vectors can be
# looked up by the word itself.
glove.add_dictionary(corpus_model.dictionary)
glove.save('./Word_Embedding_Model/glove.model')

## Fasttext

### Fasttext train

In [None]:
# Train a skip-gram (sg=1) FastText model without hierarchical softmax (hs=0).
# workers must be a positive thread count: with the previous value of -1 no
# training threads were started, so the saved vectors were never trained.
# 4 matches the thread count used for GloVe.
fttmodel = FastText(text_train_vec, size=100, window=5,
                    min_count=0, workers=4, sg=1, hs=0)
fttmodel.save('./Word_Embedding_Model/fasttext.model')

### Fasttext update

In [None]:
# Update Fasttext Model
def ftt_retrain(data_file, old_model_file, new_model_file):
    """Resume training of a stored FastText model with new sentences.

    Args:
        data_file: The corpus passed to gensim (tokenized sentences), used
            both to extend the vocabulary and to train.
        old_model_file: Location of the model to load.
        new_model_file: Location the retrained model is written to.
    """
    ft_model = FastText.load(old_model_file)
    ft_model.build_vocab(data_file, update=True)
    ft_model.train(
        data_file,
        total_examples=ft_model.corpus_count,
        epochs=ft_model.epochs,
    )
    ft_model.save(new_model_file)

# Test

In [None]:
# Sanity check: print the vector each model holds for the query word
# (Xi Jinping). gensim lookups go through the model's .wv attribute, because
# indexing the model object directly is deprecated.
print(w2vmodel.wv['习近平'])
print(glove.word_vectors[glove.dictionary['习近平']])
print(fttmodel.wv['习近平'])