In [18]:
import configparser
import pickle
import re
from datetime import datetime
from urllib.parse import quote_plus

import gensim
import pymongo
from pymongo import MongoClient

import sys
# set python syspath to point out location of our self-writing module
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

from helper import *

### Read the MongoDB settings for this project from the shared config file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of with an opaque
# KeyError on the section lookup below.
if not config.read('../config.ini'):
    raise FileNotFoundError("Could not read configuration file '../config.ini'")

MongoDB = config["MLDoc"]["Database"]
MongoUser = config["MLDoc"]["User"]
MongoPW = config["MLDoc"]["PW"]

### Connect to MongoDB
# The user name and password are percent-escaped before being embedded in the
# URI; otherwise characters such as '@', ':' or '/' in a credential break URI
# parsing. config.ini is therefore expected to hold the raw (unescaped) values.
uri = "mongodb://" + quote_plus(MongoUser) + ":" + quote_plus(MongoPW) + "@140.117.69.70:30241/" + MongoDB + "?authMechanism=SCRAM-SHA-1"

client = MongoClient(uri)
# NOTE(review): the database is selected by the literal name "MLDoc" while the
# URI uses the configured `MongoDB` value — confirm both are meant to match.
db = client.MLDoc

# For Chinese

In [1]:
### Check whether a word consists only of English letters/digits (ASCII); see http://hzy3774.iteye.com/blog/2359032
def judge_pure_english(keyword):
    """Return True when every character of ``keyword`` has a code point below 128.

    An empty string contains no offending character and therefore yields True.
    """
    for character in keyword:
        if ord(character) >= 128:
            # First non-ASCII character decides the answer.
            return False
    return True

def extract_selected_pos_word(sentence, pos_markers=("#N", "#V"), min_length=2):
    """Pick the content words out of one POS-tagged sentence.

    Parameters
    ----------
    sentence : iterable of str
        Tagged tokens; each selected token is expected to look like
        ``word#TAG`` (the text before the last ``#`` is taken as the word).
    pos_markers : iterable of str, optional
        A token is kept when it contains any of these substrings. The default
        selects tags beginning with ``#N`` / ``#V`` (nouns and verbs), which
        is what the original hard-coded check did.
    min_length : int, optional
        Words shorter than this many characters are discarded (default 2,
        i.e. single-character words are dropped).

    Returns
    -------
    list of str
        The surviving words, tag removed, in their original order. Words made
        up purely of ASCII characters (English letters/digits) are excluded.
    """
    selected_words = []
    for token in sentence:
        # Keep only tokens carrying one of the requested POS markers.
        if not any(marker in token for marker in pos_markers):
            continue

        # Strip the POS tag: everything up to the last '#' is the word itself.
        tag_match = re.search('(.*)#', token)
        if tag_match is None:
            # Possible only with a custom marker that has no '#'; there is no
            # tag to strip, so the token is skipped rather than crashing.
            continue
        word = tag_match.group(1)

        # Drop words that are too short.
        if len(word) < min_length:
            continue

        # Drop pure-ASCII (English/digit) words.
        if judge_pure_english(word):
            continue

        selected_words.append(word)
    return selected_words

In [6]:
# Select the Chinese collection and count the documents that carry a
# "chi_nlp_process" field (presumably set once NLP preprocessing has run —
# confirm against the preprocessing notebook).
target_collection = db.Chinese
# Collection.count() is deprecated (and removed in PyMongo 4);
# count_documents() is the supported way to count documents matching a filter.
num = target_collection.count_documents({"chi_nlp_process": {"$exists": True}})
print("Number of Chinese Documents: %d" % num)

Number of Chinese Documents: 24533


In [None]:
# Build the Chinese training corpus: one list of selected words per sentence.
start_time = datetime.now()

# no_cursor_timeout=True disables the server-side idle timeout, so the cursor
# must be closed explicitly — including when the loop below raises.
query_documents = target_collection.find({"chi_nlp_process": {"$exists": True}}, no_cursor_timeout=True)

sentence = list()
index = 0

try:
    for each_document in query_documents:
        # Each entry of chi_result.pos is passed to the extractor as one
        # sentence and contributes one word list to the corpus.
        for each_sentence in each_document["chi_result"]["pos"]:
            sentence.append(extract_selected_pos_word(each_sentence))

        index += 1
        if index % 1000 == 0:
            print("Already process %d documents" % index)
finally:
    # Always release the server-side cursor, even on failure.
    query_documents.close()

time_elapsed = datetime.now() - start_time

print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
# Time elapsed (hh:mm:ss.ms) 0:00:18.737113

In [11]:
# Peek at the first five extracted word lists to sanity-check the POS filtering.
sentence[:5]

[['路透社',
  '香港',
  '中国',
  '电子',
  '彩管',
  '生产商',
  '深圳',
  '赛格',
  '股份',
  '有限公司',
  '表示',
  '今年',
  '上半年',
  '产量',
  '增加',
  '预计',
  '上半年',
  '利润',
  '增幅'],
 ['年报', '显示', '实现', '利润'],
 ['亿元', '人民币', '中期', '实现', '万元'],
 ['公司',
  '人士',
  '香港',
  '中国',
  '快讯',
  '表示',
  '赛格',
  '公司',
  '实施',
  '每股',
  '送股',
  '分红',
  '方案',
  '上半年',
  '利润',
  '保持'],
 []]

In [None]:
# Send gensim's INFO-level progress messages to the notebook output.
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Train the Chinese word2vec model on the extracted noun/verb sentences.
# Keyword names follow the gensim 3.x API used throughout this notebook.
model = gensim.models.Word2Vec(
    sentence,
    size=100,      # embedding dimensionality
    window=5,      # context window size
    min_count=15,  # ignore words rarer than this
    workers=4,     # training threads
    negative=10,   # negative samples per positive example
)

In [14]:
# Sanity check: list the nearest neighbours of "利润" (profit) in the trained embedding space.
model.wv.most_similar("利润")

[('净利润', 0.8554553985595703),
 ('多万元', 0.8341077566146851),
 ('盈利', 0.8136324882507324),
 ('利润总额', 0.7878793478012085),
 ('收入', 0.7827474474906921),
 ('销售收入', 0.7672287225723267),
 ('投资收益', 0.7518516182899475),
 ('亿万元', 0.745165228843689),
 ('扭亏', 0.7345239520072937),
 ('利税', 0.724858283996582)]

In [16]:
# Export the trained vectors as two line-aligned TSV files: line i of the
# embeddings file holds the tab-separated vector of the word on line i of the
# metadata file (vocabulary order taken from model.wv.index2word).
embeddings_path = "../out/MLDoc/Chinese_embeddings.tsv"
metadata_path = "../out/MLDoc/Chinese_metadata.tsv"

# `with` guarantees both files are closed even if a write fails, and the
# explicit UTF-8 encoding keeps the non-ASCII vocabulary independent of the
# machine's locale.
with open(embeddings_path, 'w', encoding='utf-8') as embeddings_out, \
        open(metadata_path, 'w', encoding='utf-8') as metadata_out:
    for each_word in model.wv.index2word:
        # model.wv[word] replaces the deprecated model[word] lookup.
        embeddings_out.write('\t'.join(map(str, model.wv[each_word])) + "\n")
        metadata_out.write(each_word + "\n")

# Persist the full gensim model so it can be reloaded later.
model.save("../out/MLDoc/MLDoc-chinese-word2vec_NV_s100w5m15n10.vec")

2018-11-22 10:30:07,962 : INFO : saving Word2Vec object under ../out/MLDoc/MLDoc-chinese-word2vec_NV_s100w5m15n10.vec, separately None
2018-11-22 10:30:07,963 : INFO : not storing attribute vectors_norm
2018-11-22 10:30:07,964 : INFO : not storing attribute cum_table
2018-11-22 10:30:08,015 : INFO : saved ../out/MLDoc/MLDoc-chinese-word2vec_NV_s100w5m15n10.vec


# For English

In [None]:
# Build the English training corpus: one token list per sentence.
start_time = datetime.now()

target_collection = db.English
# no_cursor_timeout=True disables the server-side idle timeout, so the cursor
# must be closed explicitly — including when the loop below raises.
docs = target_collection.find({"nlp_process": {"$exists": True}}, no_cursor_timeout=True)

index = 0
sentence_list = []
# NOTE(review): never populated in this cell; kept only in case another cell
# still references the name.
id_mongo_dictionary = {}

try:
    for doc in docs:

        for each_sentence in doc["nested_token_list"]:
            # project_function_for_every_document comes from the project's
            # `helper` module; judging by its keyword arguments it keeps
            # alphabetic, lemmatised NOUN/VERB tokens — confirm in helper.py.
            tokens_from_each_sentence = project_function_for_every_document(each_sentence, want_stop=False,
                                                                            want_alpha=True, want_lemma=True,
                                                                            accept_pos = ["NOUN", "VERB"],
                                                                            use_entity=False)
            sentence_list.append(tokens_from_each_sentence)

        index += 1
        if index % 5000 == 0:
            print("Already process %d documents" % index)
finally:
    # Always release the server-side cursor, even on failure.
    docs.close()

time_elapsed = datetime.now() - start_time

print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
# Time elapsed (hh:mm:ss.ms) 0:35:12.122783

In [21]:
# Optional on-disk cache of the extracted sentences (left disabled); uncomment to pickle them.
#pickle.dump( sentence_list, open( "../out/MLDoc/MLDoc_Eng_Sentences.pkl", "wb" ) )
# Number of English sentences collected for training.
len(sentence_list)

9592033

In [None]:
# Train the English word2vec model on the extracted noun/verb sentences.
# Keyword names follow the gensim 3.x API used throughout this notebook.
model = gensim.models.Word2Vec(
    sentence_list,
    size=100,      # embedding dimensionality
    window=5,      # context window size
    min_count=15,  # ignore words rarer than this
    workers=4,     # training threads
    negative=10,   # negative samples per positive example
    sample=1e-5,   # down-sampling threshold for frequent words
)
# it takes about 4min

In [36]:
# Sanity check: list the nearest neighbours of "oil" in the trained embedding space.
model.wv.most_similar("oil")

[('crude', 0.7762783765792847),
 ('gas', 0.7549433708190918),
 ('barrel', 0.7484706044197083),
 ('petroleum', 0.7382102012634277),
 ('liquefy', 0.7294834852218628),
 ('kilolitre', 0.7153576612472534),
 ('refinery', 0.7033140659332275),
 ('refiner', 0.6968631744384766),
 ('oilfield', 0.6891987919807434),
 ('mdo', 0.678256630897522)]

In [37]:
# Export the trained vectors as two line-aligned TSV files: line i of the
# embeddings file holds the tab-separated vector of the word on line i of the
# metadata file (vocabulary order taken from model.wv.index2word).
embeddings_path = "../out/MLDoc/English_embeddings.tsv"
metadata_path = "../out/MLDoc/English_metadata.tsv"

# `with` guarantees both files are closed even if a write fails, and the
# explicit UTF-8 encoding makes the output independent of the machine's locale.
with open(embeddings_path, 'w', encoding='utf-8') as embeddings_out, \
        open(metadata_path, 'w', encoding='utf-8') as metadata_out:
    for each_word in model.wv.index2word:
        # model.wv[word] replaces the deprecated model[word] lookup.
        embeddings_out.write('\t'.join(map(str, model.wv[each_word])) + "\n")
        metadata_out.write(each_word + "\n")

# Persist the full gensim model so it can be reloaded later.
model.save("../out/MLDoc/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.vec")

2018-11-22 18:53:22,487 : INFO : saving Word2Vec object under ../out/MLDoc/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.vec, separately None
2018-11-22 18:53:22,487 : INFO : not storing attribute vectors_norm
2018-11-22 18:53:22,488 : INFO : not storing attribute cum_table
2018-11-22 18:53:22,657 : INFO : saved ../out/MLDoc/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.vec


# For Japanese
Note that we do not apply sentence segmentation here because of a limitation of MeCab; each document is therefore treated as a single sentence.

In [40]:
def extract_selected_pos_word_from_jap(doc):
    """Pick the content words out of one POS-tagged Japanese document.

    ``doc`` is a sequence of tagged token strings. Tokens containing
    ``#名詞`` (noun) or ``#動詞`` (verb) are kept, the tag (everything from the
    last ``#`` onward) is removed, and words that are shorter than two
    characters or made up purely of ASCII characters are discarded.
    Returns the remaining words in their original order.
    """
    kept_words = []
    for token in doc:
        # Select tokens whose tag marks a noun or a verb.
        is_noun_or_verb = "#名詞" in token or "#動詞" in token
        if not is_noun_or_verb:
            continue

        # Remove the POS tag.
        stripped = re.search('(.*)#', token).group(1)

        # Filter out single-character words.
        if len(stripped) < 2:
            continue

        # Filter out English (pure-ASCII) words.
        if judge_pure_english(stripped):
            continue

        kept_words.append(stripped)
    return kept_words

In [41]:
# Select the Japanese collection and count the documents that carry a
# "jap_nlp_process" field (presumably set once NLP preprocessing has run —
# confirm against the preprocessing notebook).
target_collection = db.Japanese
# Collection.count() is deprecated (and removed in PyMongo 4);
# count_documents() is the supported way to count documents matching a filter.
num = target_collection.count_documents({"jap_nlp_process": {"$exists": True}})
# The label previously said "Chinese" (copy-paste slip); this counts Japanese documents.
print("Number of Japanese Documents: %d" % num)

Number of Chinese Documents: 58599


In [None]:
# Build the Japanese training corpus: one list of selected words per document
# (no sentence segmentation is applied for Japanese).
start_time = datetime.now()

# no_cursor_timeout=True disables the server-side idle timeout, so the cursor
# must be closed explicitly — including when the loop below raises.
query_documents = target_collection.find({"jap_nlp_process": {"$exists": True}}, no_cursor_timeout=True)

jap_sentences = list()
index = 0

try:
    for each_document in query_documents:
        jap_sentences.append(extract_selected_pos_word_from_jap(each_document["jap_result"]["pos"]))

        index += 1
        if index % 1000 == 0:
            print("Already process %d documents" % index)
finally:
    # Always release the server-side cursor, even on failure.
    query_documents.close()

time_elapsed = datetime.now() - start_time

print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
# Time elapsed (hh:mm:ss.ms) 0:00:18.737113

In [None]:
# Peek at the first five extracted word lists (one per document) to sanity-check the POS filtering.
jap_sentences[:5]

In [None]:
# Train the Japanese word2vec model on the per-document noun/verb lists.
# Keyword names follow the gensim 3.x API used throughout this notebook.
model = gensim.models.Word2Vec(
    jap_sentences,
    size=100,      # embedding dimensionality
    window=5,      # context window size
    min_count=15,  # ignore words rarer than this
    workers=4,     # training threads
    negative=10,   # negative samples per positive example
    sample=1e-4,   # down-sampling threshold for frequent words
)
# it takes about 4min

In [55]:
# Sanity check: list the nearest neighbours of "投資" (investment) in the trained embedding space.
model.wv.most_similar("投資")

[('外債', 0.6510068774223328),
 ('株式投資', 0.6410746574401855),
 ('ポートフォリオ', 0.6040748357772827),
 ('ベンチャーキャピタル', 0.6006470918655396),
 ('配分', 0.58954918384552),
 ('魅力', 0.5849771499633789),
 ('外国', 0.5676727294921875),
 ('直接投資', 0.5562580823898315),
 ('シフト', 0.5470646619796753),
 ('エマージング', 0.5427674055099487)]

In [56]:
# Export the trained vectors as two line-aligned TSV files: line i of the
# embeddings file holds the tab-separated vector of the word on line i of the
# metadata file (vocabulary order taken from model.wv.index2word).
embeddings_path = "../out/MLDoc/Japanese_embeddings.tsv"
metadata_path = "../out/MLDoc/Japanese_metadata.tsv"

# `with` guarantees both files are closed even if a write fails, and the
# explicit UTF-8 encoding keeps the non-ASCII vocabulary independent of the
# machine's locale.
with open(embeddings_path, 'w', encoding='utf-8') as embeddings_out, \
        open(metadata_path, 'w', encoding='utf-8') as metadata_out:
    for each_word in model.wv.index2word:
        # model.wv[word] replaces the deprecated model[word] lookup.
        embeddings_out.write('\t'.join(map(str, model.wv[each_word])) + "\n")
        metadata_out.write(each_word + "\n")

# Persist the full gensim model so it can be reloaded later.
model.save("../out/MLDoc/MLDoc-japanese-word2vec_NV_s100w5m15n10sam1e-4.vec")

2018-11-22 19:28:59,836 : INFO : saving Word2Vec object under ../out/MLDoc/MLDoc-japanese-word2vec_NV_s100w5m15n10sam1e-4.vec, separately None
2018-11-22 19:28:59,837 : INFO : not storing attribute vectors_norm
2018-11-22 19:28:59,837 : INFO : not storing attribute cum_table
2018-11-22 19:28:59,918 : INFO : saved ../out/MLDoc/MLDoc-japanese-word2vec_NV_s100w5m15n10sam1e-4.vec
