In [7]:
import configparser
import re
import sys
from datetime import datetime
from urllib.parse import quote_plus

from pymongo import MongoClient

# set python syspath to point out location of our self-writing module
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

from Sentence_Segmentation import Sentence_Segmentation
from Chinese_Tokenizer import Tokenizer
from Chinese_POSTagger import POSTagger

In [8]:
### init and read config
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing config fails with a
# clear message instead of a KeyError on the "MLDoc" section below.
if not config.read('../config.ini'):
    raise FileNotFoundError("Could not read configuration file '../config.ini'")

### connect to mongodb
MongoDB = config["MLDoc"]["Database"]
MongoUser = config["MLDoc"]["User"]
MongoPW = config["MLDoc"]["PW"]

# Credentials embedded in a MongoDB URI must be percent-escaped, otherwise a
# user name or password containing ':', '/', '@' or '%' corrupts the URI.
# NOTE(review): assumes config.ini stores the raw (unescaped) credentials.
uri = (
    "mongodb://" + quote_plus(MongoUser) + ":" + quote_plus(MongoPW)
    + "@140.117.69.70:30241/" + MongoDB + "?authMechanism=SCRAM-SHA-1"
)

client = MongoClient(uri)
# The working database name is fixed to "MLDoc"; the configured `MongoDB`
# value is only used as the database segment of the connection URI.
db = client.MLDoc

In [9]:
# Smoke test: run one stored document through segmentation -> tokenization ->
# POS tagging and print every intermediate result for visual inspection.
one_document = db.Chinese.find_one()
# find_one() returns None when the collection is empty; fail with a clear
# message instead of a TypeError on the subscript below.
if one_document is None:
    raise LookupError("db.Chinese contains no documents to test with")
print(one_document)

# r"\." is a raw string: the plain literal "\." has the same value but is an
# invalid escape sequence that triggers a warning on recent Python versions.
chi_results = Sentence_Segmentation(one_document["Content"], rep_period_regexp=r"\.", keep_digits=False)
print(chi_results)

chi_tokens = Tokenizer(chi_results)
print(chi_tokens)

chi_pos = POSTagger(chi_tokens)
print(chi_pos)

{'_id': ObjectId('5bf419bdd3d28003f2b4057c'), 'Class': 'CCAT', 'Content': ' [路透社香港14日電]    中國最大的電子彩   管生產商--深圳賽格股份有限公司表示,今年上半年   因產量增加,預計97年上半年稅后利潤增幅約30%.   據深賽格96年報顯示,96年實現稅后利潤   1.42億元人民幣,96年中期實現6,017萬元.   該公司的一位人士對香港中國証券快訊表示,   盡管賽格公司實施了96年每10股送3股的分紅方案,但   97年上半年的利潤仍保持在0.29元左右.但他未說明是   根據國內會計標准抑或境外會計標准.   他指出,利潤增加的主要原因是公司產品產量   的提高和其它投資收益,包括已在深圳完工的諸運大   廈,不過他未給出具體細節.   深賽格2,120萬股A股于96年12月26日在深圳   掛牌.8,000萬B股亦于96年7月22日在深掛牌. (完)   (c) Reuters Limited 1997 ', 'Sub_corpus': 'FDCH14', 'File_name': '29566'}
['路透社香港日电中国最大的电子彩管生产商深圳赛格股份有限公司表示,今年上半年因产量增加,预计年上半年税后利润增幅约', '据深赛格年报显示,年实现税后利润', '亿元人民币,年中期实现,万元', '该公司的一位人士对香港中国証券快讯表示,尽管赛格公司实施了年每股送股的分红方案,但年上半年的利润仍保持在', '元左右', '但他未说明是根据国内会计标准抑或境外会计标准', '他指出,利润增加的主要原因是公司产品产量的提高和其它投资收益,包括已在深圳完工的诸运大厦,不过他未给出具体细节', '深赛格,万股A股于年月日在深圳挂牌', ',万B股亦于年月日在深挂牌', '完cReutersLimited']
[['路透社', '香港', '日电', '中国', '最大', '的', '电子', '彩管', '生产商', '深圳', '赛格', '股份', '有限公司', '表示', ',', '今年', '上半年', '因', '产量', '增加', ',', '预计', '年', '上半年', '税后', '利润', '增幅', '约'], ['据', '深赛格', '

In [19]:
def update(target_collection, doc_id, sentences, tokens, pos):
    """Store the Chinese NLP results on one document and flag it as processed.

    This updater is specific to the Chinese NLP process, because the Chinese
    and English pipelines produce different output fields.

    Args:
        target_collection: Collection object exposing ``update_one``.
        doc_id: ``_id`` value of the document to update.
        sentences: Sentence segmentation output for the document.
        tokens: Tokenizer output for the document.
        pos: POS tagger output for the document.
    """
    selector = {"_id": doc_id}
    nlp_payload = dict(sentences=sentences, tokens=tokens, pos=pos)
    # Write the results and the "processed" marker in a single $set.
    changes = {"$set": {"chi_result": nlp_payload, "chi_nlp_process": True}}
    target_collection.update_one(selector, changes)

In [21]:
# Count the documents that do not yet carry the "chi_nlp_process" marker.
target_collection = db.Chinese
# Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
# count_documents() is the replacement for a filtered count.
num = target_collection.count_documents({"chi_nlp_process": {"$exists": False}})
print("{0} documents need to be processed.".format(num))

24533 documents need to be processed.


In [22]:
# Fetch only the fields the pipeline needs for every not-yet-processed document.
# no_cursor_timeout=True is requested because the whole run is long (the
# recorded run took about 22 minutes).
docs = target_collection.find({"chi_nlp_process":{"$exists": False}},{"_id":1, "Content":1}, no_cursor_timeout=True)

# improved version
start_time = datetime.now()

error_list = list()
index = 0

# A cursor opened with no_cursor_timeout=True must be closed explicitly, so
# close it in `finally` even when an NLP step raises part-way through the run.
try:
    for each_document in docs:
        # r"\." avoids the invalid-escape warning of the plain literal "\.".
        sentences = Sentence_Segmentation(each_document["Content"], rep_period_regexp=r"\.", keep_digits=False)
        tokens = Tokenizer(sentences)
        pos = POSTagger(tokens)

        try:
            update(target_collection, each_document["_id"], sentences, tokens, pos)
        except Exception as exc:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still stop the run, and report the
            # reason instead of discarding it.
            error_list.append(each_document["_id"])
            print("Update failed for {0}: {1!r}".format(each_document["_id"], exc))

        index += 1
        if index % 1000 == 0:
            print("Already process %d documents" % index)
finally:
    docs.close()

print("{0} documents got some problems".format(len(error_list)))
print(error_list)

time_elapsed = datetime.now() - start_time

print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

Already process 1000 documents
Already process 2000 documents
Already process 3000 documents
Already process 4000 documents
Already process 5000 documents
Already process 6000 documents
Already process 7000 documents
Already process 8000 documents
Already process 9000 documents
Already process 10000 documents
Already process 11000 documents
Already process 12000 documents
Already process 13000 documents
Already process 14000 documents
Already process 15000 documents
Already process 16000 documents
Already process 17000 documents
Already process 18000 documents
Already process 19000 documents
Already process 20000 documents
Already process 21000 documents
Already process 22000 documents
Already process 23000 documents
Already process 24000 documents
0 documents got some problems
[]
Time elapsed (hh:mm:ss.ms) 0:22:21.705841
