In [1]:
import sys
# set python syspath to point out location of our self-writing module
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

import configparser
import re
from datetime import datetime
from urllib.parse import quote_plus

import MeCab
from pymongo import MongoClient
#from Sentence_Segmentation import Sentence_Segmentation

In [47]:
# Pattern for one analysed line: "<text>\t<comma-separated fields>".
# The second group stops before the last comma on the line.
regex = r"(.+)\t(.+),"

def parse_each_word_feature(test_str):
    """Split one tab-separated analyser line into a word and a tagged word.

    Args:
        test_str: A single line of the form ``<word>\\t<field1>,<field2>,...``.

    Returns:
        A tuple ``(word, word + "#" + field1)`` when the line matches
        ``regex``; ``None`` when it does not (for example, a line without a
        tab or without any comma after the tab).
        ``field1`` is presumably the part-of-speech tag -- verify against the
        dictionary's output format.
    """
    found = re.search(regex, test_str, flags=re.DOTALL)
    if found is None:
        return None

    surface = found.group(1)
    leading_field = found.group(2).split(",")[0]
    return surface, surface + "#" + leading_field

In [53]:
mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# Character class of symbols/punctuation that are stripped before tokenizing.
# Written as adjacent raw strings so that the regex escapes (\-, \(, \[ ...)
# are not treated as (invalid) Python string escapes. The single space at the
# end of the first piece stands in for the indentation spaces that the
# previous backslash line-continuation placed inside the class.
sepical_symbols = (
    r'[＂<>:《》+\-=#$%&()*@＃＄％＆＇\(\)\[\]\{\}（）＊＋－／： '
    r'＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏・━┿│┷┯．−]+'
)

def tokenization_and_pos(article):
    """Tokenize a document with MeCab and tag each token.

    The text is first stripped of the characters in ``sepical_symbols`` and
    of all whitespace and digits, then passed to the module-level tagger
    ``mt``. Each output line is parsed with ``parse_each_word_feature``.

    Args:
        article: The document text (``str``).

    Returns:
        ``(segmented_words, pos_words)``: two parallel lists holding, for
        every parsed token, the first and second element returned by
        ``parse_each_word_feature``.
        If the symbol-stripping step fails (e.g. ``article`` is not a
        string), the string ``"RegExp Error!"`` is returned instead; callers
        must be prepared for that sentinel.
    """
    # Strip special characters; keep the historical sentinel return value on
    # failure, but only for the errors re.sub can actually raise.
    try:
        article = re.sub(sepical_symbols, '', article)
    except (TypeError, re.error):
        return "RegExp Error!"

    article = re.sub(r'[\s\d]+', '', article)  # remove space & digits

    # document level
    segmented_words = []
    pos_words = []

    # The text after the final newline is dropped by [:-1].
    for each_word_raw_str in mt.parse(article).split("\n")[:-1]:
        # Skip only the end-of-sentence marker line itself; a substring test
        # would also discard real tokens whose line contains "EOS".
        if each_word_raw_str == "EOS":
            continue

        # word level
        parsed = parse_each_word_feature(each_word_raw_str)
        if parsed is None:
            # Line does not have the "<word>\t<fields>" shape; nothing to add.
            continue

        word, pos = parsed
        segmented_words.append(word)
        pos_words.append(pos)

    return segmented_words, pos_words

In [54]:
### init and read config
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed, so fail here with a clear message instead of a later
# KeyError on the missing "MLDoc" section.
if not config.read('../config.ini'):
    raise FileNotFoundError("Cannot read configuration file '../config.ini'")

### connect to mongodb
MongoDB = config["MLDoc"]["Database"]
MongoUser = config["MLDoc"]["User"]
MongoPW = config["MLDoc"]["PW"]

# User name and password must be percent-escaped inside a MongoDB URI,
# otherwise characters such as '@', ':' or '/' in them break URI parsing.
# NOTE(review): the host and port are hard-coded; consider moving them to
# config.ini next to the credentials.
uri = (
    "mongodb://" + quote_plus(MongoUser) + ":" + quote_plus(MongoPW)
    + "@140.117.69.70:30241/" + MongoDB + "?authMechanism=SCRAM-SHA-1"
)

client = MongoClient(uri)
db = client.MLDoc

In [None]:
# Smoke test: tokenize a single document from the Japanese collection and
# print the raw text followed by the resulting token / tag lists.
one_document = db.Japanese.find_one()

# find_one() returns None when the collection has no document, so guard
# before subscripting.
if one_document is None:
    print("No document found in db.Japanese")
else:
    print(one_document["Content"])
    segmented_words, pos_words = tokenization_and_pos(one_document["Content"])
    print(segmented_words, pos_words)

In [60]:
def update(target_collection, doc_id, tokens, pos):
    """Store the tokenization result on one document and flag it as processed.

    Writes ``jap_result`` (a dict with keys ``"tokens"`` and ``"pos"``) and
    sets ``jap_nlp_process`` to ``True`` on the document whose ``_id`` equals
    ``doc_id``.
    NOTE(review): the previous comment described this as the update for the
    Chinese NLP process (with different output fields per language), but the
    fields written here are the ``jap_*`` ones -- confirm the intent.

    Args:
        target_collection: Object exposing ``update_one(filter, update)``.
        doc_id: Value matched against the ``_id`` field.
        tokens: Value stored under ``jap_result.tokens``.
        pos: Value stored under ``jap_result.pos``.
    """
    selector = {"_id": doc_id}
    changes = {
        "$set": {
            "jap_result": {"tokens": tokens, "pos": pos},
            "jap_nlp_process": True,
        }
    }
    target_collection.update_one(selector, changes)

In [62]:
# Count the documents that have not been through the Japanese NLP step yet,
# i.e. those without a "jap_nlp_process" field.
target_collection = db.Japanese
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents() is its replacement for filtered counts.
num = target_collection.count_documents({"jap_nlp_process": {"$exists": False}})
print("{0} documents need to be processed.".format(num))

0 documents need to be processed.


In [None]:
# Fetch only the fields needed for every document that has not been
# processed yet. The cursor is opened with no_cursor_timeout=True, so it must
# be closed explicitly (see the finally clause below).
docs = target_collection.find(
    {"jap_nlp_process": {"$exists": False}},
    {"_id": 1, "Content": 1},
    no_cursor_timeout=True,
)

start_time = datetime.now()

error_list = list()  # _id of every document that could not be processed
index = 0

try:
    for each_document in docs:
        # Tokenization and the write-back share one try block so that a single
        # bad document (missing "Content", the "RegExp Error!" sentinel failing
        # to unpack, a write error, ...) is recorded instead of aborting the run.
        try:
            tokens, pos = tokenization_and_pos(each_document["Content"])
            update(target_collection, each_document["_id"], tokens, pos)
        except Exception:
            # Exception (not a bare except) lets KeyboardInterrupt through.
            error_list.append(each_document["_id"])

        index += 1
        if index % 1000 == 0:
            print("Already process %d documents" % index)
finally:
    # Always close the cursor, even when the loop is interrupted.
    docs.close()

print("{0} documents got some problems".format(len(error_list)))
print(error_list)

time_elapsed = datetime.now() - start_time

print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))