In [4]:
import configparser
import pickle
import re
import sys
from datetime import datetime
from urllib.parse import quote_plus

import pymongo
from pymongo import MongoClient

# set python syspath to point out location of our self-writing module
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

from helper import *

In [5]:
### init and read config
CONFIG_PATH = '../config.ini'
MONGO_HOST = "140.117.69.70"
MONGO_PORT = 30241

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the config file was not loaded.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError("Cannot read config file: " + CONFIG_PATH)

MongoDB = config["ADM"]["Database"]
MongoUser = config["ADM"]["User"]
MongoPW = config["ADM"]["PW"]

### connect to MongoDB
# User name and password are percent-escaped because reserved characters such
# as '@', ':', '/' or '%' would otherwise corrupt the connection URI.
# NOTE(review): assumes config.ini stores the raw (unescaped) credentials - confirm.
uri = "mongodb://{user}:{password}@{host}:{port}/{database}?authMechanism=SCRAM-SHA-1".format(
    user=quote_plus(MongoUser),
    password=quote_plus(MongoPW),
    host=MONGO_HOST,
    port=MONGO_PORT,
    database=MongoDB,
)

client = MongoClient(uri)
db = client.ComparableWiki

In [6]:
### Check whether a word consists only of ASCII characters (English letters / digits) http://hzy3774.iteye.com/blog/2359032
def judge_pure_english(keyword):
    """Return True when every character of `keyword` has a code point below 128.

    An empty `keyword` yields True because no character violates the condition.
    """
    for character in keyword:
        if ord(character) >= 128:
            # One non-ASCII character is enough to reject the word.
            return False
    return True

def extract_selected_pos_word(sentence):
    """Return the noun/verb words of one POS-tagged sentence, tags removed.

    Parameters
    ----------
    sentence : iterable of str
        Tagged tokens. Each token is split at its LAST '#': the text before it
        is the word, the text after it is the POS tag.
        NOTE(review): assumes tokens look like "word#Tag" - confirm against
        the tagger output.

    Returns
    -------
    list of str
        Words, in their original order, that satisfy all of:
        * the POS tag starts with "N" or "V";
        * the word is at least two characters long;
        * the word is not pure ASCII (see `judge_pure_english`).

    Notes
    -----
    The tag is taken from the part after the last '#', the same place where the
    word is cut, so a '#N' / '#V' that merely occurs inside the word itself no
    longer causes the token to be selected.
    """
    words = []
    for token in sentence:
        word, separator, tag = token.rpartition("#")

        # Keep only tokens that carry a tag starting with N (noun) or V (verb).
        if not separator or not tag.startswith(("N", "V")):
            continue

        # Drop single-character words.
        if len(word) < 2:
            continue

        # Drop words made only of ASCII characters (English words, numbers).
        if judge_pure_english(word):
            continue

        words.append(word)
    return words

In [7]:
target_collection = db.ENZH
# Count only documents that carry both the Chinese and the English NLP flags.
both_processed_filter = {"chi_nlp_process": {"$exists": True}, "nlp_process": {"$exists": True}}
# Collection.count() is deprecated since PyMongo 3.7 and removed in PyMongo 4;
# count_documents() is the supported way to count documents matching a filter.
num = target_collection.count_documents(both_processed_filter)
print("Number of Chinese Documents: %d" % num)

Number of Chinese Documents: 405574


In [8]:
def update(target_collection, doc_id, pairString):
    """Write `pairString` into the `pairString_N_V` field of one document.

    The document is selected by `_id == doc_id` and modified with a `$set`
    through `target_collection.update_one`. Nothing is returned.
    """
    selector = {"_id": doc_id}
    modification = {"$set": {"pairString_N_V": pairString}}
    target_collection.update_one(selector, modification)

# Select Nouns and Verbs from Chinese and English wiki corpus

In [None]:
start_time = datetime.now()

query_documents = target_collection.find({"chi_nlp_process": {"$exists": True},
                                          "nlp_process": {"$exists": True}},
                                         no_cursor_timeout=True)

# Number of documents processed so far (stays 0 when the query matches nothing).
index = 0

# The cursor is opened with no_cursor_timeout=True, so it has to be closed
# explicitly; try/finally guarantees that even if the loop fails part-way.
try:
    for index, each_document in enumerate(query_documents, start=1):

        # Chinese side: keep the selected words of every POS-tagged sentence.
        chinese_sentences = list()
        for each_sentence in each_document["chi_result"]["pos"]:
            chinese_sentences += extract_selected_pos_word(each_sentence)

        # English side: keep noun/verb tokens via the helper-module function.
        english_sentences = list()
        for each_sentence in each_document["nested_token_list"]:
            english_sentences += project_function_for_every_document(each_sentence, want_stop=False,
                                                                    want_alpha=True, want_lemma=True,
                                                                    accept_pos = ["NOUN", "VERB"],
                                                                    use_entity=False)

        # The result is stored straight back into MongoDB: pickling all of the
        # data costs a lot of memory, and a generator (the Mongo cursor) can be
        # used later to keep the memory footprint under control.
        update(target_collection, each_document["_id"],
               " ".join(chinese_sentences) + " " + " ".join(english_sentences))

        # Progress report every 2500 documents.
        if index % 2500 == 0:
            print("Already process %d documents" % index)
            time_elapsed = datetime.now() - start_time
            print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
finally:
    query_documents.close()
# Recorded from an earlier full run: Time elapsed (hh:mm:ss.ms) 1:41:14.716752

# Create the vocabulary dictionary and the term-document matrix (presence/absence) with gensim utilities

In [11]:
from gensim import corpora
from gensim.matutils import corpus2csc

# Fetch only the stored noun/verb string of every fully processed document.
query_documents = target_collection.find({"chi_nlp_process": {"$exists": True},
                                          "nlp_process": {"$exists": True}},{"pairString_N_V":1},
                                          no_cursor_timeout=True)

# Documents are streamed from the cursor through a generator, so the whole
# corpus is never held in memory while the vocabulary is built.
# try/finally makes sure the no-timeout cursor is closed even on failure.
try:
    compound_dictionary = corpora.Dictionary(
        doc["pairString_N_V"].split() for doc in query_documents
    )
finally:
    query_documents.close()

In [23]:
# Report the vocabulary size before and after pruning.
vocab_size_before = len(compound_dictionary.token2id)
print("Original size of vocabs: ", vocab_size_before)

# Pruning thresholds passed to gensim: no_below=100 and no_above=0.3 bound the
# document frequency of kept tokens; keep_n=None puts no cap on how many remain.
# NOTE: this mutates compound_dictionary in place, so re-running the cell
# reports the already pruned size as the "original" one.
compound_dictionary.filter_extremes(no_below=100, no_above=0.3, keep_n=None)
compound_dictionary.compactify()

vocab_size_after = len(compound_dictionary.token2id)
print("After shrinking, size of vocabs: ", vocab_size_after)

Original size of vocabs:  2042465
After shrinking, size of vocabs:  56949


In [24]:
%%time
# Fetch only the stored noun/verb string of every fully processed document.
query_documents = target_collection.find({"chi_nlp_process": {"$exists": True},
                                          "nlp_process": {"$exists": True}},{"pairString_N_V":1},
                                          no_cursor_timeout=True)

# Convert each document into its bag-of-words vector over the pruned dictionary.
# try/finally makes sure the no-timeout cursor is closed even on failure.
try:
    compund_corpus = [compound_dictionary.doc2bow(doc["pairString_N_V"].split())
                      for doc in query_documents]
finally:
    query_documents.close()

CPU times: user 2min 55s, sys: 5.06 s, total: 3min
Wall time: 6min 1s


In [25]:
%%time
# Turn the list of bag-of-words vectors into one scipy sparse CSC matrix.
term_document_matrix = corpus2csc(corpus=compund_corpus)

CPU times: user 13.5 s, sys: 696 ms, total: 14.2 s
Wall time: 14.2 s


In [None]:
%%time
# Binarize the matrix (presence/absence): every stored count >= 1 becomes 1.
# Working on the stored values in `.data` avoids building a boolean sparse
# mask and assigning through it, which is much slower on large sparse matrices.
# NOTE(review): equivalent to the masked assignment as long as the matrix has
# no duplicate entries - expected here since it comes from doc2bow vectors.
term_document_matrix.data[term_document_matrix.data >= 1] = 1
print(type(term_document_matrix), term_document_matrix.shape)
# Full term-term co-occurrence matrix, left disabled in this notebook:
#cooccurence_matrix = term_document_matrix @ term_document_matrix.T
#print(term_document_matrix.shape, cooccurence_matrix.shape)

In [41]:
# Spot check: dot product of the rows of term ids 1 and 2, i.e. (once the
# matrix is binarized) the number of documents containing both terms.
first_term_row = term_document_matrix[1, :]
second_term_row = term_document_matrix[2, :]
(first_term_row @ second_term_row.T).toarray()[0, 0]

2705.0

In [42]:
# Persist the dictionary and the term-document matrix together as one tuple,
# using the highest pickle protocol available.
dictionary_and_matrix = (compound_dictionary, term_document_matrix)
with open('wiki_en_zh_NounVerb_tdm.pickle', 'wb') as pickle_file:
    pickle.dump(dictionary_and_matrix, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)