In [1]:
import configparser
import pickle
import re
from datetime import datetime
from urllib.parse import quote_plus

import pymongo
from pymongo import MongoClient

import sys
# set python syspath to point out location of our self-writing module
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

from helper import *

In [3]:
### init and read config
config = configparser.ConfigParser()
config.read('./config.ini')

# Connection settings come from the [ADM] section of the config file.
MongoDB = config["ADM"]["Database"]
MongoUser = config["ADM"]["User"]
MongoPW = config["ADM"]["PW"]

### connect to MongoDB
# The user name and password are percent-escaped before being placed in the
# URI: a raw '@', ':' or '/' in either of them would otherwise be read as
# URI syntax and break the connection string.
uri = ("mongodb://" + quote_plus(MongoUser) + ":" + quote_plus(MongoPW) +
       "@140.117.69.70:30241/" + MongoDB + "?authMechanism=SCRAM-SHA-1")

client = MongoClient(uri)
# The URI names the database from the config; the documents themselves are
# read from the ComparableWiki database.
db = client.ComparableWiki

def update(target_collection, doc_id, pairString):
    """Store ``pairString`` in the ``pairString_N_V`` field of one document.

    Args:
        target_collection: object exposing ``update_one(filter, update)``.
        doc_id: value matched against the document's ``_id``.
        pairString: value written to the ``pairString_N_V`` field.
    """
    selector = {"_id": doc_id}
    modification = {"$set": {"pairString_N_V": pairString}}
    target_collection.update_one(selector, modification)

def judge_pure_english(keyword):
    """Return True when every character of ``keyword`` has a code point below 128.

    An empty ``keyword`` yields True because there is no character to reject.
    """
    for character in keyword:
        if ord(character) >= 128:
            return False
    return True

def extract_selected_pos_word_from_jap(doc):
    """Pick the noun and verb surface forms out of a POS-tagged token list.

    Args:
        doc: sequence of strings; the ones kept contain "#名詞" (noun) or
            "#動詞" (verb).

    Returns:
        A list with, for every kept token, the text captured in front of the
        '#', skipping results shorter than two characters and results that
        ``judge_pure_english`` reports as pure ASCII.
    """
    kept_words = []
    for token in doc:
        # keep only tokens carrying a noun or verb tag
        if "#名詞" not in token and "#動詞" not in token:
            continue

        # strip the POS tag
        surface = re.search('(.*)#', token).group(1)

        # drop single-character words
        if len(surface) < 2:
            continue

        # drop English (pure ASCII) words
        if judge_pure_english(surface):
            continue

        kept_words.append(surface)
    return kept_words

In [6]:
# Collection that every later cell reads from and writes to.
target_collection = db["ENJA"]
# Optional sanity check, left disabled because counting takes time:
# num = target_collection.count_documents({"jap_nlp_process": {"$exists": True}, "nlp_process": {"$exists": True}})
# print("Number of Documents: %d" % num)

In [None]:
start_time = datetime.now()

query_documents = target_collection.find({"jap_nlp_process": {"$exists": True},
                                          "nlp_process": {"$exists": True}},
                                          no_cursor_timeout=True)

# Number of documents processed so far (stays 0 when the query matches nothing).
index = 0

# The cursor is opened with no_cursor_timeout=True, so it has to be closed
# explicitly; the finally clause guarantees that even when an iteration raises.
try:
    for index, each_document in enumerate(query_documents, start=1):

        # for each japanese document
        # there is no sentence level for japanese documents
        japanese_sentences = extract_selected_pos_word_from_jap(each_document["jap_result"]["pos"])

        # for each english document
        # NOTE(review): project_function_for_every_document is not defined in this
        # notebook; presumably it comes from the star import of `helper` — confirm.
        english_sentences = list()
        for each_sentence in each_document["nested_token_list"]:
            english_sentences += project_function_for_every_document(each_sentence, want_stop=False,
                                                                    want_alpha=True, want_lemma=True,
                                                                    accept_pos = ["NOUN", "VERB"],
                                                                    use_entity=False)

        # Here the result is stored straight back into Mongo, because pickling
        # the data costs a large memory footprint, and a generator (the Mongo
        # cursor) can later be used to keep memory under control.
        update(target_collection, each_document["_id"],
               " ".join(japanese_sentences) + " " + " ".join(english_sentences))

        if index % 2500 == 0:
            print("Already process %d documents" % index)
            time_elapsed = datetime.now() - start_time
            print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
finally:
    query_documents.close()
# Output recorded from a previous run:
# Already process 392500 documents
# Time elapsed (hh:mm:ss.ms) 0:57:17.214408

# Create the vocabulary dictionary and the term-document matrix (presence/absence) with gensim utilities

In [8]:
from gensim import corpora
from gensim.matutils import corpus2csc

# Only the pairString_N_V field is fetched (plus _id, which Mongo returns by default).
query_documents = target_collection.find({"jap_nlp_process": {"$exists": True},
                                          "nlp_process": {"$exists": True}},{"pairString_N_V":1},
                                          no_cursor_timeout=True)

# Build the vocabulary from the whitespace-separated tokens of every document.
# The documents are streamed through a generator, so they are never all held
# in memory at once. The cursor has no server-side timeout, hence the finally:
# it is closed even when building the dictionary fails part-way.
try:
    compound_dictionary = corpora.Dictionary(
        doc["pairString_N_V"].split() for doc in query_documents)
finally:
    query_documents.close()

In [9]:
# Vocabulary size before pruning.
vocab_size_before = len(compound_dictionary.token2id)
print("Original size of vocabs: ", vocab_size_before)

# Prune the dictionary in place with the thresholds below, then compact the ids.
compound_dictionary.filter_extremes(no_below=100, no_above=0.3, keep_n=None)
compound_dictionary.compactify()

# Vocabulary size after pruning.
vocab_size_after = len(compound_dictionary.token2id)
print("After shrinking, size of vocabs: ", vocab_size_after)

Original size of vocabs:  2047073
After shrinking, size of vocabs:  78765


In [10]:
%%time
# Only the pairString_N_V field is fetched (plus _id, which Mongo returns by default).
query_documents = target_collection.find({"jap_nlp_process": {"$exists": True},
                                          "nlp_process": {"$exists": True}},{"pairString_N_V":1},
                                          no_cursor_timeout=True)

# Convert every document into its bag-of-words representation.
# The cursor has no server-side timeout, so it is closed in a finally clause
# to make sure it is released even when the conversion raises.
# (The variable keeps its original spelling because a later cell refers to it.)
try:
    compund_corpus = [compound_dictionary.doc2bow(doc["pairString_N_V"].split())
                      for doc in query_documents]
finally:
    query_documents.close()

CPU times: user 2min 13s, sys: 4.39 s, total: 2min 17s
Wall time: 10min 4s


In [11]:
%%time
# Turn the bag-of-words corpus into a sparse terms x documents matrix.
# num_terms pins the number of rows to the vocabulary size; without it the row
# count is inferred from the largest term id seen in the corpus, which can be
# smaller than the vocabulary and break later lookups by dictionary id.
term_document_matrix = corpus2csc(compund_corpus,
                                  num_terms=len(compound_dictionary.token2id))

CPU times: user 18.1 s, sys: 737 ms, total: 18.8 s
Wall time: 18.8 s


In [12]:
%%time
# Convert counts to presence/absence: every stored value >= 1 becomes 1.
# Working on the array of stored values gives the same result as masking the
# whole sparse matrix, without building a second sparse boolean matrix and
# doing fancy-index assignment on it.
stored_values = term_document_matrix.data
stored_values[stored_values >= 1] = 1
print(type(term_document_matrix), term_document_matrix.shape)
# Disabled: full term-by-term co-occurrence matrix.
#cooccurence_matrix = term_document_matrix @ term_document_matrix.T
#print(term_document_matrix.shape, cooccurence_matrix.shape)

<class 'scipy.sparse.csc.csc_matrix'> (78765, 393617)
CPU times: user 23.8 s, sys: 1.28 s, total: 25.1 s
Wall time: 23.4 s


In [36]:
# Look up the integer id that the dictionary assigned to the token "compute".
compound_dictionary.token2id["compute"]
# Ids noted down for a few English / Japanese word pairs.
# NOTE(review): the number after "=>" is presumably the co-occurrence count of
# the pair (number of documents containing both words) — confirm.
# investment 15652 投資 1660 => 1327
# government 2077 政府 6499 => 11586
# rate 3233 割合 4987 => 1223
# peacemaker 52758 政治家 3312
# terrorism 30739 軍事 3359 => 325
# compute 5759 計算 3353 => 874
# negative: dog 909

5759

In [40]:
# Rows of the two terms being compared; per the ids listed in the previous
# cell, 5759 is "compute" and 1660 is "投資".
first_term_row = term_document_matrix[5759, :]
second_term_row = term_document_matrix[1660, :]
# Dot product of the two presence/absence rows, shown as the cell output.
(first_term_row @ second_term_row.T).toarray()[0, 0]

52.0

In [2]:
# Restore the (dictionary, term-document matrix) pair from disk.
# The cell that writes this file comes later in the notebook, so this cell
# only works once that file exists.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load a pickle that you produced yourself.
with open('wiki_en_jp_NounVerb_tdm.pickle', 'rb') as pickle_file:
    restored_pair = pickle.load(pickle_file)
compound_dictionary, term_document_matrix = restored_pair

In [14]:
# Persist the dictionary together with the term-document matrix as one tuple,
# using the highest pickle protocol available.
saved_pair = (compound_dictionary, term_document_matrix)
with open('wiki_en_jp_NounVerb_tdm.pickle', 'wb') as pickle_file:
    pickle.dump(saved_pair, pickle_file, pickle.HIGHEST_PROTOCOL)