In [75]:
import ast
import configparser
import logging
import os
import re
import urllib.parse
import xml.etree.ElementTree as ET

import pymongo
from pymongo import MongoClient

In [17]:
### init and read config
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here rather than with a KeyError below.
if not config.read('../config.ini'):
    raise FileNotFoundError("Could not read config file: ../config.ini")

MongoDB = config["MLDoc"]["Database"]
MongoUser = config["MLDoc"]["User"]
MongoPW = config["MLDoc"]["PW"]

### connect to MongoDB
# Percent-escape the credentials: reserved characters such as '@', ':' or '/'
# in the user name or password would otherwise corrupt the connection URI.
# NOTE(review): host and port are hardcoded here; consider moving them into
# config.ini next to the credentials.
uri = "mongodb://" + urllib.parse.quote_plus(MongoUser) + ":" + \
    urllib.parse.quote_plus(MongoPW) + "@140.117.69.70:30241/" + \
    MongoDB + "?authMechanism=SCRAM-SHA-1"

client = MongoClient(uri)
# NOTE(review): the database name is fixed to "MLDoc" and is not taken from
# the `MongoDB` config value used in the URI above -- confirm this is intended.
db = client.MLDoc

# Load the MLDoc predefined indexes

In [22]:
# Insert every "<class>\t<bytes literal>" line of the MLDoc Chinese file into
# the `Chinese` collection.
# NOTE(review): re-running this cell inserts the same documents again.
# The context manager closes the file even if a line fails validation or the
# insert raises part-way through.
with open("/home/ponshane/work_dir/MLDoc/chinese.trian.10000.txt", "r") as fo:
    # Iterate lazily instead of materialising the whole file with readlines().
    for line in fo:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue

        line = line.split("\t")
        # Make sure each row holds exactly two fields: class, document content.
        assert len(line) == 2, "Expected '<class>\\t<content>' on every line."
        # ast.literal_eval turns the textual bytes literal back into bytes.
        doc = ast.literal_eval(line[1]).decode('utf-8')
        db.Chinese.insert_one({"Class":line[0], "Content":doc})

# Loop over the RCV2 documents of a specific language and store them in MongoDB

In [73]:
def parse_xml(rcv_dir, line):
    """Parse one RCV news XML file into its ids, class label and text.

    Args:
        rcv_dir: Root directory that contains the sub-corpus folders.
        line: Document index of the form "<sub_corpus>-<file_name>";
            surrounding whitespace is stripped before splitting.

    Returns:
        A tuple (sub_corpus, file_name, topic, doc) where `topic` is the
        single CCAT/ECAT/GCAT/MCAT code found under the 'bip:topics:1.0'
        codes element and `doc` is the text of all <p> elements joined by a
        space. Returns None (after logging an error) when the XML cannot be
        parsed or does not carry exactly one of those labels.

    Raises:
        ValueError: If `line` does not split into exactly two parts on '-'.
        OSError: If the XML file cannot be opened.
    """
    sentence_delim = ' '
    code_class = 'bip:topics:1.0'
    labels = ['C', 'E', 'G', 'M']
    target_topics = ['{}CAT'.format(label) for label in labels]

    sub_corpus, file_name = line.strip().split('-')
    doc_path = os.sep.join(
        [rcv_dir, sub_corpus, '{}.xml'.format(file_name)]
    )
    # Read through a context manager so the handle is closed right away.
    with open(doc_path) as xml_file:
        data_str = xml_file.read()

    try:
        xml_parsed = ET.fromstring(data_str)
        topics = [
            topic.attrib['code'] for topic in xml_parsed.findall(
                ".//codes[@class='{}']/code".format(code_class)
            ) if topic.attrib['code'] in target_topics
        ]
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O`, which would let multi-label documents through.
        if len(topics) != 1:
            raise ValueError(
                'Expected exactly one class label, found {}.'.format(len(topics))
            )
        doc = sentence_delim.join(
            [p.text for p in xml_parsed.findall(".//p")]
        )

        return sub_corpus, file_name, topics[0], doc

    except Exception:
        logging.error('Failed to parse xml file: {}.'.format(doc_path))
        # Callers must treat None as "skip this document".
        return None

In [None]:
# need to adjust following two inputs
# The capture group is the path below "rcv1/" without the ".xml" suffix;
# the dot before "xml" is escaped so it only matches a literal '.'.
regex = r"rcv1\/(.+)\.xml"
rcv2_path = "/home/ponshane/Desktop/rcv1"

# Walk the corpus directory, parse every XML file and store the labelled
# documents in the `English` collection.
for current_path, folder, files in os.walk(rcv2_path):
    for file in files:
        file_str = os.sep.join([current_path, file])
        matches = re.search(regex, file_str, re.DOTALL)
        if not matches:
            continue

        # Turn "<sub_corpus>/<file_name>" into the "<sub_corpus>-<file_name>"
        # index format that parse_xml expects.
        index = matches.group(1).replace("/","-")+"\n"

        try:
            parsed = parse_xml(rcv2_path, index)
        except Exception:
            # Best-effort import: skip files that cannot be read or indexed,
            # but let KeyboardInterrupt / SystemExit propagate.
            continue

        # parse_xml returns None when the document could not be parsed or
        # does not carry exactly one class label.
        if parsed is None:
            continue

        sub_corpus, file_name, topic_code, doc = parsed
        db.English.insert_one({"Class":topic_code, "Content":doc,
                              "Sub_corpus":sub_corpus, "File_name":file_name})

# step1, generate_documents.py
# step2, sampling_rcv2.py (still need to think about how to do this; it depends on the follow-up experiments)

In [None]:
# Smoke test: parse a single known RCV2 Chinese document and show the result.
sub_corpus, file_name, topic_code, doc = parse_xml(
    rcv_dir="/home/ponshane/Desktop/RCV2_Multilingual_Corpus/chinese/",
    line="FDCH14-29640",
)
print(sub_corpus, file_name, topic_code, doc)

#

In [None]:
# update function is designed to help pymongo update nlp results into Database

def update(target_collection, doc_id, sentences, nested_token_list, if_abstract=True):
    """Store NLP results on the document whose `_id` equals `doc_id`.

    Args:
        target_collection: Object exposing `update_one(filter, update)`
            (presumably a pymongo collection -- confirm against callers).
        doc_id: Value matched against the `_id` field.
        sentences: Value written to the sentences field.
        nested_token_list: Value written to the nested-token-list field.
        if_abstract: When truthy the `body_`-prefixed fields are written,
            otherwise the unprefixed ones.
            NOTE(review): `error_update` uses an `abstract_` prefix for the
            same flag -- confirm the `body_` prefix here is intended.
    """
    if if_abstract:
        fields = {
            "body_sentences": sentences,
            "body_nested_token_list": nested_token_list,
            "body_nlp_process": True,
        }
    else:
        fields = {
            "sentences": sentences,
            "nested_token_list": nested_token_list,
            "nlp_process": True,
        }

    target_collection.update_one({"_id": doc_id}, {"$set": fields})

def error_update(target_collection, doc_id, if_abstract=True):
    """Set an NLP error flag on the document whose `_id` equals `doc_id`.

    Args:
        target_collection: Object exposing `update_one(filter, update)`
            (presumably a pymongo collection -- confirm against callers).
        doc_id: Value matched against the `_id` field.
        if_abstract: When truthy `abstract_nlp_error` is set to True,
            otherwise `nlp_error` is set to True.
    """
    flag = {"abstract_nlp_error": True} if if_abstract else {"nlp_error": True}
    target_collection.update_one({"_id": doc_id}, {"$set": flag})