In [1]:
import re
import html
import xml.etree.ElementTree as etree

# Process the raw XML of the Wikipedia comparable corpora (EN-ZH, 405K pairs; EN-JA, 393K pairs)

In [2]:
def collapse_white_spaces(txt):
    """Squeeze every run of consecutive space characters down to a single space.

    Only the literal ' ' character is considered; tabs, newlines and other
    whitespace pass through untouched.
    (from lazyNLP)
    """
    kept = []
    last_kept = None
    for ch in txt:
        # A space is dropped only when the character kept just before it
        # was also a space.
        if not (ch == ' ' and last_kept == ' '):
            kept.append(ch)
            last_kept = ch
    return ''.join(kept)

def connect_lines(txt, line_sep='\n'):
    """Re-join text whose sentences were broken across lines (e.g. crawled pages).

    The text is split on '\\n' and every line is stripped. Consecutive
    non-blank lines are glued together, each followed by one space. A blank
    line closes the paragraph collected so far (terminated with '\\n') and
    additionally emits ``line_sep``. Whatever is still pending after the last
    line is appended as-is, so the result may end with a space.
    (from lazyNLP)
    """
    finished = []   # closed paragraphs and separators, in output order
    pending = []    # stripped lines of the paragraph currently being built
    for raw_line in txt.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            pending.append(stripped + ' ')
            continue
        # Blank line: flush the open paragraph (if any), then emit the separator.
        if pending:
            finished.append(''.join(pending) + '\n')
            pending = []
        finished.append(line_sep)
    return ''.join(finished) + ''.join(pending)

def clean_html(txt):
    """Strip HTML markup from a downloaded page and return plain text.

    Use this function for Gutenberg book format.
    (from lazyNLP)

    Steps, in order: remove <style> and <script> elements and the DOCTYPE
    declaration, re-join broken lines, collapse repeated spaces, delete every
    remaining tag, trim surrounding whitespace and finally decode HTML
    entities.
    """
    # These three run before the lines are joined, and '.' does not cross a
    # newline, so the opening tag must sit on a single line. The element body
    # is only matched when it contains no '<' or '>' characters.
    for pattern in ('<style.*?>[^<>]*?</style>',
                    '<script.*?>[^<>]*?</script>',
                    '<!DOCTYPE[^<>]*?>'):
        txt = re.sub(pattern, '', txt)

    joined = collapse_white_spaces(connect_lines(txt))
    without_tags = re.sub('<.*?>', '', joined)

    return html.unescape(without_tags.strip())

def _extract_article(pair, lang):
    """Return the cleaned text of the ``<article lang="LANG">`` element in ``pair``.

    Args:
        pair: raw text of one article pair (the lines found between the
            opening and closing ``articlePair`` tags).
        lang: value of the ``lang`` attribute to look for, e.g. ``"en"``.

    Returns:
        The article text after ``clean_html`` has been applied.

    Raises:
        ValueError: if ``pair`` contains no article for ``lang``.
    """
    # The match is non-greedy so it stops at the FIRST closing tag after the
    # opening tag. A greedy match runs to the last </article> in the pair,
    # which makes the first article swallow the one that follows it.
    # re.DOTALL lets '.' span the newlines inside the article body.
    pattern = r'<article lang="' + re.escape(lang) + r'".*?</article>'
    found = re.search(pattern, pair, re.DOTALL)
    if found is None:
        # Fail with a clear message instead of an UnboundLocalError later on.
        raise ValueError('no <article lang="%s"> element found in pair' % lang)
    return clean_html(found.group())


def extract_EN_ZH_documents_from_pair(pair):
    """Split one EN-ZH article pair into ``(english_text, chinese_text)``.

    Both texts are cleaned with ``clean_html``. Raises ``ValueError`` when
    either article is missing from ``pair``.
    """
    return _extract_article(pair, "en"), _extract_article(pair, "zh")


def extract_EN_JA_documents_from_pair(pair):
    """Split one EN-JA article pair into ``(english_text, japanese_text)``.

    Both texts are cleaned with ``clean_html``. Raises ``ValueError`` when
    either article is missing from ``pair``.
    """
    return _extract_article(pair, "en"), _extract_article(pair, "ja")

In [15]:
# Abandoned approach, kept for reference only: reading the whole dump into a
# single string swallows all available memory.
"""
file = "/home/ponshane/Downloads/wikicomp-2014_enzh.xml"
handler = open(file)
html_string = ""
for line in handler:
    html_string += line
handler.close()
"""

# BeautifulSoup (bs4) does not support iterparse, and it also takes all the memory.

In [None]:
""" wikicomp-2014_enzh.xml
fname = "/home/ponshane/Downloads/wikicomp-2014_enzh.xml"
EN_Articles = []
ZH_Articles = []
with open(fname) as f:
    temp_pair = ""
    collect_flag = False
    for line in f:
        if "</articlePair>" in line:
            collect_flag = False
            en, zh = extract_EN_ZH_documents_from_pair(temp_pair)
            EN_Articles.append(en)
            ZH_Articles.append(zh)
            assert len(EN_Articles) == len(ZH_Articles)
            if len(ZH_Articles) % 10000 == 0:
                print("Have processed: ", len(ZH_Articles))
            temp_pair = ""
            #break
        elif "<articlePair id=" in line:
            collect_flag = True
        elif collect_flag == True:
            temp_pair += line
"""

def read_article_pairs(path, extract_pair, report_every=10000):
    """Stream an article-pair XML dump and return two index-aligned lists.

    The file is scanned line by line rather than parsed as a whole. Lines
    after a line containing ``<articlePair id=`` are collected until a line
    containing ``</articlePair>`` is reached; the collected text (without the
    two ``articlePair`` lines themselves) is then handed to ``extract_pair``.

    Args:
        path: location of the XML dump.
        extract_pair: callable that takes the raw text of one pair and returns
            a 2-tuple ``(first_article, second_article)``.
        report_every: print a progress line each time this many pairs have
            been processed; a falsy value disables progress output.

    Returns:
        ``(first_articles, second_articles)``; both lists have equal length
        and entry ``i`` of each comes from the same pair.
    """
    first_articles, second_articles = [], []
    pair_lines = []
    inside_pair = False
    # NOTE(review): assumes the dump is UTF-8 encoded; confirm against the XML
    # declaration. Without an explicit encoding the platform default is used.
    with open(path, encoding="utf-8") as dump:
        for line in dump:
            if "</articlePair>" in line:
                inside_pair = False
                first, second = extract_pair("".join(pair_lines))
                first_articles.append(first)
                second_articles.append(second)
                if report_every and len(first_articles) % report_every == 0:
                    print("Have processed: ", len(first_articles))
                pair_lines = []
            elif "<articlePair id=" in line:
                inside_pair = True
            elif inside_pair:
                # Collect into a list and join once per pair instead of
                # growing a string line by line.
                pair_lines.append(line)
    return first_articles, second_articles


# The same helper handles the EN-ZH dump:
# EN_Articles, ZH_Articles = read_article_pairs(
#     "/home/ponshane/Downloads/wikicomp-2014_enzh.xml",
#     extract_EN_ZH_documents_from_pair)

fname = "/home/ponshane/Downloads/wikicomp-2014_enja.xml"
EN_Articles, JA_Articles = read_article_pairs(fname, extract_EN_JA_documents_from_pair)
assert len(EN_Articles) == len(JA_Articles)

In [4]:
import pickle

# EN-ZH variant, kept for reference:
# with open('wiki_en_zh_405k.pickle', 'wb') as f:
#     # Pickle the (EN_Articles, ZH_Articles) tuple using the highest protocol available.
#     pickle.dump((EN_Articles, ZH_Articles), f, pickle.HIGHEST_PROTOCOL)

# EN-JA: store both article lists together as one (EN, JA) tuple, using the
# highest pickle protocol available.
with open('wiki_en_ja_393k.pickle', 'wb') as pickle_out:
    pickle.dump((EN_Articles, JA_Articles), pickle_out,
                protocol=pickle.HIGHEST_PROTOCOL)

# Store in Mongo

In [5]:
import configparser
import pymongo
from pymongo import MongoClient

from urllib.parse import quote_plus

### init and read config
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so a missing config fails here with a
# clear message instead of a KeyError on "ADM" below.
if not config.read('./config.ini'):
    raise FileNotFoundError("could not read ./config.ini")

MongoDB = config["ADM"]["Database"]
MongoUser = config["ADM"]["User"]
MongoPW = config["ADM"]["PW"]

### connect to MongoDB
# User name and password are percent-escaped so that reserved characters such
# as ':', '/' or '@' cannot corrupt the URI.
# NOTE(review): assumes config.ini stores the raw (not already escaped)
# credentials; confirm, otherwise they would be escaped twice.
uri = "mongodb://" + quote_plus(MongoUser) + ":" + quote_plus(MongoPW) + \
    "@140.117.69.70:30241/" + MongoDB + "?authMechanism=SCRAM-SHA-1"

client = MongoClient(uri)
db = client.ComparableWiki

In [6]:
# EN-ZH variant, kept for reference:
# assert len(EN_Articles) == len(ZH_Articles)
# for idx, _ in enumerate(EN_Articles):
#     db.ENZH.insert_one({"PairNumber":idx, "EN-Content":EN_Articles[idx], "ZH-Content":ZH_Articles[idx]})
#     if idx % 10000 == 0:
#         print("Have stored: ", idx)

# EN-JA: insert one document per article pair into the ENJA collection.
# PairNumber is the pair's position in the two lists.
assert len(EN_Articles) == len(JA_Articles)
for pair_number, (en_content, ja_content) in enumerate(zip(EN_Articles, JA_Articles)):
    record = {
        "PairNumber": pair_number,
        "EN-Content": en_content,
        "JA-Content": ja_content,
    }
    db.ENJA.insert_one(record)
    # Report progress on every 10000th pair.
    if pair_number % 10000 == 0:
        print("Have stored: ", pair_number)

Have stored:  0
Have stored:  10000
Have stored:  20000
Have stored:  30000
Have stored:  40000
Have stored:  50000
Have stored:  60000
Have stored:  70000
Have stored:  80000
Have stored:  90000
Have stored:  100000
Have stored:  110000
Have stored:  120000
Have stored:  130000
Have stored:  140000
Have stored:  150000
Have stored:  160000
Have stored:  170000
Have stored:  180000
Have stored:  190000
Have stored:  200000
Have stored:  210000
Have stored:  220000
Have stored:  230000
Have stored:  240000
Have stored:  250000
Have stored:  260000
Have stored:  270000
Have stored:  280000
Have stored:  290000
Have stored:  300000
Have stored:  310000
Have stored:  320000
Have stored:  330000
Have stored:  340000
Have stored:  350000
Have stored:  360000
Have stored:  370000
Have stored:  380000
Have stored:  390000
