In [None]:
import csv
import xml.etree.ElementTree as ET

import fastText
import numpy as np
import pandas as pd
import spacy

# source

In [None]:
# load text from file
def load_text(file_path, encoding='UTF-8'):
    """Read a text file and return its lines as a list of strings.

    Each line has leading and trailing whitespace (including the newline)
    removed. Blank lines are kept as empty strings, so list positions match
    line numbers in the file.

    Args:
        file_path: path of the file to read.
        encoding: text encoding used to decode the file.

    Returns:
        list of stripped lines, in file order.
    """
    with open(file_path, 'r', encoding=encoding) as handle:
        return [raw_line.strip() for raw_line in handle]

In [None]:
# get language code by fasttext
def get_lang(docs, model_path, kmost=1):
    """Run a fastText classifier over `docs` and return its raw predictions.

    The model is loaded from `model_path` on every call.

    Args:
        docs: texts passed straight to the model's `predict`.
        model_path: path of the fastText model file to load.
        kmost: second positional argument of `predict` (presumably the number
            of top labels returned per text — verify against the fastText API).

    Returns:
        Whatever `predict` returns, unmodified.
    """
    model = fastText.load_model(model_path)
    predictions = model.predict(docs, kmost)
    return predictions

In [None]:
# save text to file
def save_text(text, file_path, encoding='UTF-8'):
    """Write an iterable of strings to a file, one item per line.

    Items are joined with newlines and a final newline is appended, so an
    empty `text` produces a file holding a single newline.

    Args:
        text: iterable of strings to write.
        file_path: destination path (overwritten if it exists).
        encoding: text encoding used to encode the file.
    """
    with open(file_path, 'w', encoding=encoding) as out:
        out.write('\n'.join(text) + '\n')

# filter

In [None]:
# remove duplicate pair
def pair_deduplicate(doc_left, doc_right):
    """Drop aligned pairs that repeat a sentence seen in an earlier pair.

    A pair is kept only when neither of its sentences has already appeared,
    on either side, in any earlier pair. Earlier pairs count whether or not
    they were kept themselves. One shared "seen" set serves both sides.

    Args:
        doc_left: sentences of the first language.
        doc_right: sentences of the second language, aligned with `doc_left`.

    Returns:
        (doc_left_clean, doc_right_clean): two aligned lists of kept sentences.
    """
    seen = set()
    survivors = []

    for pair in zip(doc_left, doc_right):
        # Decide before recording, so a pair never disqualifies itself.
        if seen.isdisjoint(pair):
            survivors.append(pair)
        seen.update(pair)

    doc_left_clean = [left for left, _ in survivors]
    doc_right_clean = [right for _, right in survivors]

    print('remove duplicate from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))

    return doc_left_clean, doc_right_clean

In [None]:
# remove mislength pair
def pair_mislength(doc_left, doc_right, ratio=1.8, short_len=8, max_diff=8):
    """Drop aligned pairs whose whitespace-token counts are too different.

    A pair is kept when either rule holds:

    * ratio rule: each side has fewer than `ratio` times the tokens of the
      other side;
    * short-sentence rule: at least one side has fewer than `short_len`
      tokens and the absolute token-count difference is below `max_diff`.

    Args:
        doc_left: sentences of the first language.
        doc_right: sentences of the second language, aligned with `doc_left`.
        ratio: strict upper bound on the token-count ratio between sides.
        short_len: token count below which a side counts as "short".
        max_diff: strict upper bound on the token-count difference allowed by
            the short-sentence rule.

    Returns:
        (doc_left_clean, doc_right_clean): two aligned lists of kept sentences.
    """
    doc_left_clean = []
    doc_right_clean = []

    for dl, dr in zip(doc_left, doc_right):

        ll = len(dl.split())
        lr = len(dr.split())

        within_ratio = (ll < lr * ratio) and (lr < ll * ratio)
        # Ratios are unstable for very short sentences, so those fall back to
        # an absolute difference check instead.
        short_and_close = (ll < short_len or lr < short_len) and (abs(ll - lr) < max_diff)

        if within_ratio or short_and_close:
            doc_left_clean.append(dl)
            doc_right_clean.append(dr)

    print('remove mislength from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))

    return doc_left_clean, doc_right_clean

In [None]:
# remove mislang pair
def pair_mislang(doc_left, doc_right, lang_left, lang_right, model_path, kmost=2):
    """Drop aligned pairs whose detected languages do not match expectations.

    Both sides are classified with `get_lang`. A pair is kept only when
    `lang_left` is among the predicted codes for the left sentence and
    `lang_right` is among the predicted codes for the right sentence.

    Args:
        doc_left: sentences expected to be in `lang_left`.
        doc_right: sentences expected to be in `lang_right`, aligned with
            `doc_left`.
        lang_left: expected language code of the left side (e.g. 'en').
        lang_right: expected language code of the right side (e.g. 'fr').
        model_path: fastText model path forwarded to `get_lang`.
        kmost: number forwarded to `get_lang` (presumably how many candidate
            labels are considered per sentence — verify against fastText).

    Returns:
        (doc_left_clean, doc_right_clean): two aligned lists of kept sentences.
    """
    res_left = get_lang(doc_left, model_path, kmost)
    res_right = get_lang(doc_right, model_path, kmost)

    doc_left_clean = []
    doc_right_clean = []

    # Element 0 of each result holds, per document, the predicted label
    # strings; the language code is the text after the last '__'.
    for dl, dr, labels_left, labels_right in zip(doc_left, doc_right, res_left[0], res_right[0]):

        rl = [label.split('__')[-1] for label in labels_left]
        rr = [label.split('__')[-1] for label in labels_right]

        if (lang_left in rl) and (lang_right in rr):
            doc_left_clean.append(dl)
            doc_right_clean.append(dr)

    # This message used to say "mislength", copied from pair_mislength.
    print('remove mislang from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))

    return doc_left_clean, doc_right_clean

# europarl

In [None]:
# Read both sides of the Europarl fr-en corpus; line i of one file is
# treated as aligned with line i of the other.
europarl_en, europarl_fr = (
    load_text('./data/raw/europarl/europarl-v7.fr-en.en'),
    load_text('./data/raw/europarl/europarl-v7.fr-en.fr'),
)

In [None]:
# Run the pair filters in order: duplicates, length mismatch, language check.
temp_en, temp_fr = europarl_en, europarl_fr
for pair_filter, extra_args in (
    (pair_deduplicate, ()),
    (pair_mislength, ()),
    (pair_mislang, ('en', 'fr', './model/lid.176.bin', 2)),
):
    temp_en, temp_fr = pair_filter(temp_en, temp_fr, *extra_args)
europarl_en_clean = temp_en
europarl_fr_clean = temp_fr

In [None]:
# Spot-check a few aligned pairs from the cleaned corpus.
sid, win = 45000, 3
# Slicing instead of indexing keeps this preview from raising IndexError
# when fewer than sid + win pairs survive the filters.
for en_line, fr_line in zip(europarl_en_clean[sid:sid + win], europarl_fr_clean[sid:sid + win]):
    print(en_line, '||', fr_line)

In [None]:
# Persist the cleaned Europarl sides as line-aligned text files.
for docs, out_path in (
    (europarl_en_clean, './data/clean/europarl.en'),
    (europarl_fr_clean, './data/clean/europarl.fr'),
):
    save_text(docs, out_path)

# tatoeba

In [None]:
# Sentence table: one row per sentence (id, language code, text), indexed by id.
# The text column is free-form, so CSV quoting and NA detection are disabled:
# - with default quoting, a sentence starting with '"' opens a quoted field
#   and can swallow the following lines;
# - with default NA detection, a sentence that is literally "NA" or "null"
#   is read as NaN and breaks the later `.strip()` call on the text.
# NOTE(review): assumes the Tatoeba export is plain tab-separated text with
# no quoting convention — confirm against the downloaded file.
dfBase = pd.read_csv(
    './data/raw/tatoeba/sentences.csv',
    sep='\t',
    header=None,
    names=['label', 'lang', 'text'],
    quoting=csv.QUOTE_NONE,
    na_filter=False,
).set_index('label')
# Link table: each row relates two sentence ids.
dfLink = pd.read_csv('./data/raw/tatoeba/links.csv', sep='\t', header=None, names=['ida', 'idb'])

In [None]:
# ids of the English and French sentences
label_en = set(dfBase.loc[dfBase['lang'] == 'eng'].index.tolist())
label_fr = set(dfBase.loc[dfBase['lang'] == 'fra'].index.tolist())

# Collect (english_id, french_id) tuples. A link is accepted in either
# orientation and stored English-first, so the set removes repeated links.
lang_pair = set()
for id_a, id_b in dfLink.itertuples(index=False):
    if id_a in label_en and id_b in label_fr:
        lang_pair.add((id_a, id_b))
    elif id_b in label_en and id_a in label_fr:
        lang_pair.add((id_b, id_a))
lang_enfr = list(lang_pair)

In [None]:
# Look up the stripped sentence text for each (english_id, french_id) pair,
# keeping the two lists aligned with lang_enfr.
tatoeba_en = [dfBase.loc[en_id, 'text'].strip() for en_id, _ in lang_enfr]
tatoeba_fr = [dfBase.loc[fr_id, 'text'].strip() for _, fr_id in lang_enfr]
print('tatoeba has {} sentences pair for english and french'.format(len(tatoeba_en)))

In [None]:
# Spot-check a few extracted pairs.
sid, win = 50000, 3
# Slicing instead of indexing keeps this preview from raising IndexError
# when fewer than sid + win pairs were extracted.
for en_line, fr_line in zip(tatoeba_en[sid:sid + win], tatoeba_fr[sid:sid + win]):
    print(en_line, '||', fr_line)

In [None]:
# Persist the extracted Tatoeba sides as line-aligned text files.
for docs, out_path in (
    (tatoeba_en, './data/clean/tatoeba.en'),
    (tatoeba_fr, './data/clean/tatoeba.fr'),
):
    save_text(docs, out_path)

# jrc

In [None]:
# Parse the JRC aligned en-fr corpus and keep its root element for the
# extraction step.
jrc_xml_path = './data/raw/jrc/alignedCorpus-en-fr.xml'
tree = ET.parse(jrc_xml_path)
root = tree.getroot()

In [None]:
def _jrc_side_text(link, tag):
    """Return the raw text of one side (`s1` or `s2`) of a <link> element.

    The side either holds its text directly or spreads it over <p> children,
    whose texts are joined with single spaces. A missing element or missing
    text yields '' instead of None, so the caller can always call `.strip()`.
    """
    side = link.find(tag)
    if side is None:
        return ''
    paragraphs = side.findall('p')
    if paragraphs:
        return ' '.join(p.text or '' for p in paragraphs)
    return side.text or ''


def _jrc_is_boilerplate(sentence):
    """Return True for lines treated as non-sentences.

    Covers 'Article ' headings, lines with '%gt%' or a URL, lines starting
    with '[1]', fully parenthesised lines, and lines with three or more '|'.
    """
    return (
        sentence.startswith(('Article ', '[1]'))
        or any(token in sentence for token in ('%gt%', 'http://', 'https://'))
        or (sentence.startswith('(') and sentence.endswith(')'))
        or len(sentence.split('|')) > 3
    )


def _jrc_upper_ratio(sentence):
    """Return the share of upper-case characters among non-whitespace ones.

    The caller must pass a sentence with at least one non-whitespace character.
    """
    compact = ''.join(sentence.split())
    return sum(1 for ch in compact if ch.isupper()) / len(compact)


def _jrc_after_dot(sentence):
    """Return the stripped text after the first '.'."""
    return sentence.split('.', 1)[1].strip()


def _jrc_after_paren(sentence):
    """Return the stripped text after the first ')'."""
    return sentence.split(')', 1)[1].strip()


def _jrc_after_dash(sentence):
    """Return the text after a leading '- ' (not stripped further)."""
    return sentence[len('- '):]


# Enumeration-marker passes, applied in this order. Each pass is a list of
# prefix groups (tried in order) plus the function that removes the marker.
_JRC_MARKER_PASSES = (
    # '1.' / '1 .' ... '5.' / '5 .'
    ([(str(n) + '.', str(n) + ' .') for n in range(1, 6)], _jrc_after_dot),
    # '(a)' / 'a)' ... '(D)' / 'D)'
    ([('(' + c + ')', c + ')') for c in 'abcdABCD'], _jrc_after_paren),
    # '(i)' / 'i)', '(ii)' / 'ii)', '(iii)' / 'iii)'
    ([('(' + 'i' * n + ')', 'i' * n + ')') for n in range(1, 4)], _jrc_after_paren),
    # leading dash bullet
    ([('- ',)], _jrc_after_dash),
)


def _jrc_strip_shared_marker(sent_en, sent_fr, marker_groups, cut):
    """Remove an enumeration marker that both sentences share.

    Groups are tried in order and the first group matched by either sentence
    decides. If both match, the marker is cut from both. If only one matches,
    the pair is considered misaligned and None is returned. If no group
    matches, the pair is returned unchanged.
    """
    for markers in marker_groups:
        en_hit = sent_en.startswith(markers)
        fr_hit = sent_fr.startswith(markers)
        if en_hit and fr_hit:
            return cut(sent_en), cut(sent_fr)
        if en_hit != fr_hit:
            return None
    return sent_en, sent_fr


def _jrc_clean_pair(sent_en, sent_fr):
    """Clean one raw (en, fr) pair; return the cleaned pair or None to drop it."""
    # start or end space
    sent_en = sent_en.strip()
    sent_fr = sent_fr.strip()

    # An empty side cannot be aligned, and would divide by zero in the
    # upper-case ratio below.
    if not sent_en or not sent_fr:
        return None

    # useless line
    if _jrc_is_boilerplate(sent_en) or _jrc_is_boilerplate(sent_fr):
        return None

    # mismatch upper percentage
    if abs(_jrc_upper_ratio(sent_en) - _jrc_upper_ratio(sent_fr)) > 0.5:
        return None

    # sentence index / bullet markers
    for marker_groups, cut in _JRC_MARKER_PASSES:
        stripped = _jrc_strip_shared_marker(sent_en, sent_fr, marker_groups, cut)
        if stripped is None:
            return None
        sent_en, sent_fr = stripped

    # replace special
    sent_en = sent_en.replace('%quot%', '"').replace('º', 'o')
    sent_fr = sent_fr.replace('%quot%', '"').replace('º', 'o')

    return sent_en, sent_fr


jrc_en = []
jrc_fr = []

# extract and clean
anti_dup = set()
for t in root.findall('.//link'):

    # s1 is the English side, s2 the French side
    cleaned = _jrc_clean_pair(_jrc_side_text(t, 's1'), _jrc_side_text(t, 's2'))
    if cleaned is None:
        continue
    sent_en, sent_fr = cleaned

    # Keep the pair only if neither sentence was already kept on either
    # side. Only kept pairs are recorded in anti_dup.
    if (sent_en not in anti_dup) and (sent_fr not in anti_dup):
        anti_dup.add(sent_en)
        anti_dup.add(sent_fr)

        jrc_en.append(sent_en)
        jrc_fr.append(sent_fr)

In [None]:
# JRC pairs are already de-duplicated during extraction, so only the length
# and language filters run here.
temp_en, temp_fr = jrc_en, jrc_fr
for pair_filter, extra_args in (
    (pair_mislength, ()),
    (pair_mislang, ('en', 'fr', './model/lid.176.bin', 2)),
):
    temp_en, temp_fr = pair_filter(temp_en, temp_fr, *extra_args)
jrc_en_clean = temp_en
jrc_fr_clean = temp_fr

In [None]:
# Spot-check a few aligned pairs from the cleaned corpus.
sid, win = 59000, 3
# Slicing instead of indexing keeps this preview from raising IndexError
# when fewer than sid + win pairs survive the filters.
for en_line, fr_line in zip(jrc_en_clean[sid:sid + win], jrc_fr_clean[sid:sid + win]):
    print(en_line, '||', fr_line)

In [None]:
# Persist the cleaned JRC sides as line-aligned text files.
for docs, out_path in (
    (jrc_en_clean, './data/clean/jrc.en'),
    (jrc_fr_clean, './data/clean/jrc.fr'),
):
    save_text(docs, out_path)

# giga

In [None]:
# Read both sides of the Giga fr-en corpus; line i of one file is treated
# as aligned with line i of the other.
giga_en, giga_fr = (
    load_text('./data/raw/giga/giga-fren.release2.fixed.en'),
    load_text('./data/raw/giga/giga-fren.release2.fixed.fr'),
)

In [None]:
# Run the pair filters in order: duplicates, length mismatch, language check.
temp_en, temp_fr = giga_en, giga_fr
for pair_filter, extra_args in (
    (pair_deduplicate, ()),
    (pair_mislength, ()),
    (pair_mislang, ('en', 'fr', './model/lid.176.bin', 2)),
):
    temp_en, temp_fr = pair_filter(temp_en, temp_fr, *extra_args)
giga_en_clean = temp_en
giga_fr_clean = temp_fr

# un