In [20]:
import xml.etree.ElementTree as ET
import spacy
import numpy as np
import pandas as pd
import fasttext
import random
import glob

# source

In [21]:
# load text from file
def load_text(file_path, encoding='UTF-8'):
    
    docs = []
    with open(file_path, 'r', encoding=encoding) as f:
        for line in f:
            docs.append(line.strip())
    
    return docs

In [22]:
# get language code by fasttext
def get_lang(docs, model_path, kmost=1):

    lang_classifier = fasttext.load_model(model_path)

    return lang_classifier.predict(docs, kmost)

In [4]:
# save text to file
def save_text(text, file_path, encoding='UTF-8'):
    # write to output
    with open(file_path, 'w', encoding=encoding) as f:
        f.write('\n'.join(text))
        f.write('\n')

# filter

In [5]:
# remove duplicate pair
def pair_deduplicate(doc_left, doc_right):
    
    doc_left_clean = []
    doc_right_clean = []
    
    # already exist
    doc_contain = set()
    
    for dl, dr in zip(doc_left, doc_right):
        if (dl not in doc_contain) and (dr not in doc_contain):
            doc_left_clean.append(dl)
            doc_right_clean.append(dr)
        
        doc_contain.add(dl)
        doc_contain.add(dr)
        
    print('remove duplicate from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))
    
    return doc_left_clean, doc_right_clean

In [6]:
# remove oversize pair
def pair_oversize(doc_left, doc_right, th_num=120):
    
    doc_left_clean = []
    doc_right_clean = []
    
    for dl, dr in zip(doc_left, doc_right):
        
        ll = len(dl.split())
        lr = len(dr.split())
        
        if (ll < th_num) and (lr < th_num):
            doc_left_clean.append(dl)
            doc_right_clean.append(dr)
        
    print('remove oversize pair from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))
    
    return doc_left_clean, doc_right_clean

In [7]:
# remove mislength pair
def pair_mislength(doc_left, doc_right, ratio=1.8, verbose=False):
    
    doc_left_clean = []
    doc_right_clean = []
    
    doc_verbose = []
    
    for dl, dr in zip(doc_left, doc_right):
        
        ll = len(dl.split())
        lr = len(dr.split())
        
        if (ll < lr*ratio) and (lr < ll*ratio):
            doc_left_clean.append(dl)
            doc_right_clean.append(dr)
        elif (ll < 8 or lr < 8) and (abs(ll-lr) < 8):
            doc_left_clean.append(dl)
            doc_right_clean.append(dr)
        elif verbose:
            doc_verbose.append(dl+' || '+dr)
        
    print('remove mislength from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))
    
    if verbose:
        print('removed mislength sentences')
        for idx in random.sample(range(0,len(doc_verbose)), 25):
            print(doc_verbose[idx])
    
    return doc_left_clean, doc_right_clean

In [8]:
# remove mislang pair
def pair_mislang(doc_left, doc_right, lang_left, lang_right, model_path, kmost=2, verbose=False):
    
    res_left = get_lang(doc_left, model_path, kmost)
    res_right = get_lang(doc_right, model_path, kmost)
    
    doc_left_clean = []
    doc_right_clean = []
    
    doc_verbose = []
    
    for idx in range(len(doc_left)):
        
        rl = [l.split('__')[-1] for l in res_left[0][idx]]
        rr = [l.split('__')[-1] for l in res_right[0][idx]]
        
        if (lang_left in rl) and (lang_right in rr):
            doc_left_clean.append(doc_left[idx])
            doc_right_clean.append(doc_right[idx])
        elif verbose:
            doc_verbose.append(doc_left[idx]+' || '+doc_right[idx])
    
    print('remove mislength from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))
    
    if verbose:
        print('removed mislength sentences')
        for idx in random.sample(range(0,len(doc_verbose)), 25):
            print(doc_verbose[idx])
    
    return doc_left_clean, doc_right_clean

# brute force

In [9]:
all_en = load_text('./data/raw/bruteforce/raw-enfr.en')
all_fr = load_text('./data/raw/bruteforce/raw-enfr.fr')

In [10]:
all_en, all_fr = pair_deduplicate(all_en, all_fr)

remove duplicate from langauge pair
from 41731691 to 33712468


In [11]:
all_en, all_fr = pair_oversize(all_en, all_fr)

remove oversize pair from langauge pair
from 33712468 to 33424814


In [12]:
all_en, all_fr = pair_mislength(all_en, all_fr)

remove mislength from langauge pair
from 33424814 to 28577869


In [17]:
all_en, all_fr = pair_mislang(all_en, all_fr, 'en', 'fr', './model/lid.176.bin')

Exception: fastText: Cannot load ./model/lid.176.bin due to C++ extension failed to allocate the memory

In [18]:
win = 3
sid = random.randint(0, len(europarl_en_clean)-win-1)
for idx in range(sid, sid+win):
    print(europarl_en_clean[idx], '||', europarl_fr_clean[idx])

I am grateful to all Members who contributed their ideas, and I am startled by the degree of consensus that exists. || J'exprime ma reconnaissance à tous les membres qui y ont collaboré par leurs idées, et je suis surpris par le niveau de consensus existant.
There is an appetite in this Chamber to work fast, to work together, in search of a lasting solution to Europe's energy crisis, and we must harness that. || Il y a dans cette Assemblée un goût pour le travail rapide, le travail en commun à la recherche d'une solution durable à la crise énergétique de l'Europe, et nous devons l'exploiter.
Of all the potential plans to open up a new energy era, one stands out: it is called the supergrid, or DESERTEC. || Parmi tous les plans susceptibles d'ouvrir une nouvelle ère énergétique, il en est un qui se démarque des autres: il est appelé le super-réseau, ou DESERTEC.


In [19]:
save_text(europarl_en_clean, './data/clean/europarl-enfr-clean.en')
save_text(europarl_fr_clean, './data/clean/europarl-enfr-clean.fr')

# tatoeba

In [20]:
dfBase = pd.read_csv('./data/raw/tatoeba/sentences.csv', sep='\t', header=None, names=['label', 'lang', 'text'])
dfBase = dfBase.set_index('label')
dfLink = pd.read_csv('./data/raw/tatoeba/links.csv', sep='\t', header=None, names=['ida', 'idb'])

In [21]:
# en fr sentences label
label_en = set(dfBase[dfBase['lang']=='eng'].index.tolist())
label_fr = set(dfBase[dfBase['lang']=='fra'].index.tolist())
lang_pair = set()
# get lang pair label
for r in dfLink.itertuples():
    if r[1] in label_en and r[2] in label_fr:
        lang_pair.add((r[1], r[2]))
    elif r[2] in label_en and r[1] in label_fr:
        lang_pair.add((r[2], r[1]))
lang_enfr = list(lang_pair)

In [22]:
# extract lang pair
tatoeba_en = []
tatoeba_fr = []
for lpair in lang_enfr:
    tatoeba_en.append(dfBase.loc[lpair[0],'text'].strip())
    tatoeba_fr.append(dfBase.loc[lpair[1],'text'].strip())
print('tatoeba has {} sentences pair for english and french'.format(len(tatoeba_en)))

tatoeba has 231165 sentences pair for english and french


In [32]:
win = 3
sid = random.randint(0, len(tatoeba_en)-win-1)
for idx in range(sid, sid+win):
    print(tatoeba_en[idx], '||', tatoeba_fr[idx])

I'm not talking to you. || Je ne parle pas avec toi.
I just want you to have it. || Je veux juste que tu l'aies.
The godmother baked a delicious cake. || La marraine cuisit un délicieux gâteau.


In [33]:
save_text(tatoeba_en, './data/clean/tatoeba-enfr-clean.en')
save_text(tatoeba_fr, './data/clean/tatoeba-enfr-clean.fr')

# jrc

In [34]:
tree = ET.parse('./data/raw/jrc/alignedCorpus-en-fr.xml')
root = tree.getroot()

In [219]:
jrc_en = []
jrc_fr = []

# extract and clean
anti_dup = set()
for t in root.findall('.//link'):
    
    # get text
    
    # s1 or s2 may contain p
    if t.find('s1').find('p') is None:
        sent_en = t.find('s1').text
    else:
        sent_en = ' '.join([p.text for p in t.find('s1').findall('p')])
    if t.find('s2').find('p') is None:
        sent_fr = t.find('s2').text
    else:
        sent_fr = ' '.join([p.text for p in t.find('s2').findall('p')])
        
    # clean
    
    # start or end space
    sent_en = sent_en.strip()
    sent_fr = sent_fr.strip()
    
    # useless line
    if sent_en.startswith('Article ') or sent_fr.startswith('Article '):
        continue
    if ('%gt%' in sent_en) or ('%gt%' in sent_fr):
        continue
    if ('http://' in sent_en) or ('http://' in sent_fr):
        continue
    if ('https://' in sent_en) or ('https://' in sent_fr):
        continue
    if sent_en.startswith('[1]') or sent_fr.startswith('[1]'):
        continue
    if (sent_en.startswith('(') and sent_en.endswith(')')):
        continue
    if (sent_fr.startswith('(') and sent_fr.endswith(')')):
        continue
    if len(sent_en.split('|')) > 2:
        continue
    if len(sent_fr.split('|')) > 2:
        continue
    if len(sent_en.split('/')) > 2:
        continue
    if len(sent_en.split('/')) > 2:
        continue
    
    # mismatch upper percentage
    upper_en = sum([1 for c in ''.join(sent_en.split()) if c.isupper()]) * 1.0 / len(''.join(sent_en.split()))
    upper_fr = sum([1 for c in ''.join(sent_fr.split()) if c.isupper()]) * 1.0 / len(''.join(sent_fr.split()))
    
    if abs(upper_en - upper_fr) > 0.5:
        continue
    
    # sentence index
    idx_flag = False
    for idx in range(1, 100):
        pfa = str(idx)+'. '
        pfb = str(idx)+' . '
        enj = sent_en.startswith(pfa) or sent_en.startswith(pfb)
        frj = sent_fr.startswith(pfa) or sent_fr.startswith(pfb)
        
        if enj and frj:
            sent_en = sent_en.split('.', 1)[1].strip()
            sent_fr = sent_fr.split('.', 1)[1].strip()
            break
        
        if enj != frj:
            idx_flag = True
            break
    if idx_flag:
        continue
    
    # sentence index
    idx_flag = False
    for idx in range(1, 200):
        pfa = str(idx)+'. '
        pfb = str(idx)+'. '
        enj = sent_en.startswith(pfa) or sent_en.startswith(pfb)
        frj = sent_fr.startswith(pfa) or sent_fr.startswith(pfb)
        
        if enj and frj:
            sent_en = sent_en.split('.', 1)[1].strip()
            sent_fr = sent_fr.split('.', 1)[1].strip()
            break
        
        if enj != frj:
            idx_flag = True
            break
    if idx_flag:
        continue
    
    idx_flag = False
    for idx in range(1, 200):
        
        pfa = '('+str(idx)+')'
        pfb = str(idx)+')'
        enj = sent_en.startswith(pfa) or sent_en.startswith(pfb)
        frj = sent_fr.startswith(pfa) or sent_fr.startswith(pfb)
        
        if enj and frj:
            sent_en = sent_en.split(')', 1)[1].strip()
            sent_fr = sent_fr.split(')', 1)[1].strip()
            break
        
        if enj != frj:
            idx_flag = True
            break
    if idx_flag:
        continue
    
    idx_flag = False
    for lidx in ['a', 'b', 'c', 'd', 'e', 'f', 'g',  
                 'A', 'B', 'C', 'D', 'E', 'F', 'G']:
        
        pfa = '('+lidx+')'
        pfb = lidx+')'
        enj = sent_en.startswith(pfa) or sent_en.startswith(pfb)
        frj = sent_fr.startswith(pfa) or sent_fr.startswith(pfb)
        
        if enj and frj:
            sent_en = sent_en.split(')', 1)[1].strip()
            sent_fr = sent_fr.split(')', 1)[1].strip()
            break
        
        if enj != frj:
            idx_flag = True
            break
    if idx_flag:
        continue
    
    idx_flag = False
    for idx in range(1, 4):
        
        pfa = '('+'i'*idx+')'
        pfb = 'i'*idx+')'
        enj = sent_en.startswith(pfa) or sent_en.startswith(pfb)
        frj = sent_fr.startswith(pfa) or sent_fr.startswith(pfb)
        
        if enj and frj:
            sent_en = sent_en.split(')', 1)[1].strip()
            sent_fr = sent_fr.split(')', 1)[1].strip()
            break
        
        if enj != frj:
            idx_flag = True
            break
    if idx_flag:
        continue
    
    idx_flag = False
    for lidx in ['- ']:
        
        enj = sent_en.startswith(lidx)
        frj = sent_fr.startswith(lidx)
        
        if enj and frj:
            sent_en = sent_en[len(lidx):]
            sent_fr = sent_fr[len(lidx):]
            break
        
        if enj != frj:
            idx_flag = True
            break
    if idx_flag:
        continue
    
    # replace special
    sent_en = sent_en.replace('%quot%', '"')
    sent_fr = sent_fr.replace('%quot%', '"')
    
    sent_en = sent_en.replace('º', 'o')
    sent_fr = sent_fr.replace('º', 'o')
    
    
    # append
    if (sent_en not in anti_dup) and (sent_fr not in anti_dup):
        anti_dup.add(sent_en)
        anti_dup.add(sent_fr)
        
        jrc_en.append(sent_en)
        jrc_fr.append(sent_fr)

In [220]:
temp_en, temp_fr = pair_mislength(jrc_en, jrc_fr)
temp_en, temp_fr = pair_mislang(temp_en, temp_fr, 'en', 'fr', './model/lid.176.bin', 1)
jrc_en_clean, jrc_fr_clean = temp_en, temp_fr

remove mislength from langauge pair
from 579601 to 576223
remove mislength from langauge pair
from 576223 to 523722


In [316]:
win = 3
sid = random.randint(0, len(jrc_en_clean)-win-1)
for idx in range(sid, sid+win):
    print(jrc_en_clean[idx], '||', jrc_fr_clean[idx])

4.31. The Commission is of the opinion that it would not be appropriate in the AAR to distinguish, as the Court suggests, between expenditure which has already been verified and expenditure which will be subject to further verification. Given the multi-annual nature of the conformity clearance process, such a distinction would be largely arbitrary; it would also prejudice the Member States' right of defence in the context of the clearance of accounts procedure. || 4.31. Contrairement à ce que suggère la Cour, la Commission considère comme inapproprié le fait d'établir une distinction dans le RAA entre les dépenses déjà contrôlées et celles faisant l'objet de vérifications supplémentaires. Compte tenu du caractère pluriannuel de la procédure d'apurement de conformité, une telle distinction serait en grande partie arbitraire et porterait également atteinte au droit de réponse des États membres dans le cadre de la procédure d'apurement des comptes.
The Director-General’s declaration and t

In [317]:
save_text(jrc_en_clean, './data/clean/jrc-enfr-clean.en')
save_text(jrc_fr_clean, './data/clean/jrc-enfr-clean.fr')

# giga

In [318]:
giga_en = load_text('./data/raw/giga/giga-fren.release2.fixed.en')
giga_fr = load_text('./data/raw/giga/giga-fren.release2.fixed.fr')

In [319]:
temp_en, temp_fr = pair_deduplicate(giga_en, giga_fr)
temp_en, temp_fr = pair_mislength(temp_en, temp_fr, ratio=1.5)
temp_en, temp_fr = pair_mislang(temp_en, temp_fr, 'en', 'fr', './model/lid.176.bin', 1)
giga_en_clean, giga_fr_clean = temp_en, temp_fr

remove duplicate from langauge pair
from 22520376 to 19630451
remove mislength from langauge pair
from 19630451 to 17333151
remove mislength from langauge pair
from 17333151 to 16722019


In [363]:
win = 3
sid = random.randint(0, len(giga_en_clean)-win-1)
for idx in range(sid, sid+win):
    print(giga_en_clean[idx], '||', giga_fr_clean[idx])

Additional details on submitting comments and the Government’s consultation process can be found on the Department of Finance Web site at www.fin.gc.ca. Finally, as committed to in the Government’s response to the Standing Committee on Human Resources Development and the Status of Persons with Disabilities, in 2003 the Department of Finance will initiate an evaluation of the DTC to ensure that it achieves its policy intent of providing tax assistance to persons with a severe and prolonged mental or physical impairment, the effects of which markedly restrict the ability to perform a basic activity of daily living. || On peut trouver d’autres précisions sur la présentation de commentaires et sur le processus de consultation du gouvernement sur le site Web du ministère des Finances à l’adresse www.fin.gc.ca. Enfin, conformément à la réponse du gouvernement au Comité permanent du développement des ressources humaines et de la condition des personnes handicapées, le ministère des Finances e

In [364]:
save_text(giga_en_clean, './data/clean/giga-enfr-clean.en')
save_text(giga_fr_clean, './data/clean/giga-enfr-clean.fr')

# un

In [365]:
un_en = load_text('./data/raw/un/undoc.2000.fr-en.en')
un_fr = load_text('./data/raw/un/undoc.2000.fr-en.fr')

In [366]:
temp_en, temp_fr = pair_deduplicate(un_en, un_fr)
temp_en, temp_fr = pair_mislength(temp_en, temp_fr, ratio=1.5)
temp_en, temp_fr = pair_mislang(temp_en, temp_fr, 'en', 'fr', './model/lid.176.bin', 1)
un_en_clean, un_fr_clean = temp_en, temp_fr

remove duplicate from langauge pair
from 12886831 to 9313528
remove mislength from langauge pair
from 9313528 to 8658767
remove mislength from langauge pair
from 8658767 to 8481341


In [407]:
win = 3
sid = random.randint(0, len(un_en_clean)-win-1)
for idx in range(sid, sid+win):
    print(un_en_clean[idx], '||', un_fr_clean[idx])

One or more creditors that are owed a matured debt, in which case the creditor(s) should show that the debt has matured and is unpaid.” || Un ou plusieurs créanciers détenant une créance échue, auquel cas il(s) devrai(en)t démontrer que cette créance est échue et impayée.”
It was noted that that proposal was intended to establish minimum agreed entry criteria, and that the draft Guide could note and discuss potential variations, such as a requirement for a minimum amount of debt or that the debt need not be mature. || Il a été noté que cette proposition visait à établir des critères d'admissibilité convenus minimaux et que le projet de guide pourrait indiquer et examiner des variantes possibles, par exemple qu'il faut un montant minimum de créances ou qu'il n'est pas nécessaire que la créance soit échue.
While there was agreement for taking that general approach, some support was expressed in favour of the test being that the debtor “is unable or will be unable to pay its debts as and 

In [408]:
save_text(un_en_clean, './data/clean/un-enfr-clean.en')
save_text(un_fr_clean, './data/clean/un-enfr-clean.fr')

# all

In [15]:
all_en = []
all_fr = []
for pref_path in list(set([s.rsplit('.', 1)[0] for s in glob.glob('./data/clean/*')])):
    all_en += load_text(pref_path+'.en')
    all_fr += load_text(pref_path+'.fr')

In [86]:
win = 3
sid = random.randint(0, len(all_en)-win-1)
for idx in range(sid, sid+win):
    print(all_en[idx], '||', all_fr[idx])

Without appropriate policy frameworks to address the sector issues, and without having a national energy strategy for Canada, regulations appear to be headed for even greater complexities. || Sans les politiques cadres permettant de s’attaquer aux enjeux du secteur et sans une stratégie énergétique nationale, la réglementation semble devoir se complexifier encore davantage.
• Access to capital, and private sector investment in the energy sector, is also considered an issue by the industry. || • L’industrie considère également que l’accès au capital ainsi que l’investissement privé dans le secteur énergétique sont un enjeu.
• There is a continuing need for government funding for R&D in the energy sector, especially to address the need for developing and adopting SD tools, practices and technologies applicable to the sector. || • Ce secteur a continuellement besoin de fonds gouvernementaux pour sa R­D, particulièrement pour répondre aux besoins d’élaborer et d’adopter des outils, pratiqu

In [87]:
temp_en, temp_fr = pair_deduplicate(all_en, all_fr)
all_en_clean, all_fr_clean = pair_oversize(temp_en, temp_fr, th_num=80)

remove duplicate from langauge pair
from 27884964 to 27741249
remove oversize pair from langauge pair
from 27741249 to 26922456
