In [1]:
import csv
import xml.etree.ElementTree as ET

import fastText
import numpy as np
import pandas as pd
import spacy

# source

In [2]:
# load text from file
def load_text(file_path, encoding='UTF-8'):
    """Read a text file into a list with one entry per line.

    Args:
        file_path: path of the file to read.
        encoding: text encoding used to decode the file.

    Returns:
        A list of strings, one per line, each with leading and trailing
        whitespace (including the line terminator) removed. Blank lines are
        kept as empty strings so line positions are preserved.
    """
    with open(file_path, 'r', encoding=encoding) as handle:
        return [raw_line.strip() for raw_line in handle]

In [3]:
# get language code by fasttext
def get_lang(docs, model_path, kmost=1):
    """Run the fastText language-identification model over `docs`.

    Args:
        docs: the texts to classify, passed straight to `predict`.
        model_path: path of the fastText model file to load.
        kmost: number of top predictions requested per text.

    Returns:
        Whatever `predict` returns, unmodified. Callers in this file index
        element 0 as the per-document list of label strings.

    Note:
        The model is loaded from disk on every call.
    """
    model = fastText.load_model(model_path)
    predictions = model.predict(docs, kmost)
    return predictions

In [4]:
# save text to file
def save_text(text, file_path, encoding='UTF-8'):
    """Write an iterable of strings to a file, one string per line.

    Args:
        text: iterable of strings; they are joined with newlines.
        file_path: destination path (overwritten if it exists).
        encoding: text encoding used to encode the file.

    The output always ends with a newline, so an empty `text` produces a
    file containing a single newline.
    """
    with open(file_path, 'w', encoding=encoding) as out:
        out.write('\n'.join(text) + '\n')

# filter

In [5]:
# remove duplicate pair
def pair_deduplicate(doc_left, doc_right):
    """Drop every pair in which either sentence has been seen before.

    Sentences from both sides share one "seen" set, and a sentence is
    recorded as seen even when its pair is dropped, so a dropped pair still
    blocks later pairs that repeat either of its sentences.

    Args:
        doc_left: sentences of the first language.
        doc_right: sentences of the second language, aligned by position.

    Returns:
        (kept_left, kept_right): the surviving sentences, still aligned.
    """
    seen = set()
    kept_left, kept_right = [], []

    for left, right in zip(doc_left, doc_right):
        already_seen = left in seen or right in seen
        seen.update((left, right))
        if not already_seen:
            kept_left.append(left)
            kept_right.append(right)

    # progress report: size before and after the filter
    print('remove duplicate from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(kept_left)))

    return kept_left, kept_right

In [6]:
# remove mislength pair
def pair_mislength(doc_left, doc_right, ratio=1.8, verbose=False):
    """Drop pairs whose whitespace-token counts differ too much.

    A pair is kept when either
      * each side has fewer tokens than `ratio` times the other side, or
      * at least one side has fewer than 8 tokens and the token counts
        differ by fewer than 8 (short sentences get an absolute tolerance).

    Args:
        doc_left: sentences of the first language.
        doc_right: sentences of the second language, aligned by position.
        ratio: maximum allowed length ratio between the two sides.
        verbose: when True, print every rejected pair.

    Returns:
        (kept_left, kept_right): the surviving sentences, still aligned.
    """
    kept_left, kept_right = [], []

    for left, right in zip(doc_left, doc_right):
        n_left = len(left.split())
        n_right = len(right.split())

        within_ratio = n_left < n_right * ratio and n_right < n_left * ratio
        short_and_close = min(n_left, n_right) < 8 and abs(n_left - n_right) < 8

        if within_ratio or short_and_close:
            kept_left.append(left)
            kept_right.append(right)
            continue

        if verbose:
            print(left, '||', right)

    # progress report: size before and after the filter
    print('remove mislength from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(kept_left)))

    return kept_left, kept_right

In [7]:
# remove mislang pair
def pair_mislang(doc_left, doc_right, lang_left, lang_right, model_path, kmost=2, verbose=False):
    """Drop pairs whose detected language does not match the expected one.

    Each side is classified with `get_lang`; a pair is kept only when
    `lang_left` is among the top-`kmost` predictions for the left sentence
    and `lang_right` is among the top-`kmost` predictions for the right one.

    Args:
        doc_left: sentences expected to be in `lang_left`.
        doc_right: sentences expected to be in `lang_right`, aligned by position.
        lang_left: language code expected for `doc_left` (e.g. 'en').
        lang_right: language code expected for `doc_right` (e.g. 'fr').
        model_path: path of the fastText language-identification model.
        kmost: number of top predictions considered per sentence.
        verbose: when True, print every rejected pair.

    Returns:
        (kept_left, kept_right): the surviving sentences, still aligned.
    """
    # NOTE(review): get_lang loads the model on each call, so it is loaded twice here.
    res_left = get_lang(doc_left, model_path, kmost)
    res_right = get_lang(doc_right, model_path, kmost)

    doc_left_clean = []
    doc_right_clean = []

    for idx in range(len(doc_left)):

        # keep only the part after the last '__' of each predicted label
        # (presumably '__label__en' -> 'en'; confirm against the model's label format)
        rl = [label.split('__')[-1] for label in res_left[0][idx]]
        rr = [label.split('__')[-1] for label in res_right[0][idx]]

        if (lang_left in rl) and (lang_right in rr):
            doc_left_clean.append(doc_left[idx])
            doc_right_clean.append(doc_right[idx])
        elif verbose:
            print(doc_left[idx], '||', doc_right[idx])

    # progress report; the message names this filter (it used to repeat
    # pair_mislength's text, making the two steps indistinguishable in logs)
    print('remove mislang from langauge pair')
    print('from {0} to {1}'.format(len(doc_left), len(doc_left_clean)))

    return doc_left_clean, doc_right_clean

# europarl

In [8]:
# read both sides of the Europarl fr-en corpus (English first, then French)
europarl_en, europarl_fr = [
    load_text(corpus_path)
    for corpus_path in (
        './data/raw/europarl/europarl-v7.fr-en.en',
        './data/raw/europarl/europarl-v7.fr-en.fr',
    )
]

In [9]:
# run the filters in order: duplicates -> length mismatch -> wrong language
temp_en, temp_fr = europarl_en, europarl_fr
for pair_filter in (pair_deduplicate, pair_mislength):
    temp_en, temp_fr = pair_filter(temp_en, temp_fr)
temp_en, temp_fr = pair_mislang(
    temp_en, temp_fr,
    lang_left='en', lang_right='fr',
    model_path='./model/lid.176.bin', kmost=2,
)
europarl_en_clean, europarl_fr_clean = temp_en, temp_fr

remove duplicate from langauge pair
from 2007723 to 1943690
remove mislength from langauge pair
from 1943690 to 1930782
remove mislength from langauge pair
from 1930782 to 1928347


In [10]:
# print a small window of aligned pairs as a manual sanity check
sid, win = 45000, 3
for idx in range(sid, sid + win):
    en_sent, fr_sent = europarl_en_clean[idx], europarl_fr_clean[idx]
    print(en_sent, '||', fr_sent)

Mr President, I would like to preface my few remarks by echoing comments made by other speakers in thanking Mrs Palacio for the work that she has done on this report. || Monsieur le Président, avant de formuler mes remarques, je voudrais me faire l'écho des commentaires faits par d'autres orateurs et remercier Mme Palacio pour le travail qu'elle a accompli en rédigeant ce rapport.
It is not merely that she is not proposing any amendments - although that is an example that some rapporteurs might follow with advantage, rather it is because she has recognised the overriding importance of taking this legislation forward and putting it on the statute book. || Et ce n'est pas simplement parce qu'elle ne propose aucun amendement - bien qu'il serait avantageux pour certains rapporteurs de suivre cet exemple - mais plutôt parce qu'elle a reconnu qu'il était d'une importance primordiale de faire avancer cette législation et de l'inscrire dans les textes de loi.
As has been mentioned this evening

In [12]:
# write the cleaned Europarl corpus, one file per language
for clean_docs, out_path in (
    (europarl_en_clean, './data/clean/europarl-enfr-clean.en'),
    (europarl_fr_clean, './data/clean/europarl-enfr-clean.fr'),
):
    save_text(clean_docs, out_path)

# tatoeba

In [13]:
# Sentence table: one row per sentence (label, language code, text).
# quoting=csv.QUOTE_NONE: sentence text may itself start with or contain double
#   quotes; with the default quoting pandas would treat them as field quoting,
#   stripping them or merging following rows into one field.
# keep_default_na=False: keep sentences such as "NA" or "null" as text instead of
#   converting them to NaN, which would break the later .strip() on the text column.
dfBase = pd.read_csv(
    './data/raw/tatoeba/sentences.csv',
    sep='\t',
    header=None,
    names=['label', 'lang', 'text'],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
dfBase = dfBase.set_index('label')
# Link table: each row joins two sentence labels (presumably translations of each
# other; confirm against the dump's documentation).
dfLink = pd.read_csv('./data/raw/tatoeba/links.csv', sep='\t', header=None, names=['ida', 'idb'])

In [14]:
# labels of the English and the French sentences
label_en = set(dfBase.loc[dfBase['lang'] == 'eng'].index.tolist())
label_fr = set(dfBase.loc[dfBase['lang'] == 'fra'].index.tolist())

# collect every link joining an English and a French sentence, always stored
# as (english_label, french_label) whichever direction the link was written in
lang_pair = set()
for ida, idb in zip(dfLink['ida'].tolist(), dfLink['idb'].tolist()):
    if ida in label_en and idb in label_fr:
        lang_pair.add((ida, idb))
    elif idb in label_en and ida in label_fr:
        lang_pair.add((idb, ida))
lang_enfr = list(lang_pair)

In [15]:
# look up the text of both sentences of every (english_label, french_label) pair
text_by_label = dfBase['text']
tatoeba_en = [text_by_label.loc[en_label].strip() for en_label, _ in lang_enfr]
tatoeba_fr = [text_by_label.loc[fr_label].strip() for _, fr_label in lang_enfr]
print('tatoeba has {} sentences pair for english and french'.format(len(tatoeba_en)))

tatoeba has 231165 sentences pair for english and french


In [16]:
# print a small window of aligned pairs as a manual sanity check
sid, win = 50000, 3
for idx in range(sid, sid + win):
    en_sent, fr_sent = tatoeba_en[idx], tatoeba_fr[idx]
    print(en_sent, '||', fr_sent)

We couldn't carry out our project because of a lack of funds. || Nous n'avons pu mener à bien notre projet à cause d'un manque de fonds.
Yes, I have smoked crack cocaine. || Oui, j'ai fumé du crack.
I don't have a prejudice against foreign workers. || Je n'ai pas de préjugé contre les travailleurs étrangers.


In [17]:
# write the Tatoeba pairs, one file per language
# (no pair_* filter is applied to this corpus before saving)
for pair_docs, out_path in (
    (tatoeba_en, './data/clean/tatoeba-enfr-clean.en'),
    (tatoeba_fr, './data/clean/tatoeba-enfr-clean.fr'),
):
    save_text(pair_docs, out_path)

# jrc

In [8]:
# Parse the aligned en-fr JRC corpus; ET.parse reads the whole XML document into memory.
tree = ET.parse('./data/raw/jrc/alignedCorpus-en-fr.xml')
# root element of the document; the extraction cell searches it for <link> elements
root = tree.getroot()

In [9]:
jrc_en = []
jrc_fr = []


def _jrc_segment_text(segment):
    """Return the text of an <s1>/<s2> element, joining nested <p> children with spaces."""
    paragraphs = segment.findall('p')
    if not paragraphs:
        # an element without text yields None; treat it as empty instead of crashing
        return segment.text or ''
    return ' '.join(p.text or '' for p in paragraphs)


def _jrc_is_noise(sentence):
    """Tell whether a sentence is boilerplate that should not be used for training."""
    return (
        sentence.startswith('Article ')
        or sentence.startswith('[1]')
        or '%gt%' in sentence
        or 'http://' in sentence
        or 'https://' in sentence
        or (sentence.startswith('(') and sentence.endswith(')'))
        # three or more '|' separators, i.e. more than three '|'-delimited cells
        or sentence.count('|') >= 3
    )


def _jrc_upper_ratio(sentence):
    """Return the share of non-whitespace characters that are upper case."""
    chars = ''.join(sentence.split())
    if not chars:
        return 0.0
    return sum(1 for c in chars if c.isupper()) * 1.0 / len(chars)


def _jrc_strip_marker(sent_en, sent_fr, marker_groups, delimiter):
    """Remove a leading enumeration marker that both sentences share.

    `marker_groups` is a sequence of tuples; each tuple lists the accepted
    spellings of one marker. Groups are tried in order and the first group
    matched by either sentence decides the outcome:
      * both sentences start with it -> both are cut after the first
        `delimiter` and stripped;
      * only one sentence starts with it -> None (the pair is misaligned).
    When no group matches, the sentences are returned unchanged.
    """
    for markers in marker_groups:
        has_en = sent_en.startswith(markers)
        has_fr = sent_fr.startswith(markers)
        if has_en and has_fr:
            return (sent_en.split(delimiter, 1)[1].strip(),
                    sent_fr.split(delimiter, 1)[1].strip())
        if has_en != has_fr:
            return None
    return sent_en, sent_fr


# enumeration markers, applied in this order: "1." / "1 .", "(a)" / "a)", "(i)" / "i)"
_JRC_MARKER_STAGES = [
    ([(str(n) + '.', str(n) + ' .') for n in range(1, 6)], '.'),
    ([('(' + c + ')', c + ')') for c in ['a', 'b', 'c', 'd', 'A', 'B', 'C', 'D']], ')'),
    ([('(' + 'i' * k + ')', 'i' * k + ')') for k in range(1, 4)], ')'),
]

# extract and clean
anti_dup = set()
for t in root.findall('.//link'):

    # get text: s1 is used as the English side and s2 as the French side;
    # either may hold its text directly or inside <p> children
    s1, s2 = t.find('s1'), t.find('s2')
    if s1 is None or s2 is None:
        continue
    sent_en = _jrc_segment_text(s1).strip()
    sent_fr = _jrc_segment_text(s2).strip()

    # a blank side cannot form a usable pair (and would make the
    # upper-case ratio below divide by zero)
    if not sent_en or not sent_fr:
        continue

    # useless line
    if _jrc_is_noise(sent_en) or _jrc_is_noise(sent_fr):
        continue

    # mismatch upper percentage
    if abs(_jrc_upper_ratio(sent_en) - _jrc_upper_ratio(sent_fr)) > 0.5:
        continue

    # sentence index: strip shared markers, drop the pair when only one side has one
    pair = (sent_en, sent_fr)
    for marker_groups, delimiter in _JRC_MARKER_STAGES:
        pair = _jrc_strip_marker(pair[0], pair[1], marker_groups, delimiter)
        if pair is None:
            break
    if pair is None:
        continue
    sent_en, sent_fr = pair

    # leading dash: same rule, but only the two marker characters are removed
    dash_en = sent_en.startswith('- ')
    dash_fr = sent_fr.startswith('- ')
    if dash_en != dash_fr:
        continue
    if dash_en:
        sent_en = sent_en[len('- '):]
        sent_fr = sent_fr[len('- '):]

    # replace special
    sent_en = sent_en.replace('%quot%', '"').replace('º', 'o')
    sent_fr = sent_fr.replace('%quot%', '"').replace('º', 'o')

    # append, skipping any pair that repeats an already kept sentence
    if (sent_en not in anti_dup) and (sent_fr not in anti_dup):
        anti_dup.add(sent_en)
        anti_dup.add(sent_fr)

        jrc_en.append(sent_en)
        jrc_fr.append(sent_fr)

In [10]:
# duplicates were already removed during extraction, so only the
# length and language filters are applied here
temp_en, temp_fr = pair_mislength(jrc_en, jrc_fr)
temp_en, temp_fr = pair_mislang(
    temp_en, temp_fr,
    lang_left='en', lang_right='fr',
    model_path='./model/lid.176.bin', kmost=2,
)
jrc_en_clean, jrc_fr_clean = temp_en, temp_fr

remove mislength from langauge pair
from 673595 to 669919
remove mislength from langauge pair
from 669919 to 626900


In [21]:
# print a small window of aligned pairs as a manual sanity check
sid, win = 59000, 3
for idx in range(sid, sid + win):
    en_sent, fr_sent = jrc_en_clean[idx], jrc_fr_clean[idx]
    print(en_sent, '||', fr_sent)

shall not issue the document referred to in Article 10 (1), final indent, of Directive 74/150/EEC in respect of a type of tractor, the operating space, access to the driving position, doors and windows of which do not comply with the provisions of this Directive; || ne peuvent plus délivrer le document prévu à l'article 10 paragraphe 1 dernier tiret de la directive 74/150/CEE pour un type de tracteur dont l'espace de manoeuvre, les facilités d'accès au poste de conduite, les portes et les fenêtres ne répondent pas aux prescriptions de la présente directive,
may refuse to grant national type-approval in respect of a type of tractor, the operating space, access to the driving position, doors and windows of which do not comply with the provisions of this Directive. || peuvent refuser la réception de portée nationale d'un type de tracteur dont l'espace de manoeuvre, les facilités d'accès au poste de conduite, les portes et les fenêtres ne répondent pas aux prescriptions de la présente dire

In [13]:
# write the cleaned JRC corpus, one file per language
for clean_docs, out_path in (
    (jrc_en_clean, './data/clean/jrc-enfr-clean.en'),
    (jrc_fr_clean, './data/clean/jrc-enfr-clean.fr'),
):
    save_text(clean_docs, out_path)

# giga

In [11]:
# read both sides of the Giga fr-en corpus (English first, then French)
giga_en, giga_fr = [
    load_text(corpus_path)
    for corpus_path in (
        './data/raw/giga/giga-fren.release2.fixed.en',
        './data/raw/giga/giga-fren.release2.fixed.fr',
    )
]

In [12]:
# run the filters in order: duplicates -> length mismatch -> wrong language
temp_en, temp_fr = giga_en, giga_fr
for pair_filter in (pair_deduplicate, pair_mislength):
    temp_en, temp_fr = pair_filter(temp_en, temp_fr)
temp_en, temp_fr = pair_mislang(
    temp_en, temp_fr,
    lang_left='en', lang_right='fr',
    model_path='./model/lid.176.bin', kmost=2,
)
giga_en_clean, giga_fr_clean = temp_en, temp_fr

remove duplicate from langauge pair
from 22520376 to 19630451
remove mislength from langauge pair
from 19630451 to 18958975
remove mislength from langauge pair
from 18958975 to 18640137


In [20]:
# print a small window of aligned pairs as a manual sanity check
sid, win = 1171100, 3
for idx in range(sid, sid + win):
    en_sent, fr_sent = giga_en_clean[idx], giga_fr_clean[idx]
    print(en_sent, '||', fr_sent)

He wants to understand how the process of this genetic re-creation can occur so quickly, and identify the cellular steps that transform it into a potentially fatal tumour. || Il veut comprendre comment le processus de cette recréation génétique peut être aussi rapide et déterminer les stades cellulaires qui conduisent à une tumeur potentiellement mortelle.
Minister Alcock highlights the recipients of 19 new health research grants worth $6.2 million for Manitoba Backgrounder on Featured Researchers - Manitoba [ Press release 2005-13 ] Peacekeepers and mental health Members of the Canadian Armed Forces are increasingly involved in peacekeeping operations, which can lead to emotional problems including depression, anxiety disorders and suicidal behaviour. || Le ministre Alcock souligne les titulaires de 19 nouvelles subventions de recherche en santé d'une valeur de 6,2 M$ pour le Manitoba Fiche d'information sur les chercheurs en vedette - Manitoba [ Communiqué 2005-13 ] Les membres des o

In [21]:
# write the cleaned Giga corpus, one file per language
for clean_docs, out_path in (
    (giga_en_clean, './data/clean/giga-enfr-clean.en'),
    (giga_fr_clean, './data/clean/giga-enfr-clean.fr'),
):
    save_text(clean_docs, out_path)

# un

In [8]:
# read both sides of the UN fr-en corpus (English first, then French)
un_en, un_fr = [
    load_text(corpus_path)
    for corpus_path in (
        './data/raw/un/undoc.2000.fr-en.en',
        './data/raw/un/undoc.2000.fr-en.fr',
    )
]

In [9]:
# run the filters in order: duplicates -> length mismatch -> wrong language
temp_en, temp_fr = un_en, un_fr
for pair_filter in (pair_deduplicate, pair_mislength):
    temp_en, temp_fr = pair_filter(temp_en, temp_fr)
temp_en, temp_fr = pair_mislang(
    temp_en, temp_fr,
    lang_left='en', lang_right='fr',
    model_path='./model/lid.176.bin', kmost=2,
)
un_en_clean, un_fr_clean = temp_en, temp_fr

remove duplicate from langauge pair
from 12886831 to 9313528
remove mislength from langauge pair
from 9313528 to 9181938
remove mislength from langauge pair
from 9181938 to 9076185


In [14]:
# print a small window of aligned pairs as a manual sanity check
sid, win = 271100, 3
for idx in range(sid, sid + win):
    en_sent, fr_sent = un_en_clean[idx], un_fr_clean[idx]
    print(en_sent, '||', fr_sent)

There were few established national mechanisms to implement policies and programmes for the girl child and, in some cases, coordination among responsible institutions was insufficient. || Les mécanismes nationaux nécessaires pour appliquer des politiques et programmes en faveur de la fillette sont rares et, dans certains cas, la coordination entre les entités compétentes s'est révélée insuffisante.
Adolescents continue to lack the education and service needed to enable them to deal in a positive and responsible way with their sexuality. || Les adolescents continuent d'être privés de l'instruction et des services dont ils auraient besoin pour pouvoir assumer leur sexualité de manière positive et responsable.
Since 1995, a number of issues have gained prominence and acquired new dimensions which pose additional challenges to the full and accelerated implementation of the Platform in order to realize gender equality, development and peace by Governments, intergovernmental bodies, internat

In [15]:
# write the cleaned UN corpus, one file per language
for clean_docs, out_path in (
    (un_en_clean, './data/clean/un-enfr-clean.en'),
    (un_fr_clean, './data/clean/un-enfr-clean.fr'),
):
    save_text(clean_docs, out_path)