In [1]:
import pandas as pd
import re
from random import randint
from collections import Counter
from tqdm import tqdm

#### Regex that splits text into words, with each punctuation mark as its own token

In [2]:
# Tokenizer pattern: a token is either a run of word characters (\w+) or a
# single character that is neither a word character nor whitespace, so every
# punctuation mark comes out as its own token.
split_condition = re.compile(
    r"\w+|[^\w\s]",
    re.UNICODE,
)

#### Read the raw data from file into a DataFrame

In [28]:
# Load both sides of the parallel corpus, one list entry per line.
# Lines keep their trailing newline, exactly as read from disk.
en_text,fr_text=[],[]
with open('data/europarl-v7.fr-en.en','rt',encoding='utf-8') as f:
    en_text.extend(f)
with open('data/europarl-v7.fr-en.fr','rt',encoding='utf-8') as f:
    fr_text.extend(f)

In [30]:
# Write a small sample of the corpus (the first `sample_size` line pairs) to
# disk for quick experiments.
#
# The sample is stored under its own names. Rebinding en_text/fr_text here
# would silently shrink the corpus for every later cell that reads them
# (e.g. the DataFrame construction) when the notebook is run top to bottom.
sample_size=100
en_text_small=en_text[:sample_size]
fr_text_small=fr_text[:sample_size]
with open('data/europarl-v7.fr-en_small.en','wt',encoding='utf-8') as f:
    f.writelines(en_text_small)
with open('data/europarl-v7.fr-en_small.fr','wt',encoding='utf-8') as f:
    f.writelines(fr_text_small)

In [4]:
# One row per sentence pair: the English line and the French line at the
# same position in the two lists.
corpus_columns={'English':en_text,'French':fr_text}
df=pd.DataFrame(corpus_columns)

In [5]:
# Display (number of sentence pairs, number of columns) as a sanity check.
df.shape

(2007723, 2)

#### Count the number of words in each row

In [23]:
def get_number_of_words(text,reexp):
    """Return the number of pieces `text` breaks into on single spaces.

    Splitting on exactly one space character yields one more piece than
    there are spaces, so consecutive spaces produce empty pieces that are
    counted and an empty string counts as one piece. Punctuation and a
    trailing newline stay attached to their neighbouring word.

    Parameters
    ----------
    text : str
        The sentence to measure.
    reexp : compiled regular expression
        Accepted for interface compatibility; not used by the count.

    Returns
    -------
    int
        Number of single-space-separated pieces in `text`.
    """
    return text.count(' ')+1

In [24]:
# Add a word-count column next to each text column, then preview the frame.
for text_col,count_col in (('English','English_WordCount'),('French','French_WordCount')):
    df[count_col]=df[text_col].apply(get_number_of_words,args=(split_condition,))
df.head()

Unnamed: 0,English,French,English_WordCount,French_WordCount,E,F,Threshold
0,Resumption of the session\n,Reprise de la session\n,4,4,0,0,0
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,38,33,0,0,0
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog...",31,37,0,0,0
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,19,19,0,0,0
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...",40,38,0,0,0


In [25]:
# Show one randomly chosen sentence pair together with its regex tokenization.
# randint includes BOTH end points, so the upper bound has to be the last valid
# row position (row count - 1); otherwise df.iloc[i] can raise IndexError.
i=randint(0,df.shape[0]-1)
row=df.iloc[i]  # look the row up once instead of once per print
print(row['English'])
print(split_condition.findall(row['English']))
print(row['French'])
print(split_condition.findall(row['French']))

That leaves, Mr Söderman, the massive problem of informing the public, creating public awareness, of this new institution whose work you are pioneering today.

['That', 'leaves', ',', 'Mr', 'Söderman', ',', 'the', 'massive', 'problem', 'of', 'informing', 'the', 'public', ',', 'creating', 'public', 'awareness', ',', 'of', 'this', 'new', 'institution', 'whose', 'work', 'you', 'are', 'pioneering', 'today', '.']
Reste, Monsieur le Médiateur, l'immense problème de l'information et de la sensibilisation des citoyens à cette nouvelle institution, qu'en pionnier vous incarnez aujourd'hui.

['Reste', ',', 'Monsieur', 'le', 'Médiateur', ',', 'l', "'", 'immense', 'problème', 'de', 'l', "'", 'information', 'et', 'de', 'la', 'sensibilisation', 'des', 'citoyens', 'à', 'cette', 'nouvelle', 'institution', ',', 'qu', "'", 'en', 'pionnier', 'vous', 'incarnez', 'aujourd', "'", 'hui', '.']


In [26]:
# Flag rows whose word count exceeds the threshold: E for the English side,
# F for the French side. Threshold is the number of sides over the limit (0-2).
threshold=100
for flag_col,count_col in (('E','English_WordCount'),('F','French_WordCount')):
    df[flag_col]=df[count_col].apply(lambda n:int(n>threshold))
df['Threshold']=df['E'].add(df['F'])
df.head()

Unnamed: 0,English,French,English_WordCount,French_WordCount,E,F,Threshold
0,Resumption of the session\n,Reprise de la session\n,4,4,0,0,0
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,38,33,0,0,0
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog...",31,37,0,0,0
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,19,19,0,0,0
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...",40,38,0,0,0


In [27]:
# Rows where at least one side is longer than the threshold, followed by the
# total row count and the number of such rows.
over_limit=df['Threshold']>0
temp=df[over_limit]
print(len(df.index),len(temp.index))

2007723 4815


### Count word frequencies

In [11]:
# Frequency of every lower-cased token (words and punctuation marks) across
# the English column.
english_words=Counter()
for sentence in tqdm(df['English'].tolist()):
    english_words.update(split_condition.findall(sentence.lower()))
#english_words

100%|█████████████████████████████████████████████████████████████████████| 2007723/2007723 [00:40<00:00, 49683.71it/s]


In [77]:
# Frequency of every lower-cased token (words and punctuation marks) across
# the French column.
french_words=Counter()
for sentence in tqdm(df['French'].tolist()):
    french_words.update(split_condition.findall(sentence.lower()))
#french_words

100%|█████████████████████████████████████████████████████████████████████| 2007723/2007723 [00:50<00:00, 39875.19it/s]


In [78]:
len(english_words)

84413

In [79]:
len(french_words)

115410

In [80]:
# Collect the tokens that occur at most `limit` times in each language.
# The comparison uses `limit`, so changing the value above actually changes
# the cut-off (it was previously hard-coded to 1 in both loops).
limit=1
low_freq_en=[word for word,count in english_words.items() if count<=limit]
low_freq_fr=[word for word,count in french_words.items() if count<=limit]

In [81]:
# Number of low-frequency tokens: English first, then French, one per line.
print(len(low_freq_en),len(low_freq_fr),sep='\n')

27689
36554


In [31]:
# Preview a handful of the low-frequency English tokens; a bare `low_freq_en`
# would dump the entire list into the notebook output.
low_freq_en[:10]

Unnamed: 0,English,French,English_WordCount,French_WordCount,E,F,Threshold
81,"In my opinion, this second hypothesis would im...","À mon avis, cette deuxième hypothèse signifier...",105,104,1,1,2
341,"For this reason, one of the most important and...",C'est la raison pour laquelle un des objectifs...,101,133,1,1,2
516,I agree with the rapporteur that unfortunately...,"Comme le rapporteur, j'estime que le document ...",77,115,0,1,1
548,As far as I am concerned - taking into account...,En ce qui me concerne - compte tenu de l' idée...,93,105,0,1,1
553,"However, I do wish to mention - since you have...",Vous vous êtes inquiétés de ce qui pourrait ap...,109,126,1,1,2
624,"Given this situation, the report approved by P...","Face à cette situation, le rapport adopté par ...",71,120,0,1,1
733,"I wish to assure you, firstly, that the mergin...",Je veux tout d'abord vous assurer que la fusio...,115,119,1,1,2
1260,"To conclude, I would like to tell you what I a...","Pour conclure, je voudrais dire ce que j'atten...",111,117,1,1,2
1362,"What we cannot do, Commissioner - and I would ...","Ce que nous ne pouvons pas faire, Monsieur le ...",130,138,1,1,2
1864,"We could then, theoretically, also support the...",Nous pourrions alors en principe répondre même...,99,118,0,1,1


In [36]:
# Re-read the corpus and keep only the sentence pairs in which BOTH sides have
# at most max_words single-space-separated pieces, then write the surviving
# pairs to files tagged with the limit.
max_words=100
en_text=[]
fr_text=[]
with open('data/europarl-v7.fr-en.en','rt',encoding='utf-8') as f1,open('data/europarl-v7.fr-en.fr','rt',encoding='utf-8') as f2:
    for line_en,line_fr in zip(f1,f2):
        # A line with k space characters splits into k+1 pieces.
        if line_en.count(' ')+1>max_words or line_fr.count(' ')+1>max_words:
            continue
        en_text.append(line_en)
        fr_text.append(line_fr)
for suffix,kept_lines in (('.en',en_text),('.fr',fr_text)):
    with open('data/europarl-v7.fr-en_'+str(max_words)+suffix,'wt',encoding='utf-8') as f:
        f.writelines(kept_lines)