In [1]:
import pandas as pd
import re
from random import randint
from collections import Counter
from tqdm import tqdm

#### Regex that splits text into words and treats each punctuation mark as a separate token

In [2]:
# Tokenizer pattern: either a run of word characters, or one single character
# that is neither a word character nor whitespace (so every punctuation mark
# becomes its own token).
split_condition = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)

#### Read the raw data from the corpus files into a DataFrame

In [3]:
# Load both sides of the corpus, one list entry per line of the file.
# Iterating a text file yields each line with its trailing newline kept, so
# list(...) gives the same entries as appending line by line.
with open('data/europarl-v7.fr-en.en','rt',encoding='utf-8') as en_file:
    en_text = list(en_file)
with open('data/europarl-v7.fr-en.fr','rt',encoding='utf-8') as fr_file:
    fr_text = list(fr_file)

In [30]:
# Write the first `small_size` lines of each side of the corpus to separate
# "_small" files for quick experiments.
# The sample lives in its own variables: reassigning en_text / fr_text here
# would leave only 100 lines for the DataFrame built from them further down
# when the notebook is run top to bottom.
small_size=100
en_small=en_text[:small_size]
fr_small=fr_text[:small_size]
with open('data/europarl-v7.fr-en_small.en','wt',encoding='utf-8') as f:
    f.writelines(en_small)  # lines already end in '\n', so nothing is added
with open('data/europarl-v7.fr-en_small.fr','wt',encoding='utf-8') as f:
    f.writelines(fr_small)

In [4]:
# One row per corpus line: row i holds line i of the English file and
# line i of the French file.
df = pd.DataFrame({'English': en_text, 'French': fr_text})

In [5]:
# (number of rows, number of columns) of the corpus frame
df.shape

(2007723, 2)

#### Count the number of words in each row

In [6]:
def get_number_of_words(text,reexp=None):
    """Count the whitespace-separated words in ``text``.

    The text is split on runs of whitespace rather than on single spaces, so
    an empty or whitespace-only string counts as 0 words, repeated spaces do
    not produce empty "words", and a trailing newline is ignored.

    Parameters
    ----------
    text : str
        The sentence to count.
    reexp : optional
        Not used by the count; accepted so that existing calls which pass a
        compiled pattern keep working.

    Returns
    -------
    int
        Number of whitespace-separated tokens in ``text``.
    """
    return len(text.split())

In [7]:
# Add a word-count column for each language; the compiled pattern is passed
# through as the function's second positional argument.
for text_col,count_col in (('English','English_WordCount'),('French','French_WordCount')):
    df[count_col]=df[text_col].apply(get_number_of_words,args=(split_condition,))
df.head()

Unnamed: 0,English,French,English_WordCount,French_WordCount
0,Resumption of the session\n,Reprise de la session\n,4,4
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,38,33
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog...",31,37
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,19,19
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...",40,38


In [8]:
# Print one randomly chosen sentence pair, each followed by its tokenization.
# randint includes both endpoints, so the upper bound has to be the last valid
# row position (row count - 1); passing the row count itself can pick an index
# one past the end and make df.iloc raise IndexError.
i=randint(0,df.shape[0]-1)
row=df.iloc[i]  # look the row up once instead of once per print
print(row['English'])
print(split_condition.findall(row['English']))
print(row['French'])
print(split_condition.findall(row['French']))

Mr Prodi talks solely of 'mechanisms' and 'structures' and fails to set out a single balanced project which responds to the aspirations of our people.

['Mr', 'Prodi', 'talks', 'solely', 'of', "'", 'mechanisms', "'", 'and', "'", 'structures', "'", 'and', 'fails', 'to', 'set', 'out', 'a', 'single', 'balanced', 'project', 'which', 'responds', 'to', 'the', 'aspirations', 'of', 'our', 'people', '.']
R. Prodi ne parle que de "mechanisms", de "structures", mais ne définit aucun projet équilibré et répondant aux aspirations de nos peuples.

['R', '.', 'Prodi', 'ne', 'parle', 'que', 'de', '"', 'mechanisms', '"', ',', 'de', '"', 'structures', '"', ',', 'mais', 'ne', 'définit', 'aucun', 'projet', 'équilibré', 'et', 'répondant', 'aux', 'aspirations', 'de', 'nos', 'peuples', '.']


In [9]:
# Flag over-long rows: E and F are 0/1 indicators that the English / French
# word count is above `threshold`; Threshold is their sum (0, 1 or 2).
threshold=100
df['E']=df['English_WordCount'].gt(threshold).astype('int64')
df['F']=df['French_WordCount'].gt(threshold).astype('int64')
df['Threshold']=df['E']+df['F']
df.head()

Unnamed: 0,English,French,English_WordCount,French_WordCount,E,F,Threshold
0,Resumption of the session\n,Reprise de la session\n,4,4,0,0,0
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,38,33,0,0,0
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog...",31,37,0,0,0
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,19,19,0,0,0
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...",40,38,0,0,0


In [10]:
# Rows where at least one language is above the threshold, printed after the
# total number of rows.
temp=df.loc[df['Threshold'].gt(0)]
print(len(df),len(temp))

2007723 4815


### Count how often each word occurs in each language

In [11]:
# Tally how often each lower-cased token occurs across all English sentences.
english_words=Counter()
for sentence in tqdm(df['English'].tolist()):
    # update() adds one count per token in the list
    english_words.update(split_condition.findall(sentence.lower()))

100%|█████████████████████████████████████████████████████████████████████| 2007723/2007723 [00:39<00:00, 50601.94it/s]


In [12]:
# Tally how often each lower-cased token occurs across all French sentences.
french_words=Counter()
for sentence in tqdm(df['French'].tolist()):
    # update() adds one count per token in the list
    french_words.update(split_condition.findall(sentence.lower()))

100%|█████████████████████████████████████████████████████████████████████| 2007723/2007723 [00:51<00:00, 39171.58it/s]


In [13]:
# number of distinct (lower-cased) English tokens
len(english_words)

84413

In [14]:
# number of distinct (lower-cased) French tokens
len(french_words)

115410

In [19]:
# Collect, per language, the tokens that occur at most `limit` times.
limit=10
low_freq_en=[token for token,count in english_words.items() if count<=limit]
low_freq_fr=[token for token,count in french_words.items() if count<=limit]

In [20]:
# Number of low-frequency tokens: English first, then French.
for rare_tokens in (low_freq_en,low_freq_fr):
    print(len(rare_tokens))

55399
76039


In [21]:
# English tokens that occur more than `limit` times
len(english_words)-len(low_freq_en)

29014

In [22]:
# French tokens that occur more than `limit` times
len(french_words)-len(low_freq_fr)

39371

In [36]:
# Re-read both corpus files in lockstep and keep only the line pairs in which
# both sides have at most `max_words` space-separated pieces, then write the
# kept pairs to new files whose names carry the limit.
max_words=100
en_text=[]
fr_text=[]
with open('data/europarl-v7.fr-en.en','rt',encoding='utf-8') as en_in, \
     open('data/europarl-v7.fr-en.fr','rt',encoding='utf-8') as fr_in:
    for line_en,line_fr in zip(en_in,fr_in):
        # drop the pair as soon as either side is over the limit
        if len(line_en.split(' '))>max_words or len(line_fr.split(' '))>max_words:
            continue
        en_text.append(line_en)
        fr_text.append(line_fr)

out_prefix='data/europarl-v7.fr-en_'+str(max_words)
with open(out_prefix+'.en','wt',encoding='utf-8') as f:
    f.writelines(en_text)  # lines keep their own '\n'
with open(out_prefix+'.fr','wt',encoding='utf-8') as f:
    f.writelines(fr_text)