In [47]:
import pandas as pd
import re
from random import randint
from collections import Counter
from tqdm import tqdm

#### Regex that splits text into words and treats each punctuation mark as a separate token

In [48]:
# Tokenizer pattern: a run of word characters, or one single character that is
# neither a word character nor whitespace (so each punctuation mark is its own token).
split_condition = re.compile(pattern=r"\w+|[^\w\s]", flags=re.UNICODE)

#### Read the raw data from the two corpus files into a DataFrame

In [49]:
# Load each side of the corpus as a list with one entry per line of the file.
# Lines are stored exactly as read, i.e. including their trailing newline.
with open('data/europarl-v7.fr-en.en','rt',encoding='utf-8') as f:
    en_text = f.readlines()
with open('data/europarl-v7.fr-en.fr','rt',encoding='utf-8') as f:
    fr_text = f.readlines()

In [50]:
# Put the two line lists side by side: row i holds line i of each file.
df = pd.DataFrame({'English': en_text, 'French': fr_text})

In [51]:
# Show (number of rows, number of columns) of the combined frame.
df.shape

(2007723, 2)

#### Count the number of tokens (words and punctuation marks) in each row

In [55]:
def get_number_of_words(text,reexp):
    """Return the number of matches of ``reexp`` in ``text``.

    Parameters
    ----------
    text : str
        The string to scan.
    reexp : compiled regular expression
        Pattern whose ``findall`` results are counted as words.

    Returns
    -------
    int
        How many matches ``reexp.findall(text)`` produced.
    """
    return len(reexp.findall(text))

In [56]:
# Per-row token counts for each language, using the shared tokenizer pattern.
# `args` forwards split_condition as the second positional argument of the helper.
df['English_WordCount'] = df['English'].apply(get_number_of_words, args=(split_condition,))
df['French_WordCount'] = df['French'].apply(get_number_of_words, args=(split_condition,))
df.head()

Unnamed: 0,English,French,English_WordCount,French_WordCount
0,Resumption of the session\n,Reprise de la session\n,4,4
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,40,34
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog...",37,49
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,23,21
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...",47,48


In [54]:
# Spot-check the tokenizer on one randomly chosen sentence pair.
# randint() includes BOTH endpoints, so the upper bound has to be the last
# valid row position; passing df.shape[0] could pick an index one past the
# end and make df.iloc[i] raise IndexError.
i = randint(0, df.shape[0] - 1)
row = df.iloc[i]
print(row['English'])
print(split_condition.findall(row['English']))
print(row['French'])
print(split_condition.findall(row['French']))

The bull contributes to the maintenance of pastures, it contributes to the protection of the environment and it contributes to rural development.

['The', 'bull', 'contributes', 'to', 'the', 'maintenance', 'of', 'pastures', ',', 'it', 'contributes', 'to', 'the', 'protection', 'of', 'the', 'environment', 'and', 'it', 'contributes', 'to', 'rural', 'development', '.']
Le taureau contribue à l'entretien des pâturages, à la protection de l'environnement et au développement rural.

['Le', 'taureau', 'contribue', 'à', 'l', "'", 'entretien', 'des', 'pâturages', ',', 'à', 'la', 'protection', 'de', 'l', "'", 'environnement', 'et', 'au', 'développement', 'rural', '.']


In [40]:
# Flag sentences whose token count exceeds the cutoff.
threshold=200
# Vectorized comparison instead of a per-row Python lambda; the boolean mask is
# cast to int64 so the columns hold 1 (over the threshold) or 0, as before.
df['E'] = (df['English_WordCount'] > threshold).astype('int64')
df['F'] = (df['French_WordCount'] > threshold).astype('int64')
# Number of sides (0, 1 or 2) of the pair that are over the threshold.
df['Threshold'] = df['E'] + df['F']
df.head()

Unnamed: 0,English,French,English_WordCount,French_WordCount,E,F,Threshold
0,Resumption of the session\n,Reprise de la session\n,4,4,0,0,0
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,40,34,0,0,0
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog...",37,49,0,0,0
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,23,21,0,0,0
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...",47,48,0,0,0


In [41]:
# Rows where at least one side of the pair is over the threshold,
# followed by the total row count and the number of such rows.
temp = df.loc[df['Threshold'] > 0]
print(len(df), len(temp))

2007723 351


### Count word frequencies

In [76]:
# Frequency of every lower-cased token across all English sentences.
english_words = Counter()
for sentence in tqdm(df['English'].tolist()):
    words = split_condition.findall(sentence.lower())
    # Counter.update adds one count per element of the iterable.
    english_words.update(words)
#english_words

100%|█████████████████████████████████████████████████████████████████████| 2007723/2007723 [00:39<00:00, 51392.61it/s]


In [77]:
# Frequency of every lower-cased token across all French sentences.
french_words = Counter()
for sentence in tqdm(df['French'].tolist()):
    words = split_condition.findall(sentence.lower())
    # Counter.update adds one count per element of the iterable.
    french_words.update(words)
#french_words

100%|█████████████████████████████████████████████████████████████████████| 2007723/2007723 [00:50<00:00, 39875.19it/s]


In [78]:
# Number of distinct lower-cased tokens counted in the English column.
len(english_words)

84413

In [79]:
# Number of distinct lower-cased tokens counted in the French column.
len(french_words)

115410

In [80]:
# Collect the tokens that occur at most `limit` times in each language.
# The cutoff now comes from `limit`; previously that variable was assigned but
# the comparisons hard-coded 1, so changing `limit` had no effect.
limit=1
low_freq_en = [word for word, count in english_words.items() if count <= limit]
low_freq_fr = [word for word, count in french_words.items() if count <= limit]

In [81]:
# Sizes of the English and French low-frequency lists, one per line.
print(len(low_freq_en), len(low_freq_fr), sep='\n')

27689
36554


In [83]:
# Display the low-frequency English tokens.
# NOTE(review): this echoes the entire list as cell output; consider showing
# only a slice (e.g. the first few entries) to keep the notebook readable.
low_freq_en

['kumar',
 'ponnambalam',
 'hicks',
 '9002',
 '40ºc',
 'characterisations',
 'infrastrucutres',
 'chapeau',
 'ventilators',
 'ideologue',
 'preussag',
 'gütesiegel',
 'xviiith',
 'eieck',
 'factortame',
 'vivienne',
 'forestier',
 'grönitz',
 'kartellamt',
 'oarsmen',
 'auctoritas',
 'parcours',
 'jettons',
 'bébé',
 'calvinists',
 'litigable',
 'corruptibility',
 'howitts',
 'guillemots',
 'noirmoutier',
 'javette',
 'besque',
 'marée',
 'morbihan',
 'floodtide',
 'gino',
 'tanio',
 'wilhelmshaven',
 'plimsoll',
 'neuwerk',
 'ecemis',
 'cadou',
 'btf',
 'betokens',
 'landis',
 'barnhill',
 'middelhoek',
 'innately',
 'coattails',
 'maroon',
 'godparent',
 'dunams',
 'ballasts',
 'shara',
 'megalomaniacs',
 'siècle',
 'nobodies',
 'frenzies',
 'veux',
 'quelque',
 'tangents',
 'solicitous',
 'consorted',
 'teratogenicity',
 'patriotisms',
 'exacts',
 'elst',
 'référendaire',
 'kinsmen',
 'instatement',
 'graa',
 'realité',
 'phew',
 'whack',
 'pharmaceutiques',
 'scientifiques',
 '5713