In [2]:
import pandas as pd

# LOAD DATASET

In [23]:
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: path of the text file to read.

    Returns:
        The full contents of the file as a single str.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename,mode='rt',encoding='utf-8') as file:
        return file.read()
def to_sentences(doc):
    """Split a document into sentences, one per newline-separated line.

    Leading and trailing whitespace of the whole document is removed first,
    so blank lines at either end are dropped; blank lines in the middle are
    kept as empty strings.
    """
    trimmed=doc.strip()
    return trimmed.split('\n')
def sentence_lengths(sentences):
    """Return (shortest, longest) sentence length in whitespace-separated tokens.

    Raises ValueError (from min/max) when `sentences` is empty.
    """
    word_counts=[]
    for sentence in sentences:
        word_counts.append(len(sentence.split()))
    shortest=min(word_counts)
    longest=max(word_counts)
    return shortest,longest

In [6]:
# English half of the corpus: read the whole file and split it into one
# sentence per line.
filename='europarl-v7.fr-en.en'
doc=load_doc(filename)
sentences=to_sentences(doc)

# Report the sentence count plus the shortest and longest sentence, measured
# in whitespace-separated tokens.
minlen,maxlen=sentence_lengths(sentences)
print('english data: sentences=%d, min=%d, max=%d'%(len(sentences),minlen,maxlen))


# French half: same steps. This rebinds filename, doc, sentences, minlen and
# maxlen, so after the cell runs they describe the French file only.
filename='europarl-v7.fr-en.fr'
doc=load_doc(filename)
sentences=to_sentences(doc)

minlen,maxlen=sentence_lengths(sentences)
print('french data: sentences=%d, min=%d, max=%d'%(len(sentences),minlen,maxlen))

english data: sentences=2007723, min=0, max=668
french data: sentences=2007723, min=0, max=693


# CLEAN DATASET

In [20]:
def clean_lines(lines):
    """Normalise sentences into lowercase, purely alphabetic tokens.

    For every sentence the function:
      1. decomposes accented characters (NFD) and drops everything that is
         not ASCII, so e.g. 'é' becomes 'e';
      2. splits on whitespace into tokens;
      3. lower-cases each token;
      4. removes ASCII punctuation from each token;
      5. removes any remaining non-printable characters;
      6. discards tokens that are not purely alphabetic (numbers, empties).

    Relies on the module-level names `re`, `string` and
    `unicodedata.normalize` being imported before the function is called.

    Args:
        lines: iterable of str, one sentence per element.

    Returns:
        list of str with one cleaned sentence per input sentence, tokens
        joined by single spaces (an empty string when no token survives).
    """
    cleaned=list()
    re_print=re.compile('[^%s]' % re.escape(string.printable))#prepare regex for char filtering
    table=str.maketrans('','',string.punctuation)#prepare translation table for removing punctuation
    for line in lines:
        # Strip accents: decompose, then drop the non-ASCII combining marks.
        line=normalize('NFD',line).encode('ascii','ignore')
        line=line.decode('UTF-8')
        # Tokenise on whitespace; every later step works on the token list,
        # not on the characters of the sentence string.
        line=line.split()
        line=[word.lower() for word in line]
        line=[word.translate(table) for word in line]#remove punctuation from each token
        line=[re_print.sub('',w) for w in line]#remove non-printable characters from each token
        line=[word for word in line if word.isalpha()]#remove tokens with number in them
        cleaned.append(' '.join(line))
    return cleaned

In [26]:
# Now we will save the lists of clean lines directly in binary format using pickle.
import string 
import re
from pickle import dump
from unicodedata import normalize

def save_clean_sentences(sentences,filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: the object to serialise (a list of sentence strings here).
        filename: destination path; an existing file is overwritten.
    """
    # Use a context manager so the data is flushed and the handle closed
    # deterministically, even if dump() raises.
    with open(filename,'wb') as file:
        dump(sentences,file)
    print('Saved: %s'%filename)
    
# English side: read the file, split it into sentences, pickle the list and
# print the first ten entries as a spot check.
# NOTE(review): clean_lines() is never applied in this cell, so the pickle
# holds the raw sentences (original case and punctuation) — confirm whether
# `sentences=clean_lines(sentences)` was intended before saving.
filename='europarl-v7.fr-en.en'
doc=load_doc(filename)
sentences=to_sentences(doc)
save_clean_sentences(sentences,'english.pkl')
for i in range(10):
    print(sentences[i])

# Blank separator between the English and French spot checks.
print('\n')

# French side: same steps; rebinds filename, doc and sentences.
# NOTE(review): as above, the sentences are saved without cleaning.
filename='europarl-v7.fr-en.fr'
doc=load_doc(filename)
sentences=to_sentences(doc)
save_clean_sentences(sentences,'french.pkl')
for i in range(10):
    print(sentences[i])

Saved: english.pkl
Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
You have requested a debate on this subject in the course of the next few days, during this part-session.
In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
Please rise, then, for this minute' s silence.
(The House rose and observed a minute' s silence)
Madam President, on a point of order.
You will be aware from the press and television that there have been a number of 

# REDUCE VOCABULARY

In [39]:

from pickle import load
from pickle import dump
from collections import Counter

# load a clean dataset
def load_clean_sentences(filename):
    """Load and return the object pickled in `filename`.

    Args:
        filename: path of a pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    # The context manager closes the handle even when load() raises.
    with open(filename, 'rb') as file:
        return load(file)

# save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line.

    Args:
        sentences: the object to serialise (a list of sentence strings here).
        filename: destination path; an existing file is overwritten.
    """
    # Use a context manager so the data is flushed and the handle closed
    # deterministically, even if dump() raises.
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

# create a frequency table for all words
def to_vocab(lines):
    """Count how often each whitespace-separated token occurs across `lines`.

    Returns a collections.Counter mapping token -> number of occurrences.
    """
    return Counter(token for sentence in lines for token in sentence.split())

# remove all words with a frequency below a threshold
def trim_vocab(vocab, min_occurance):
    """Return the set of words whose count is at least `min_occurance`.

    `vocab` is a mapping of word -> count (e.g. the Counter built by to_vocab).
    """
    return {word for word, count in vocab.items() if count >= min_occurance}

# mark all OOV with "unk" for all lines
def update_dataset(lines, vocab):
    """Replace every out-of-vocabulary token with the marker 'unk'.

    Each line is split on whitespace; tokens found in `vocab` are kept, all
    others become 'unk'. Tokens are re-joined with single spaces and a new
    list is returned with one entry per input line.
    """
    def mark(token):
        # Keep known tokens as they are, mask everything else.
        return token if token in vocab else 'unk'

    return [' '.join(mark(token) for token in line.split()) for line in lines]

# --- English ---
# load the pickled English sentences
filename = 'english.pkl'
lines = load_clean_sentences(filename)
# count every whitespace-separated token
vocab = to_vocab(lines)
print('English Vocabulary: %d' % len(vocab))
# keep only words that occur at least 5 times; vocab becomes a set here
vocab = trim_vocab(vocab, 5)
print('New English Vocabulary: %d' % len(vocab))
# replace every token outside the reduced vocabulary with 'unk'
lines = update_dataset(lines, vocab)
# save the updated sentences under a new file name
filename = 'english_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check: print the first ten updated sentences
for i in range(10):
    print(lines[i])

# --- French ---
# same pipeline; rebinds filename, lines and vocab, so after this cell they
# refer to the French data only
filename = 'french.pkl'
lines = load_clean_sentences(filename)
# count every whitespace-separated token
vocab = to_vocab(lines)
print('French Vocabulary: %d' % len(vocab))
# keep only words that occur at least 5 times; vocab becomes a set here
vocab = trim_vocab(vocab, 5)
print('New French Vocabulary: %d' % len(vocab))
# replace every token outside the reduced vocabulary with 'unk'
lines = update_dataset(lines, vocab)
# save the updated sentences under a new file name
filename = 'french_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check: print the first ten updated sentences
for i in range(10):
    print(lines[i])

English Vocabulary: 301951
New English Vocabulary: 86944
Saved: english_vocab.pkl
Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
Although, as you will have seen, the dreaded unk unk failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
You have requested a debate on this subject in the course of the next few days, during this part-session.
In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.
Please rise, then, for this minute' s silence.
(The House rose and observed a minute' s silence)
Madam President, on a point of order.
You will be aware from the 