In [41]:
import string
import re
from pickle import dump
from unicodedata import normalize
from pickle import load
from pickle import dump
from collections import Counter
import nltk
from nltk.stem.snowball import FrenchStemmer #import the French stemming library
from nltk.corpus import stopwords #import stopwords from nltk corpus
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [42]:
# load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

# split a loaded document into sentences
def to_sentences(doc):
    """Split a document into one string per line.

    Leading and trailing whitespace of the whole document is stripped first,
    then the text is cut on '\\n'. An empty document yields [''].
    """
    trimmed = doc.strip()
    return trimmed.split('\n')

# clean a list of lines
def clean_lines(lines):
    """Normalise each line into lower-case, space-separated alphabetic tokens.

    For every line: decompose unicode (NFD) and drop anything that is not
    ASCII, split on whitespace, lower-case, strip punctuation and
    non-printable characters, and discard tokens that are not purely
    alphabetic (e.g. tokens containing digits, or tokens left empty).

    Args:
        lines: Iterable of raw sentence strings.

    Returns:
        A list with one cleaned string per input line (possibly empty).
    """
    # matches any character outside string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _clean_one(raw_line):
        # NFD splits accented letters into base letter + combining mark;
        # the ASCII round-trip then drops the marks and any other non-ASCII.
        ascii_bytes = normalize('NFD', raw_line).encode('ascii', 'ignore')
        ascii_text = ascii_bytes.decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(drop_punctuation)
            token = non_printable.sub('', token)
            # also rejects tokens that became empty after stripping
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return [_clean_one(raw_line) for raw_line in lines]
# save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
    """Pickle a list of cleaned sentences to disk and report the file name.

    Args:
        sentences: Object to serialise (a list of sentence strings here).
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    # Close (and flush) the file deterministically instead of leaving the
    # handle to the garbage collector.
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)
    print('Saved: %s' % filename)

In [83]:
# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled object (the cleaned sentence list) from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this notebook or another trusted source.
    with open(filename, 'rb') as in_file:
        return load(in_file)

# create a frequency table for all words
def to_vocab(lines):
    """Count how often each whitespace-separated token occurs in `lines`.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A Counter mapping token -> number of occurrences.
    """
    return Counter(
        token
        for sentence in lines
        for token in sentence.split()
    )


# Remove stop words from English
def removeStopWordsEnglish(lines):
    """Count the tokens in `lines` that are not NLTK English stop words.

    Tokens are obtained by whitespace splitting and compared verbatim
    against the stop-word list.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A Counter mapping each non-stop-word token to its frequency.
    """
    excluded = set(stopwords.words('english'))
    counts = Counter()
    for sentence in lines:
        counts.update(token for token in sentence.split() if token not in excluded)
    return counts

def removeStopWordsFrench(lines):
    """Count the tokens in `lines` that are not NLTK French stop words.

    Tokens are obtained by whitespace splitting and compared verbatim
    against the stop-word list.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A Counter mapping each non-stop-word token to its frequency.
    """
    french_stop_words = set(stopwords.words('french'))
    kept_tokens = (
        token
        for sentence in lines
        for token in sentence.split()
        if token not in french_stop_words
    )
    return Counter(kept_tokens)

# Remove stop words from Romanian
def removeStopWordsRomanian(lines):
    """Count the tokens in `lines` that are not NLTK Romanian stop words.

    Tokens are obtained by whitespace splitting and compared verbatim
    against the stop-word list.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A Counter mapping each non-stop-word token to its frequency.
    """
    stopWords = set(stopwords.words('romanian'))
    vocab = Counter()
    for line in lines:
        # per-line filter; no accumulator is needed outside the loop
        wordsFiltered = [w for w in line.split() if w not in stopWords]
        vocab.update(wordsFiltered)
    return vocab

# remove all words with a frequency below a threshold
def trim_vocab(vocab, min_occurance):
    """Return the set of words whose count is at least `min_occurance`.

    Args:
        vocab: Mapping of word -> frequency (e.g. a Counter).
        min_occurance: Inclusive lower bound on the frequency to keep.

    Returns:
        A set containing the surviving words.
    """
    return {word for word, count in vocab.items() if count >= min_occurance}

# mark all OOV with "unk" for all lines
def update_dataset(lines, vocab):
    """Replace every out-of-vocabulary token with the literal 'unk'.

    Args:
        lines: Iterable of sentence strings (tokens separated by whitespace).
        vocab: Container of known tokens, tested with `in`.

    Returns:
        A new list of sentences, one per input line, with tokens re-joined
        by single spaces.
    """
    return [
        ' '.join(token if token in vocab else 'unk' for token in sentence.split())
        for sentence in lines
    ]

In [84]:
# load English data (English side of the en-ro corpus)
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as '\I' or '\L', which Python deprecates.
filename = r'K:\IIT Life\Lecture Notes\SEM 2 - OSNA DAA ADM\ADM\Project Reports\Europal DS\en-ro.txt\Europarl.en-ro.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'english.pkl')

# reload the cleaned English sentences from the pickle
filename = 'english.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('English Vocabulary: %d' % len(vocab))
print('English Sentences: %d' % len(lines))

# removing stop words from vocabulary (the sentences themselves are unchanged)
vocab = removeStopWordsEnglish(lines)
print('### After removing the stop words ###')
print('English Vocabulary: %d' % len(vocab))
print('English Sentences: %d' % len(lines))

# reduce vocabulary: keep only words that occur at least 5 times
vocab = trim_vocab(vocab, 5)
print('### After Pruning the data ###')
print('New English Vocabulary: %d' % len(vocab))
# mark out of vocabulary words; stop words are no longer in vocab, so they
# are rewritten to 'unk' as well
lines = update_dataset(lines, vocab)
print('New English Sentences: %d' % len(lines))
# save updated dataset
filename = 'english_vocab.pkl'
save_clean_sentences(lines, filename)

Saved: english.pkl
English Vocabulary: 46454
English Sentences: 399042
### After removing the stop words ###
English Vocabulary: 46318
English Sentences: 399042
### After Pruning the data ###
New English Vocabulary: 19981
New English Sentences: 399042
Saved: english_vocab.pkl


In [85]:
# load Romanian data (Romanian side of the en-ro corpus)
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as '\I' or '\L', which Python deprecates.
filename = r'K:\IIT Life\Lecture Notes\SEM 2 - OSNA DAA ADM\ADM\Project Reports\Europal DS\en-ro.txt\Europarl.en-ro.ro'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'roman.pkl')

# reload the cleaned Romanian sentences from the pickle
filename = 'roman.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('Roman Vocabulary: %d' % len(vocab))
print('Roman Sentences: %d' % len(lines))

# removing stop words from vocabulary (the sentences themselves are unchanged)
vocab = removeStopWordsRomanian(lines)
print('### After removing the stop words ###')
print('Roman Vocabulary: %d' % len(vocab))
print('Roman Sentences: %d' % len(lines))

# reduce vocabulary: keep only words that occur at least 5 times
vocab = trim_vocab(vocab, 5)
print('### After Pruning the data ###')
print('New Roman Vocabulary: %d' % len(vocab))
# mark out of vocabulary words; stop words are no longer in vocab, so they
# are rewritten to 'unk' as well
lines = update_dataset(lines, vocab)
# report the sentence count after the update, as the other language cells do
print('New Roman Sentences: %d' % len(lines))
# save updated dataset
filename = 'roman_vocab.pkl'
save_clean_sentences(lines, filename)

Saved: roman.pkl
Roman Vocabulary: 71020
Roman Sentences: 399042
### After removing the stop words ###
Roman Vocabulary: 70759
Roman Sentences: 399042
### After Pruning the data ###
New Roman Vocabulary: 30865
New Roman Sentences: 399042
Saved: roman_vocab.pkl


In [86]:
# load English data (English side of the en-fr corpus)
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as '\I' or '\L', which Python deprecates.
filename = r'K:\IIT Life\Lecture Notes\SEM 2 - OSNA DAA ADM\ADM\Project Reports\Europal DS\en-fr.txt\Europarl.en-fr.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'english_french.pkl')

# reload the cleaned English sentences from the pickle
filename = 'english_french.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('English Vocabulary: %d' % len(vocab))
print('English Sentences: %d' % len(lines))

# removing stop words from vocabulary (the sentences themselves are unchanged)
vocab = removeStopWordsEnglish(lines)
print('### After removing the stop words ###')
print('English Vocabulary: %d' % len(vocab))
print('English Sentences: %d' % len(lines))

# reduce vocabulary: keep only words that occur at least 5 times
vocab = trim_vocab(vocab, 5)
print('### After Pruning the data ###')
print('New English Vocabulary: %d' % len(vocab))
# mark out of vocabulary words; stop words are no longer in vocab, so they
# are rewritten to 'unk' as well
lines = update_dataset(lines, vocab)
print('New English Sentences: %d' % len(lines))
# save updated dataset
filename = 'english_french_vocab.pkl'
save_clean_sentences(lines, filename)

Saved: english_french.pkl
English Vocabulary: 106227
English Sentences: 2049662
### After removing the stop words ###
English Vocabulary: 106080
English Sentences: 2049662
### After Pruning the data ###
New English Vocabulary: 41913
New English Sentences: 2049662
Saved: english_french_vocab.pkl


In [87]:
# load French data (French side of the en-fr corpus)
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as '\I' or '\L', which Python deprecates.
filename = r'K:\IIT Life\Lecture Notes\SEM 2 - OSNA DAA ADM\ADM\Project Reports\Europal DS\en-fr.txt\Europarl.en-fr.fr'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'french_english.pkl')

# reload the cleaned French sentences from the pickle
filename = 'french_english.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('French Vocabulary: %d' % len(vocab))
print('French Sentences: %d' % len(lines))

# removing stop words from vocabulary (the sentences themselves are unchanged)
vocab = removeStopWordsFrench(lines)
print('### After removing the stop words ###')
print('French Vocabulary: %d' % len(vocab))
print('French Sentences: %d' % len(lines))

# reduce vocabulary: keep only words that occur at least 5 times
vocab = trim_vocab(vocab, 5)
print('### After Pruning the data ###')
print('New French Vocabulary: %d' % len(vocab))
# mark out of vocabulary words; stop words are no longer in vocab, so they
# are rewritten to 'unk' as well
lines = update_dataset(lines, vocab)
print('New French Sentences: %d' % len(lines))
# save updated dataset
filename = 'french_english_vocab.pkl'
save_clean_sentences(lines, filename)

Saved: french_english.pkl
French Vocabulary: 142691
French Sentences: 2049662
### After removing the stop words ###
French Vocabulary: 142562
French Sentences: 2049662
### After Pruning the data ###
New French Vocabulary: 59138
New French Sentences: 2049662
Saved: french_english_vocab.pkl


In [75]:
# Frequency table of the words that are NOT English stop words.
# NOTE: this cell redefines a function that is also defined, with an identical
# body, in an earlier cell of this notebook.
def removeStopWordsEnglish(lines):
    """Count the whitespace-separated tokens in `lines` that are not NLTK English stop words."""
    excluded = set(stopwords.words('english'))
    counts = Counter()
    for sentence in lines:
        counts.update(token for token in sentence.split() if token not in excluded)
    return counts

In [77]:
# Quick check on raw (uncleaned) sentences: tokens are only split on whitespace,
# so punctuation stays attached to the counted token (see 'world!' in the output).
removeStopWordsEnglish(['this is my world! Huahahaha is my','this is name pranav', 'my name is pranav makkar'])

Counter({'Huahahaha': 1, 'makkar': 1, 'name': 2, 'pranav': 2, 'world!': 1})

In [78]:
# Frequency table of the words that are NOT French stop words.
# NOTE: this cell redefines a function that is also defined, with an identical
# body, in an earlier cell of this notebook.
def removeStopWordsFrench(lines):
    """Count the whitespace-separated tokens in `lines` that are not NLTK French stop words."""
    french_stop_words = set(stopwords.words('french'))
    kept_tokens = (
        token
        for sentence in lines
        for token in sentence.split()
        if token not in french_stop_words
    )
    return Counter(kept_tokens)

In [79]:
# Quick check on a raw (uncleaned) sentence: tokens are only split on whitespace,
# so punctuation stays attached to the counted token (see 'monde!' in the output).
removeStopWordsFrench(['Ceci est mon monde! Huahahaha est mon'])

Counter({'Ceci': 1, 'Huahahaha': 1, 'monde!': 1})

In [80]:
def removeStopWordsRomanian(lines):
    """Count the tokens in `lines` that are not NLTK Romanian stop words.

    Tokens are obtained by whitespace splitting and compared verbatim
    against the stop-word list.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A Counter mapping each non-stop-word token to its frequency.
    """
    stopWords = set(stopwords.words('romanian'))
    vocab = Counter()
    for line in lines:
        # per-line filter; no accumulator is needed outside the loop
        wordsFiltered = [w for w in line.split() if w not in stopWords]
        vocab.update(wordsFiltered)
    return vocab

In [82]:
# Quick check on raw (uncleaned) sentences: tokens are only split on whitespace,
# so punctuation stays attached to the counted token (see 'președinte,' in the output).
removeStopWordsRomanian(['Dle președinte, permiteți-mi să vă pun o întrebare','Numele meu este pranav Makkar','Numele este pranav Makkar'])

Counter({'Dle': 1,
         'Makkar': 2,
         'Numele': 2,
         'permiteți-mi': 1,
         'pranav': 2,
         'președinte,': 1,
         'pun': 1,
         'întrebare': 1})