In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import os
import numpy as np

In [2]:
# Shared WordNet lemmatizer instance; the tokenize_* functions below call
# lemmatizer.lemmatize() on every token they keep.
lemmatizer = WordNetLemmatizer()
# English stop-word list, stored as a set so the membership test in
# remove_stop_words is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

In [3]:
# Read and pre-process training dataset

def standardize_punctuations(text):
    """Normalize typographic punctuation in ``text`` to plain ASCII forms.

    Curly double quotes become ``"``, the curly apostrophe becomes ``'``
    and a double hyphen ``--`` becomes a comma. The substitutions are
    applied one after another in the order listed below.

    Returns the converted string.
    """
    substitutions = (
        ('“', '"'),
        ('”', '"'),
        ('’', "'"),
        ('--', ','),
    )
    normalized = text
    for old, new in substitutions:
        normalized = normalized.replace(old, new)
    return normalized

def remove_stop_words(tokenized_sentence):
    """Return the tokens of ``tokenized_sentence`` that are not stop words.

    Membership is tested against the module-level ``stop_words``
    collection. Token order is preserved and a new list is returned; the
    input is left unmodified.
    """
    return [token for token in tokenized_sentence if token not in stop_words]

def get_data(input_path):
    """Read a text file and return its contents as lowercased sentences.

    The whole file is read, lowercased, passed through
    ``standardize_punctuations`` and then split with NLTK's
    ``sent_tokenize``.

    Args:
        input_path: Path of the text file to read.

    Returns:
        The list of sentence strings produced by ``sent_tokenize``.
    """
    # Decode explicitly as UTF-8: standardize_punctuations looks for curly
    # quotes, which only survive decoding when the codec is right. Relying
    # on the platform default (e.g. cp1252 on Windows) would garble them.
    with open(input_path, encoding='utf-8') as read_handle:
        text = read_handle.read()
    # Processing needs no open handle, so it happens after the file closes.
    text = standardize_punctuations(text.lower())
    return sent_tokenize(text)
     
def get_data_split(input_path):
    """Load sentences from ``input_path`` and split them 85% / 15%.

    The sentences come from ``get_data`` and are split in file order (no
    shuffling). The cut index is ``int(0.85 * number_of_sentences)``.

    Returns:
        A ``(first_part, second_part)`` tuple of sentence lists, where
        ``first_part`` holds the leading 85% and ``second_part`` the rest.
    """
    sentences = get_data(input_path)
    split_at = int(0.85 * len(sentences))
    leading = sentences[:split_at]
    trailing = sentences[split_at:]
    return leading, trailing
     
def tokenize_text(sentence_data, tokenized_path):
    """Write one comma-separated line of cleaned tokens per sentence.

    Each sentence has punctuation stripped, is split with
    ``word_tokenize``, filtered through ``remove_stop_words`` and
    lemmatized. A sentence that ends up with no tokens is skipped. Every
    written line ends with the literal ``[END]`` marker.

    Args:
        sentence_data: Iterable of sentence strings.
        tokenized_path: Path of the output file; it is overwritten.
    """
    # The hyphen is escaped so it is a literal character. Unescaped, "@-`"
    # is a character *range* (U+0040..U+0060) that silently deletes every
    # uppercase letter plus "[", "\", "]", "^" and "_", and never removes
    # "-" itself. Compiled once, outside the loop.
    punctuation = re.compile(r'[?!:.;,#@\-`()]')
    with open(tokenized_path, 'w') as write_handle:
        for sentence in sentence_data:
            cleaned = punctuation.sub('', sentence)
            tokens = remove_stop_words(word_tokenize(cleaned))
            lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
            if lemmatized_words:
                lemmatized_words.append('[END]')
                write_handle.write(','.join(lemmatized_words))
                write_handle.write('\n')

def tokenize_text_unk(sentence_data, tokenized_path, vocab):
    """Write cleaned tokens per sentence, masking out-of-vocabulary words.

    Each sentence has punctuation stripped, is split with
    ``word_tokenize``, filtered through ``remove_stop_words`` and
    lemmatized. Any lemma not found in ``vocab`` is replaced with
    ``<UNK>``. A sentence that ends up with no tokens is skipped. Every
    written line is comma-separated and ends with the literal ``[END]``
    marker.

    Args:
        sentence_data: Iterable of sentence strings.
        tokenized_path: Path of the output file; it is overwritten.
        vocab: Container of known words, used with ``in``.
    """
    # The hyphen is escaped so it is a literal character. Unescaped, "@-`"
    # is a character *range* (U+0040..U+0060) that silently deletes every
    # uppercase letter plus "[", "\", "]", "^" and "_", and never removes
    # "-" itself. Compiled once, outside the loop.
    punctuation = re.compile(r'[?!:.;,#@\-`()]')
    with open(tokenized_path, 'w') as write_handle:
        for sentence in sentence_data:
            cleaned = punctuation.sub('', sentence)
            tokens = remove_stop_words(word_tokenize(cleaned))
            lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
            tokenized_words = [
                word if word in vocab else "<UNK>"
                for word in lemmatized_words
            ]
            if tokenized_words:
                tokenized_words.append('[END]')
                write_handle.write(','.join(tokenized_words))
                write_handle.write('\n')

def tokenize_with_threshold(sentence_data, tokenized_path, frequencies, threshold):
    """Write cleaned tokens per sentence, masking rare words, and return the vocab.

    Each sentence has punctuation stripped, is split with
    ``word_tokenize``, filtered through ``remove_stop_words`` and
    lemmatized. A lemma whose count in ``frequencies`` is below
    ``threshold`` is replaced with ``<UNK>``; so is a lemma missing from
    ``frequencies``. A sentence that ends up with no tokens is skipped.
    Every written line is comma-separated and ends with the literal
    ``[END]`` marker.

    Args:
        sentence_data: Iterable of sentence strings.
        tokenized_path: Path of the output file; it is overwritten.
        frequencies: Mapping of word -> count.
        threshold: Minimum count for a word to be kept as itself.

    Returns:
        The set of words in ``frequencies`` whose count is at least
        ``threshold``, plus ``"<UNK>"``.
    """
    # The hyphen is escaped so it is a literal character. Unescaped, "@-`"
    # is a character *range* (U+0040..U+0060) that silently deletes every
    # uppercase letter plus "[", "\", "]", "^" and "_", and never removes
    # "-" itself. Compiled once, outside the loop.
    punctuation = re.compile(r'[?!:.;,#@\-`()]')
    with open(tokenized_path, 'w') as write_handle:
        for sentence in sentence_data:
            cleaned = punctuation.sub('', sentence)
            tokens = remove_stop_words(word_tokenize(cleaned))
            lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
            # .get(word, 0) counts an unseen word as frequency 0. Indexing
            # would raise KeyError on a plain dict, and would insert the
            # key as a side effect on a defaultdict.
            tokenized_words = [
                word if frequencies.get(word, 0) >= threshold else "<UNK>"
                for word in lemmatized_words
            ]
            if tokenized_words:
                tokenized_words.append('[END]')
                write_handle.write(','.join(tokenized_words))
                write_handle.write('\n')
    vocab = {word for word, freq in frequencies.items() if freq >= threshold}
    vocab.add("<UNK>")
    return vocab