### Common preprocessing steps for language models

In [1]:
# Import the libraries: nltk for word tokenization, re for regex-based cleaning
import nltk
import re

# Fetch the 'punkt' tokenizer data used by nltk's tokenizers.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thakk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Lowercase

In [2]:
# Normalise case so that "Learning" and "learning" count as the same word
corpus = "Learning% makes 'me' happy. I am happy be-cause I am learning! :)"
corpus_lower = str.lower(corpus)

print(corpus_lower)

learning% makes 'me' happy. i am happy be-cause i am learning! :)


#### Remove the special characters

In [4]:
# Strip every character that is not a letter, a digit, a space,
# or sentence-ending punctuation (. ? !)
corpus = "learning% makes 'me' happy. i am happy be-cause i am learning! :)"
special_chars = re.compile(r"[^A-Za-z0-9.?! ]+")
corpus_clean = special_chars.sub("", corpus)

print(corpus_clean)

learning makes me happy. i am happy because i am learning! 


#### Text Splitting

In [8]:
# Raw timestamp string (note the two spaces in front of the day of month)
input_date = "Sat May  9 07:33:35 CEST 2020"

# Break the timestamp on single spaces; the double space leaves an
# empty string in the resulting list
date_ = input_date.split(sep=' ')
print(f"date parts = {date_}")

# The clock time sits at index 4 (after the empty entry); break it on colons
clock = date_[4]
time_ = clock.split(sep=':')
print(f"time parts = {time_}")

date parts = ['Sat', 'May', '', '9', '07:33:35', 'CEST', '2020']
time parts = ['07', '33', '35']


#### Sentence Tokenization

In [13]:
# Break the sentence into word-level tokens
sentence = 'i am happy because i am learning.'
tokenized_sentence = nltk.word_tokenize(sentence)
print(tokenized_sentence)

# Pair every token with its character count
word_length = list(zip(tokenized_sentence, map(len, tokenized_sentence)))
print(word_length)

['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
[('i', 1), ('am', 2), ('happy', 5), ('because', 7), ('i', 1), ('am', 2), ('learning', 8), ('.', 1)]


#### Sentence to N-gram

In [15]:
#Define a function to convert the tokenized sentence into a list of n-grams
def sentence_to_ngram(tokenized_sentence, n):
    """Return every contiguous run of ``n`` tokens in ``tokenized_sentence``.

    Args:
        tokenized_sentence: Sequence of tokens that supports ``len`` and
            slicing (e.g. a list of words).
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of slices of ``tokenized_sentence`` in sentence order. The
        list is empty when the sentence holds fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check n == 0
            yields a list of empty slices and negative n yields slices
            that are not n-grams at all.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")
    # Index of the last position at which a full n-gram can still start
    last_start = len(tokenized_sentence) - n
    return [tokenized_sentence[i:i + n] for i in range(last_start + 1)]
# Demo input: an already tokenized sentence (the final '.' is its own token)
tokenized_sentence = 'i am happy because i am learning .'.split()

print(f'List all trigrams of sentence: {tokenized_sentence}\n')
# Build the trigrams (n = 3) and show them
n_gram = sentence_to_ngram(tokenized_sentence, n=3)
print(n_gram)

List all trigrams of sentence: ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']

[['i', 'am', 'happy'], ['am', 'happy', 'because'], ['happy', 'because', 'i'], ['because', 'i', 'am'], ['i', 'am', 'learning'], ['am', 'learning', '.']]


In [16]:
# Sentence boundary markers used to pad a tokenized sentence
start_tag = '<s>'
end_tag = '</s>'

def add_tags(tokenized_sentence, n):
    """Return a new list: (n - 1) start tags, the tokens, then one end tag.

    The input list is not modified; list concatenation builds a fresh list.
    """
    leading = [start_tag] * (n - 1)
    trailing = [end_tag]
    return leading + tokenized_sentence + trailing

# Demo: pad a tokenized sentence for a trigram model (n = 3)
tokenized_sentence = 'i am happy because i am learning .'.split()
tokenized_sentence_with_tags = add_tags(tokenized_sentence, n=3)
print(tokenized_sentence_with_tags)

['<s>', '<s>', 'i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.', '</s>']
