<a href="https://colab.research.google.com/github/salsaadityani/natural-language-processing/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**N-GRAM**

In [None]:
import re
import unicodedata
import string
import random
import nltk
from nltk.probability import ConditionalFreqDist

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
def filter(text):
    """Reduce raw text to lower-case ASCII letters separated by single spaces.

    Processing order:
      1. decompose accented characters and drop everything that is not ASCII;
      2. replace anything shaped like an HTML tag with a space;
      3. delete punctuation (neighbouring characters are joined, so
         "don't" becomes "dont");
      4. turn every remaining non-letter, digits included, into a space;
      5. lower-case and collapse runs of whitespace.

    Returns the cleaned text as a single string.
    """
    # NFKD splits accented letters into base letter + mark; the mark is then dropped
    ascii_only = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # '.' does not match a newline here, so only single-line tags are blanked
    untagged = re.sub('<.*?>', ' ', ascii_only)
    # third maketrans argument lists the characters to delete
    punctuation_table = str.maketrans(' ', ' ', string.punctuation)
    unpunctuated = untagged.translate(punctuation_table)
    # whatever is still not a letter (digits, newlines, ...) becomes a space
    letters = re.sub('[^a-zA-Z]', ' ', unpunctuated)
    letters = re.sub("\n", " ", letters)
    # split()/join squeezes repeated whitespace and trims both ends
    return ' '.join(letters.lower().split())

In [None]:
# Split the filtered text into tokens and lemmatize each one
def clean(text):
    """Tokenize `text` with NLTK and return the list of lemmatized tokens.

    The result has one entry per token produced by `nltk.word_tokenize`,
    in the original order.
    """
    tokens = nltk.word_tokenize(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [None]:
# Build a trigram language model: (w1, w2) -> {w3: probability}
def n_gram_model(text):
    """Return a ConditionalFreqDist mapping each word pair to next-word probabilities.

    `text` is a sequence of tokens. It is padded on the left with '<s>' and
    on the right with '</s>' so that the first and last words also appear in
    trigrams. Counts are collected per (w1, w2) context and then divided by
    the context total, so the values stored for each context sum to 1.
    """
    padded_trigrams = nltk.ngrams(
        text, 3,
        pad_left=True, pad_right=True,
        left_pad_symbol='<s>', right_pad_symbol='</s>',
    )

    cfdist = ConditionalFreqDist()
    for first, second, third in padded_trigrams:
        cfdist[(first, second)][third] += 1

    # normalise each context's counts in place so they become probabilities
    for context in cfdist:
        followers = cfdist[context]
        context_total = float(sum(followers.values()))
        for word in followers:
            followers[word] /= context_total

    return cfdist

In [None]:
def predict(model, user_input):
    """Interactively extend a phrase one word at a time with the trigram model.

    Parameters
    ----------
    model : mapping
        Maps a (w1, w2) tuple to a mapping {next_word: probability}, as
        returned by `n_gram_model`.
    user_input : str
        Seed phrase. It is normalised with `filter`; its last two words form
        the first lookup context.

    For every step the candidates are printed from most to least likely, one
    of them is drawn at random with the probabilities as weights, and the
    extended phrase is printed. The user is then asked whether to continue:
    'y' generates another word, 'n' prints "done" and stops, anything else
    stops silently. Nothing is returned.

    The function stops with a message instead of raising when the phrase has
    fewer than two words or when the model has never seen the current context.
    """
    # NOTE(review): the model is trained on lemmatized tokens (see `clean`),
    # while the phrase here is only filtered -- inflected words may not match.
    words = filter(user_input).split()

    # Loop instead of recursing: no growing call stack, and no need to
    # round-trip the word list through a string between steps.
    while True:
        if len(words) < 2:
            print("Need at least two words to make a trigram prediction.")
            return

        context = (words[-2], words[-1])
        # Membership test first: indexing a defaultdict-based model with an
        # unseen context would silently insert an empty entry into it.
        candidates = dict(model[context]) if context in model else {}
        if not candidates:
            print("No prediction available after: " + ' '.join(context))
            return

        # display prediction from highest to lowest maximum likelihood
        prediction = sorted(candidates, key=candidates.get, reverse=True)
        print("Trigram model predictions: ", prediction)

        # pick from a weighted random probability of predictions
        next_word = random.choices(list(candidates), weights=list(candidates.values()), k=1)[0]
        # add predicted word to the phrase
        words.append(next_word)
        print(' '.join(words))

        ask = input("Do you want to generate another word? (type 'y' for yes or 'n' for no): ")
        if ask.lower() != 'y':
            if ask.lower() == 'n':
                print("done")
            return

In [None]:
#main

# Read the whole corpus in one call; the context manager closes the file
# afterwards. utf8 matches how the RNN section reads the same file.
with open('alice_in_wonderland.txt', 'r', encoding='utf8') as file:
    text = file.read()

# pre-process text
print("Filtering...")
words = filter(text)
print("Cleaning...")
words = clean(words)

# make language model
print("Making model...")
model = n_gram_model(words)

# example: "alice said to the"
print("Enter a phrase: ")
user_input = input()
predict(model, user_input)

Filtering...
Cleaning...
Making model...
Enter a phrase: 
alice said to the
Trigram model predictions:  ['project', 'other', 'jury', 'door', 'table', 'knave', 'mock', 'gryphon', 'full', 'little', 'end', 'shore', 'beginning', 'dormouse', 'game', 'queen', 'term', 'garden', 'seaside', 'general', 'cur', 'fifth', 'puppy', 'law', 'baby', 'three', 'king', 'rosetree', 'conclusion', 'cheshire', 'duchess', 'executioner', 'croquetground', 'company', 'classic', 'whiting', 'dance', 'porpoise', 'part', 'caterpillar', 'hatter', 'head', 'tart', 'waving', 'voice', 'confused', 'trademark', 'user', 'owner', 'person']
alice said to the trademark
Do you want to generate another word? (type 'y' for yes or 'n' for no): y
Trigram model predictions:  ['owner', 'license']
alice said to the trademark owner
Do you want to generate another word? (type 'y' for yes or 'n' for no): n
done


**RNN**

In [None]:
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
#from keras.optimizers import RMSprop, Adam
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [None]:
# Number of characters the network sees before predicting the next one
SEQ_LENGTH = 100

def buildmodel(VOCABULARY):
    """Build and compile the character-level LSTM classifier.

    VOCABULARY is the number of output classes: the final Dense layer has
    that many softmax units. The input is a window of SEQ_LENGTH time steps
    with one feature each. Returns the compiled Keras model.
    """
    layers = [
        # first LSTM returns the full sequence so a second LSTM can be stacked on it
        LSTM(256, input_shape = (SEQ_LENGTH, 1), return_sequences = True),
        Dropout(0.2),
        LSTM(256),
        Dense(VOCABULARY, activation = 'softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
    return network

In [None]:
# Load the whole corpus as one lower-cased string; the context manager
# closes the file handle once the text has been read.
with open('alice_in_wonderland.txt', encoding = 'utf8') as file:
    raw_text = file.read().lower()

In [None]:
# Characters to strip from the corpus before building the vocabulary
# ('\ufeff' is the byte-order mark)
bad_chars = ['#', '*', '@', '_', '\ufeff']
for unwanted in bad_chars:
    raw_text = raw_text.replace(unwanted, "")

In [None]:
# Vocabulary: every distinct character left in the corpus, in sorted order
chars = sorted(set(raw_text))
print(chars)

['\n', ' ', '!', '"', '$', '%', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
# Corpus size and vocabulary size; VOCABULARY is the number of output classes
char_length = len(chars)
text_length = len(raw_text)
VOCABULARY = char_length
print("Text length = ", text_length, sep="")
print("No. of characters = ", char_length, sep="")

Text length = 163006
No. of characters = 54


In [None]:
# Map every vocabulary character to its integer id
char_to_int = {c: i for i, c in enumerate(chars)}

# Encode the corpus once; each training window is then a plain slice
# instead of a fresh per-character dictionary lookup.
encoded_text = [char_to_int[c] for c in raw_text]

# Sliding windows: SEQ_LENGTH characters in, the character that follows out
input_strings = []
output_strings = []
for i in range(len(raw_text) - SEQ_LENGTH):
    input_strings.append(encoded_text[i: i + SEQ_LENGTH])
    output_strings.append(encoded_text[i + SEQ_LENGTH])

length = len(input_strings)
input_strings = np.array(input_strings)
# (samples, time steps, features) layout expected by the LSTM input layer
input_strings = np.reshape(input_strings, (input_strings.shape[0], input_strings.shape[1], 1))
# scale the integer ids by the vocabulary size
input_strings = input_strings/float(VOCABULARY)

output_strings = np.array(output_strings)
# Pin the one-hot width to VOCABULARY: without num_classes the width is
# inferred from the largest id seen and could be narrower than the
# Dense(VOCABULARY) output layer.
output_strings = np_utils.to_categorical(output_strings, num_classes=VOCABULARY)
print(input_strings.shape)
print(output_strings.shape)

(162906, 100, 1)
(162906, 54)


In [None]:
# Fresh network plus a checkpoint callback that writes the weights every
# time the training loss reaches a new minimum.
model = buildmodel(VOCABULARY)
# {epoch} and {loss} are filled in by the callback when it saves
filepath="saved_models/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [None]:
history = model.fit(input_strings, output_strings, epochs = 50, batch_size = 128, callbacks = callbacks_list)

# Reload the best checkpoint written during this run. The file name is
# rebuilt from the training history with the same template the checkpoint
# callback used, so it cannot go stale the way a hard-coded name from an
# earlier run does. With save_best_only=True the epoch with the lowest loss
# always has a checkpoint.
epoch_losses = history.history['loss']
best_epoch = int(np.argmin(epoch_losses))
# the callback numbers epochs from 1
filename = filepath.format(epoch = best_epoch + 1, loss = epoch_losses[best_epoch])
model = buildmodel(VOCABULARY)
model.load_weights(filename)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

Epoch 1/50
Epoch 00001: loss improved from inf to 2.87666, saving model to saved_models/weights-improvement-01-2.8767.hdf5
Epoch 2/50
Epoch 00002: loss improved from 2.87666 to 2.59669, saving model to saved_models/weights-improvement-02-2.5967.hdf5
Epoch 3/50
Epoch 00003: loss improved from 2.59669 to 2.41659, saving model to saved_models/weights-improvement-03-2.4166.hdf5
Epoch 4/50
Epoch 00004: loss improved from 2.41659 to 2.26560, saving model to saved_models/weights-improvement-04-2.2656.hdf5
Epoch 5/50
Epoch 00005: loss improved from 2.26560 to 2.14796, saving model to saved_models/weights-improvement-05-2.1480.hdf5
Epoch 6/50
Epoch 00006: loss improved from 2.14796 to 2.05414, saving model to saved_models/weights-improvement-06-2.0541.hdf5
Epoch 7/50
Epoch 00007: loss improved from 2.05414 to 1.97685, saving model to saved_models/weights-improvement-07-1.9769.hdf5
Epoch 8/50
Epoch 00008: loss improved from 1.97685 to 1.91118, saving model to saved_models/weights-improvement-08-

In [None]:
# Seed text for generation; every character must be in the training vocabulary
initial_text = ' the sun did not shine, it was too wet to play, so we sat in the house all that cold, cold wet day. ' # we sat here we two and we said how we wish we had something to do.
initial_text = [char_to_int[c] for c in initial_text]

# Number of characters to generate
GENERATED_LENGTH = 10

# The network input is exactly SEQ_LENGTH steps wide: fail early with a clear
# message on a short seed, and use only the tail of a longer one.
if len(initial_text) < SEQ_LENGTH:
    raise ValueError("initial_text must contain at least SEQ_LENGTH characters")
# Work on a copy so that generating text does not modify the seed
test_text = list(initial_text[-SEQ_LENGTH:])
generated_text = []

int_to_char = {i: c for i, c in enumerate(chars)}

for step in range(GENERATED_LENGTH):
    X = np.reshape(test_text, (1, SEQ_LENGTH, 1))
    # same scaling as the training inputs
    next_character = model.predict(X/float(VOCABULARY))
    # greedy decoding: always take the most probable character
    index = int(np.argmax(next_character))
    generated_text.append(int_to_char[index])
    # slide the window: drop the oldest character, append the new one
    test_text = test_text[1:] + [index]

print(''.join(generated_text))