# Dataset Import/Preprocessing

In [1]:
import numpy as np
import string
import pickle
from collections import defaultdict, Counter
from spellchecker import SpellChecker

# Shared spell-checker instance; predict_trigram uses it to correct input
# words that are not in the corpus vocabulary.
spell = SpellChecker()

In [2]:
# Read the whole corpus file into one string (decoded as UTF-8).
with open("movie_lines.txt", "r", encoding='utf-8') as f:
    text = f.read()

# Preview the first 500 characters of the raw text.
print(text[:500])

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.	Well, I thought we'd start with pronunciation, if that's okay with you.
Well, I thought we'd start with pronunciation, if that's okay with you.	Not the hacking and gagging and spitting part.  Please.
Not the hacking and gagging and spitting part.  Please.	Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
You're asking me out.  That's so cut


In [3]:
# Character count of the raw corpus.
print(len(text))

24654712


In [4]:
# Replace every tab in the corpus with a single space, then preview the result.
text = text.replace("\t", " ")
print(text[:500])

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. Well, I thought we'd start with pronunciation, if that's okay with you.
Well, I thought we'd start with pronunciation, if that's okay with you. Not the hacking and gagging and spitting part.  Please.
Not the hacking and gagging and spitting part.  Please. Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
You're asking me out.  That's so cut


In [5]:
# Character count after the tab replacement. The substitution is one character
# for one character, so the length is unchanged.
print(len(text))

24654712


In [6]:
# Lower-case the whole corpus so that word lookups are case-insensitive;
# the prediction helpers lower-case their input the same way.
text = text.lower()
print(text[:500])

can we make this quick?  roxanne korrine and andrew barrett are having an incredibly horrendous public break- up on the quad.  again. well, i thought we'd start with pronunciation, if that's okay with you.
well, i thought we'd start with pronunciation, if that's okay with you. not the hacking and gagging and spitting part.  please.
not the hacking and gagging and spitting part.  please. okay... then how 'bout we try out some french cuisine.  saturday?  night?
you're asking me out.  that's so cut


In [7]:
# Remove every ASCII punctuation character (string.punctuation) in one pass.
# str.translate with a deletion table produces the same text as calling
# text.replace(ch, "") once per punctuation character, but scans and copies
# the multi-megabyte corpus a single time instead of once per character.
text = text.translate(str.maketrans("", "", string.punctuation))
print(text[:500])

can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again well i thought wed start with pronunciation if thats okay with you
well i thought wed start with pronunciation if thats okay with you not the hacking and gagging and spitting part  please
not the hacking and gagging and spitting part  please okay then how bout we try out some french cuisine  saturday  night
youre asking me out  thats so cute whats your name again forg


In [8]:
# Split on any run of whitespace; line breaks are treated like spaces, so the
# token stream runs straight across line boundaries.
tokens = text.split()
print(tokens[:50])
print("Total Words: ", len(tokens))

['can', 'we', 'make', 'this', 'quick', 'roxanne', 'korrine', 'and', 'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', 'horrendous', 'public', 'break', 'up', 'on', 'the', 'quad', 'again', 'well', 'i', 'thought', 'wed', 'start', 'with', 'pronunciation', 'if', 'thats', 'okay', 'with', 'you', 'well', 'i', 'thought', 'wed', 'start', 'with', 'pronunciation', 'if', 'thats', 'okay', 'with', 'you', 'not', 'the', 'hacking', 'and']
Total Words:  4569409


In [9]:
# First 800,000 tokens only. This smaller slice is what the Keras tokenizer is
# fitted on and what the LSTM training windows are built from; the n-gram
# tables use the full `tokens` list.
small_tokens = tokens[:800000]

In [10]:
# Set of distinct words in the full corpus; predict_trigram uses it to decide
# whether an input word needs spell correction.
vocab = set(tokens)
print("Vocabulary size:", len(vocab))

Vocabulary size: 66276


# Bigram Model Creation and Saving

In [11]:
# Maps a word to the list of every word observed directly after it
# (duplicates kept, so frequencies can be counted at prediction time).
bigram_model = defaultdict(list)

In [12]:
# Pair each token with its immediate successor and record the successor under
# its predecessor. Repeats are appended as-is so they can be counted later.
for current_word, next_word in zip(tokens, tokens[1:]):
    bigram_model[current_word].append(next_word)

In [13]:
def predict_next_word(word, k=3):
    """Return the ``k`` most frequent followers of ``word`` in the bigram model.

    Args:
        word: The context word to look up in ``bigram_model``.
        k: Maximum number of suggestions to return.

    Returns:
        A list of follower words ordered from most to least frequent, or the
        plain string ``"No Suggestion"`` when ``word`` has no entry in the
        model. Callers must be prepared for either type.
    """
    # .get() never triggers the defaultdict factory, so an unknown word is not
    # silently added to the model by this lookup.
    followers = bigram_model.get(word)
    if followers is None:
        return "No Suggestion"
    ranked = Counter(followers).most_common(k)
    return [candidate for candidate, _count in ranked]

In [14]:
def predict_from_sentence(sentence, k=3):
    """Suggest next words for free-form text using the bigram model.

    The text is lower-cased and stripped of ASCII punctuation, and the last
    remaining word is handed to ``predict_next_word``.

    Args:
        sentence: Raw input text.
        k: Maximum number of suggestions to request.

    Returns:
        Whatever ``predict_next_word`` returns for the last word, or the
        string ``"No Input"`` when no word is left after cleaning.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    cleaned_words = sentence.lower().translate(drop_punctuation).split()
    if not cleaned_words:
        return "No Input"
    return predict_next_word(cleaned_words[-1], k)

In [15]:
# Smoke-test the bigram predictor on a few short prompts.
for prompt in ("thank", "how are", "i love"):
    print(predict_from_sentence(prompt))

['you', 'god', 'me']
['you', 'we', 'the']
['you', 'with', 'to']


In [16]:
# Save Bigram Model
# Pickles the word -> list-of-followers mapping to bigram.pkl in the working directory.
with open("bigram.pkl", "wb") as f:
    pickle.dump(bigram_model, f)
print("Bigram model saved.")

Bigram model saved.


# Trigram Model Creation and Saving

In [17]:
# Maps a (word1, word2) pair to the list of every word observed directly after
# that pair (duplicates kept for frequency counting).
trigram_model = defaultdict(list)

In [18]:
# Slide a three-token window over the corpus: the first two tokens form the
# lookup key and the third is recorded as an observed continuation.
for w1, w2, w3 in zip(tokens, tokens[1:], tokens[2:]):
    trigram_model[(w1, w2)].append(w3)

In [19]:
def predict_trigram(sentence, k=3):
    """Suggest up to ``k`` next words from the last two words of ``sentence``.

    The sentence is lower-cased and stripped of ASCII punctuation. Each of the
    last two words that is not in ``vocab`` is passed through
    ``spell.correction``. The resulting pair is looked up in
    ``trigram_model``; if the pair was never observed, the bigram model is
    consulted with the (corrected) last word instead.

    Args:
        sentence: Free-form input text.
        k: Maximum number of suggestions to return.

    Returns:
        A list of suggested words, most frequent first.
        ``["Need more words"]`` when fewer than two words remain after
        cleaning. On the bigram fallback path the result is whatever
        ``predict_next_word`` returns, which is the plain string
        ``"No Suggestion"`` for a word the bigram model does not know.
    """
    sentence = sentence.lower()
    sentence = sentence.translate(str.maketrans("", "", string.punctuation))

    words = sentence.split()
    if len(words) < 2:
        return ["Need more words"]

    # Spell Correction Step: only the last two words form the lookup key, so
    # only those two are checked and corrected.
    corrected_words = []
    for w in words[-2:]:
        if w in vocab:
            corrected_words.append(w)
        else:
            # correction() may return None when it has no candidate; keep the
            # original word in that case instead of putting None in the key.
            corrected_words.append(spell.correction(w) or w)

    key = (corrected_words[0], corrected_words[1])

    if key not in trigram_model:
        # Forward k so the fallback honours the requested suggestion count.
        return predict_next_word(corrected_words[-1], k)

    freq = Counter(trigram_model[key])
    return [w for w, _ in freq.most_common(k)]

In [20]:
# Smoke-test the trigram predictor on a few two-word prompts.
for prompt in ("i love", "how are", "going to"):
    print(predict_trigram(prompt))

['you', 'it', 'her']
['you', 'we', 'things']
['be', 'do', 'have']


In [21]:
def evaluate_trigram(sample_size=20000):
    """Measure top-1 accuracy of ``predict_trigram`` on the start of ``tokens``.

    For each of the first ``sample_size`` three-token windows, the first two
    tokens are given to ``predict_trigram`` and its top suggestion is compared
    with the third token.

    NOTE: the windows come from the same ``tokens`` list the n-gram tables
    were built from, so this reports fit on already-seen data, not held-out
    accuracy.

    Args:
        sample_size: Number of windows to evaluate. Values larger than the
            number of available windows are clamped.

    Returns:
        Fraction of windows whose top prediction equals the actual next word,
        or ``0.0`` when there is no window to evaluate.
    """
    # A window needs two context tokens plus the target, so only
    # len(tokens) - 2 start positions exist.
    n_windows = max(0, min(sample_size, len(tokens) - 2))
    if n_windows == 0:
        return 0.0

    correct = 0
    for i in range(n_windows):
        w1, w2, actual = tokens[i], tokens[i + 1], tokens[i + 2]
        prediction = predict_trigram(f"{w1} {w2}", k=1)
        # The fallback path may hand back a plain string (or an empty list);
        # only the first element of a non-empty list is a real suggestion.
        if isinstance(prediction, list) and prediction and prediction[0] == actual:
            correct += 1

    return correct / n_windows

In [22]:
# Top-1 accuracy over the default sample of windows taken from the start of
# the corpus (the same data the n-gram tables were built from).
acc = evaluate_trigram()
print("Accuracy:", acc)

Accuracy: 0.4065


In [23]:
# Deliberately misspelled input to exercise the spell-correction step.
print(predict_trigram("I lvoe"))

['you', 'it', 'her']


In [24]:
# Save Trigram Model
# Pickles the (word1, word2) -> list-of-followers mapping to trigram.pkl in the
# working directory.
with open("trigram.pkl", "wb") as f:
    pickle.dump(trigram_model, f)
print("Trigram model saved.")

Trigram model saved.


# LSTM_2 (Predict Next Word Using 2 Words)

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [26]:
# Load tokenizer if exists, else create new and fit it on the training slice.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a tokenizer.pkl that this notebook itself produced.
try:
    with open("tokenizer.pkl", "rb") as f:
        tokenizer = pickle.load(f)
except FileNotFoundError:
    tokenizer = Tokenizer()
    # Fit only a freshly created tokenizer. Fitting a loaded one again would
    # add this corpus on top of its stored counts on every run, making the
    # cell non-idempotent and contradicting the load-if-exists intent above.
    tokenizer.fit_on_texts(small_tokens)

word_index = tokenizer.word_index
# +1 so that the largest word index is a valid row in the embedding/output
# layers (word_index values start at 1 -- TODO confirm against Keras docs).
vocab_size = len(word_index) + 1
# Reverse lookup used to turn predicted class ids back into words.
index_word = {v: k for k, v in word_index.items()}
print("Vocab size:", vocab_size)

Vocab size: 23217


In [27]:
# Build X, y using 2-word input windows
X2 = []
y2 = []

# Slide a three-token window over the corpus slice: the first two tokens are
# the input pair and the third is the target. Any window containing a token
# the tokenizer did not index is skipped.
for w1, w2, w3 in zip(small_tokens, small_tokens[1:], small_tokens[2:]):
    if not (w1 in word_index and w2 in word_index and w3 in word_index):
        continue
    X2.append([word_index[w1], word_index[w2]])
    y2.append(word_index[w3])

X2 = np.array(X2)
y2 = np.array(y2)
print("X2 shape:", X2.shape)

X2 shape: (799998, 2)


In [28]:
# Build LSTM_2 model (2 input words)
# Embedding -> single LSTM layer -> softmax over the whole vocabulary. The
# targets are integer word ids, hence the sparse categorical loss.
model_2 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50),
    LSTM(100),
    Dense(vocab_size, activation='softmax'),
])
model_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model_2.summary()

In [None]:
# Cap the training set at 800,000 windows. With the X2 shape printed above,
# (799998, 2), the cap leaves the data unchanged.
X2_train = X2[:800000]
y2_train = y2[:800000]

# NOTE(review): no random seed is set anywhere in the visible notebook, so the
# loss values will differ from run to run.
history_2 = model_2.fit(
    X2_train,
    y2_train,
    epochs=5,
    batch_size=1024
)

Epoch 1/5
[1m326/782[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m1:48[0m 237ms/step - loss: 8.2115

In [None]:
def predict_lstm_2(sentence, k=3):
    """Suggest up to ``k`` next words with the 2-word LSTM (``model_2``).

    The sentence is lower-cased and stripped of ASCII punctuation; its last
    two words are mapped to ids through ``word_index`` and fed to the model.

    Args:
        sentence: Free-form input text.
        k: Number of suggestions to return. Zero or a negative value yields an
            empty list.

    Returns:
        A list of the ``k`` highest-scoring words, best first (an id missing
        from ``index_word`` is reported as ``"Unknown"``). The string
        ``"Need at least 2 words"`` when fewer than two words remain after
        cleaning, or ``"Unknown word"`` when either context word is not in
        ``word_index``.
    """
    sentence = sentence.lower()
    for ch in string.punctuation:
        sentence = sentence.replace(ch, "")
    words = sentence.split()
    if len(words) < 2:
        return "Need at least 2 words"
    w1, w2 = words[-2], words[-1]
    if w1 not in word_index or w2 not in word_index:
        return "Unknown word"
    if k <= 0:
        # Without this guard the slice below becomes [-0:], which selects the
        # entire array and would return the whole vocabulary.
        return []
    x = np.array([[word_index[w1], word_index[w2]]])
    prediction = model_2.predict(x, verbose=0)
    # argsort is ascending: take the last k ids and reverse them so the most
    # probable word comes first.
    top_ids = np.argsort(prediction[0])[-k:][::-1]
    return [index_word.get(int(i), "Unknown") for i in top_ids]

In [None]:
# Smoke-test the 2-word LSTM predictor on a few prompts.
for prompt in ("i love", "how are", "where are"):
    print(predict_lstm_2(prompt))

In [None]:
# Save LSTM_2 model and tokenizer
# The tokenizer is stored next to the model because the model's input and
# output ids are only meaningful together with this tokenizer's word_index.
model_2.save("lstm_2word.keras")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("LSTM_2 model saved.")

# LSTM_3 (Predict Next Word Using 3 Words)

In [None]:
# Build X, y using 3-word input windows
X = []
y = []

# Slide a four-token window over the corpus slice: the first three tokens are
# the input and the fourth is the target. Any window containing a token the
# tokenizer did not index is skipped.
for w1, w2, w3, w4 in zip(small_tokens, small_tokens[1:], small_tokens[2:], small_tokens[3:]):
    if not (w1 in word_index and w2 in word_index and w3 in word_index and w4 in word_index):
        continue
    X.append([word_index[w1], word_index[w2], word_index[w3]])
    y.append(word_index[w4])

X = np.array(X)
y = np.array(y)
print("X shape:", X.shape)

In [None]:
# Build LSTM_3 model (3 input words)
# Same architecture as the 2-word model: Embedding -> single LSTM layer ->
# softmax over the whole vocabulary, trained on integer word-id targets.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50),
    LSTM(100),
    Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

In [None]:
# Cap the training set at 800,000 windows.
X_train = X[:800000]
y_train = y[:800000]

# NOTE(review): no random seed is set anywhere in the visible notebook, so the
# loss values will differ from run to run.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=1024
)

In [None]:
def predict_lstm_3(sentence, k=3):
    """Suggest up to ``k`` next words with the 3-word LSTM (``model``).

    The sentence is lower-cased and stripped of ASCII punctuation; its last
    three words are mapped to ids through ``word_index`` and fed to the model.

    Args:
        sentence: Free-form input text.
        k: Number of suggestions to return. Zero or a negative value yields an
            empty list.

    Returns:
        A list of the ``k`` highest-scoring words, best first (an id missing
        from ``index_word`` is reported as ``"Unknown"``). The string
        ``"Need at least 3 words"`` when fewer than three words remain after
        cleaning, or ``"Unknown word"`` when any context word is not in
        ``word_index``.
    """
    sentence = sentence.lower()
    for ch in string.punctuation:
        sentence = sentence.replace(ch, "")
    words = sentence.split()
    if len(words) < 3:
        return "Need at least 3 words"
    w1, w2, w3 = words[-3], words[-2], words[-1]
    if w1 not in word_index or w2 not in word_index or w3 not in word_index:
        return "Unknown word"
    if k <= 0:
        # Without this guard the slice below becomes [-0:], which selects the
        # entire array and would return the whole vocabulary.
        return []
    x = np.array([[word_index[w1], word_index[w2], word_index[w3]]])
    prediction = model.predict(x, verbose=0)
    # argsort is ascending: take the last k ids and reverse them so the most
    # probable word comes first.
    top_ids = np.argsort(prediction[0])[-k:][::-1]
    return [index_word.get(int(i), "Unknown") for i in top_ids]

In [None]:
# Smoke-test the 3-word LSTM predictor; only the last three words of each
# prompt are used.
for prompt in ("i love how", "how are we", "where are you going"):
    print(predict_lstm_3(prompt))

In [None]:
# Save LSTM_3 model and tokenizer
# The tokenizer is written to the same tokenizer.pkl path the LSTM_2 save cell
# uses, so this overwrites that file with the same tokenizer object.
model.save("lstm_3word.keras")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("LSTM_3 model saved.")