In [2]:
!pip install datasets pandas transformers huggingface_hub tensorflow



In [3]:
import re
import os
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from datasets import load_dataset
from huggingface_hub import login

# 1) Reproducibility

In [4]:
# One seed shared by NumPy, TensorFlow and (later) the scikit-learn splits.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

# 2) Hyperparameters



In [5]:
# Tunable settings; the cells below only read these names.
SEQ_LEN = 20           # context length (token ids) each sample is padded/truncated to
VOCAB_SIZE = 20000     # passed to the tokenizer as num_words; layers are sized VOCAB_SIZE + 1
EMBED_DIM = 256        # output dimension of the Embedding layer
LSTM_UNITS = 512       # units of the LSTM layer
BATCH_SIZE = 128       # batch size for the tf.data pipelines and model.predict
EPOCHS = 1             # number of epochs passed to model.fit
TOPK_FOR_CM = 50       # how many of the most frequent target tokens the confusion matrix shows
MIN_CHARS = 12         # cleaned lines shorter than this many characters are dropped
MAX_SAMPLES = 200_000  # cap on the number of raw lines kept; None keeps everything

# 3) Load dataset


In [6]:
print("Loading dataset...")
ds = load_dataset("aarohanverma/simple-daily-conversations-cleaned", split="train")

col = "data"
if col not in ds.column_names:
    raise ValueError(f"Expected a {col!r} column; found {ds.column_names}")

# Truncate *before* materializing, and read the column directly: iterating the
# dataset row by row builds one dict per row just to pull a single field out.
# `ds` itself is left untouched so the full split stays available.
if MAX_SAMPLES is not None:
    ds_subset = ds.select(range(min(MAX_SAMPLES, len(ds))))
else:
    ds_subset = ds

texts = list(ds_subset[col])

print(f"Total raw lines: {len(texts)}")

Loading dataset...
Total raw lines: 98472


# 4) Basic cleaning

In [7]:
def clean_text(s):
    """Normalize the whitespace of one raw line.

    Returns an empty string for ``None``. Otherwise leading/trailing whitespace
    is stripped and every internal run of whitespace becomes a single space.
    """
    if s is None:
        return ""
    return re.sub(r"\s+", " ", s.strip())

# Normalize every line, then keep only those with at least MIN_CHARS characters.
normalized = map(clean_text, texts)
texts = [line for line in normalized if len(line) >= MIN_CHARS]
print(f"After filtering short lines: {len(texts)}")

After filtering short lines: 98472


# 5) Train/Val/Test split at line-level

In [8]:
# Hold out 1% of all lines for testing, then 1% of what remains for validation.
# Splitting whole lines keeps every prefix of a line inside a single split.
train_texts, test_texts = train_test_split(texts, random_state=SEED, test_size=0.01)
train_texts, val_texts = train_test_split(train_texts, random_state=SEED, test_size=0.01)

n_train, n_val, n_test = len(train_texts), len(val_texts), len(test_texts)
print(f"Split sizes → train: {n_train}, val: {n_val}, test: {n_test}")

Split sizes → train: 96512, val: 975, test: 985


# 6) Tokenize

In [17]:
# The vocabulary is fitted on the training lines only, so validation and test
# text never influences it.
tokenizer = Tokenizer(
    oov_token="<UNK>",
    num_words=VOCAB_SIZE,
)
tokenizer.fit_on_texts(train_texts)

def make_sequences(lines, seq_len, tok=None):
    """Turn text lines into (context, next-token) pairs for next-word prediction.

    Every line is tokenized; for each position ``i >= 1`` one sample is emitted
    whose context is the (at most ``seq_len``) token ids before position ``i``,
    left-padded with 0, and whose target is the token id at position ``i``.
    Lines that tokenize to fewer than two ids contribute nothing.

    Args:
        lines: Iterable of text lines.
        seq_len: Width of every context row.
        tok: Object exposing ``texts_to_sequences(list_of_str)``. Defaults to the
            notebook-level ``tokenizer`` so existing two-argument calls keep working.

    Returns:
        ``(X, y)``: ``X`` is an int32 array of shape ``(n_samples, seq_len)`` and
        ``y`` an int32 array of shape ``(n_samples,)``. With no usable lines the
        shapes are ``(0, seq_len)`` and ``(0,)``.
    """
    tok = tokenizer if tok is None else tok

    # Tokenize all lines in one call, then keep only lines that yield a sample.
    token_lists = [ids for ids in tok.texts_to_sequences(list(lines)) if len(ids) >= 2]

    # Each kept line of length n yields n - 1 samples; allocate the result once
    # instead of growing a list of lists and padding it afterwards.
    n_samples = sum(len(ids) - 1 for ids in token_lists)
    X = np.zeros((n_samples, seq_len), dtype=np.int32)
    y = np.zeros((n_samples,), dtype=np.int32)

    row = 0
    for ids in token_lists:
        for i in range(1, len(ids)):
            context = ids[max(0, i - seq_len):i]   # most recent seq_len ids
            X[row, seq_len - len(context):] = context  # right-aligned => "pre" padding
            y[row] = ids[i]
            row += 1
    return X, y

print("Building train sequences (this may take a moment)...")
X_train, y_train = make_sequences(train_texts, SEQ_LEN)
print("Building val sequences...")
X_val,   y_val   = make_sequences(val_texts, SEQ_LEN)
print("Building test sequences...")
X_test,  y_test  = make_sequences(test_texts, SEQ_LEN)

print(f"Train samples: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# Fail here with a readable message instead of deep inside model.fit().
if len(X_train) == 0:
    raise ValueError("No training samples were produced; check the loading/cleaning/tokenizing cells.")

# The samples are ordered line by line (all prefixes of one line are adjacent),
# and Keras does not shuffle a tf.data.Dataset passed to fit(), so shuffle here.
shuffle_buffer = min(len(X_train), 100_000)
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(shuffle_buffer, seed=SEED, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Evaluation order does not matter, so validation/test are only batched and prefetched.
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


Building train sequences (this may take a moment)...
Building val sequences...
Building test sequences...
Train samples: 13, Val: 0, Test: 0


# 7) Build Keras Sequential model

In [18]:
# Next-token classifier: token ids -> embeddings -> LSTM -> softmax over token ids.
# Both the embedding table and the output layer have VOCAB_SIZE + 1 entries.
lm_layers = [
    tf.keras.Input(shape=(SEQ_LEN,), dtype='int32'),
    layers.Embedding(VOCAB_SIZE + 1, EMBED_DIM, mask_zero=True),  # id 0 is masked
    layers.LSTM(LSTM_UNITS),
    layers.Dense(VOCAB_SIZE + 1, activation="softmax"),
]
model = models.Sequential(lm_layers)

# Targets are integer token ids, hence the sparse cross-entropy loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model.summary()

# 8) Train

In [22]:
# Validation is skipped when the validation split produced no samples. In that
# case "val_loss" is never reported, so early stopping must watch the training
# loss instead; otherwise the callback cannot find its metric.
has_validation = len(X_val) > 0

callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss" if has_validation else "loss",
        patience=2,
        restore_best_weights=True,
    )
]

history = model.fit(
    train_dataset,
    validation_data=val_dataset if has_validation else None,
    epochs=EPOCHS,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.0000e+00 - loss: 9.9031

ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(128,), dtype=int32). Expected shape (None, 20), but input has incompatible shape (128,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(128,), dtype=int32)
  • training=False
  • mask=None
  • kwargs=<class 'inspect._empty'>

# 9) Evaluate (accuracy + perplexity)

In [None]:
print("\nEvaluating on test set...")
if len(X_test) == 0:
    print("No test samples available.")
else:
    # evaluate() returns the loss followed by the compiled metrics (accuracy).
    test_loss, test_acc = model.evaluate(test_dataset, verbose=0)
    # Perplexity is the exponential of the mean cross-entropy loss.
    perplexity = math.exp(test_loss)
    print(f"Test loss: {test_loss:.4f} | Test acc: {test_acc:.4f} | Perplexity: {perplexity:.2f}")

# 10) Plot training curves

In [None]:
# One figure per metric: (history key, train legend label, validation key, y-axis label, title).
curve_specs = [
    ("loss", "train_loss", "val_loss", "Loss", "Training/Validation Loss"),
    ("accuracy", "train_acc", "val_accuracy", "Accuracy", "Training/Validation Accuracy"),
]

for metric_key, train_label, val_key, y_label, title in curve_specs:
    plt.figure()
    plt.plot(history.history[metric_key], label=train_label)
    # The validation curve exists only when fit() was given validation data.
    if val_key in history.history:
        plt.plot(history.history[val_key], label=val_key)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()

# 11) Confusion Matrix (top-K frequent tokens)

In [20]:
# id -> word lookup. Built outside the `if` so the name exists even when the
# test split is empty; generate_text() in a later cell reads it.
index_word = {v: k for k, v in tokenizer.word_index.items()}

if len(y_test) > 0:
    counts = np.bincount(y_test, minlength=VOCAB_SIZE + 1)

    # The K most frequent target ids, most frequent first. Ids that never occur
    # are dropped so a small test split cannot contribute all-zero rows
    # (including the padding id 0).
    topk_indices = counts.argsort()[-TOPK_FOR_CM:][::-1]
    topk_indices = topk_indices[counts[topk_indices] > 0]

    # Keep only the samples whose true next token is one of the top-K ids.
    mask_topk = np.isin(y_test, topk_indices)
    X_cm = X_test[mask_topk]
    y_true_cm = y_test[mask_topk]

    y_pred_probs = model.predict(X_cm, batch_size=BATCH_SIZE, verbose=0)
    y_pred_cm = y_pred_probs.argmax(axis=1)

    labels_cm = [index_word.get(int(i), f"<{i}>") for i in topk_indices]
    labels_cm_extended = labels_cm + ["OTHER"]

    # Lookup table: token id -> row/column of the matrix. Every id outside the
    # top-K maps to the last position, the "OTHER" bucket.
    other_pos = len(topk_indices)
    id_to_pos = np.full(max(len(counts), y_pred_probs.shape[1]), other_pos, dtype=np.int64)
    id_to_pos[topk_indices] = np.arange(other_pos)

    y_true_plot = id_to_pos[y_true_cm]
    y_pred_plot = id_to_pos[y_pred_cm]

    cm = confusion_matrix(y_true_plot, y_pred_plot, labels=list(range(len(labels_cm_extended))))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels_cm_extended)
    fig, ax = plt.subplots(figsize=(10, 10))
    disp.plot(ax=ax, xticks_rotation=90, colorbar=False)
    plt.title(f"Confusion Matrix (Top-{TOPK_FOR_CM} tokens + OTHER)")
    plt.tight_layout()
    plt.show()
else:
    print("No test samples available; skipping confusion matrix.")



UnboundLocalError: cannot access local variable 'batch_outputs' where it is not associated with a value

# 12) Text generation helper

In [None]:
def generate_text(seed_text, num_tokens=30, temperature=1.0):
    """Extend ``seed_text`` with up to ``num_tokens`` words predicted by ``model``.

    Args:
        seed_text: Prompt to continue.
        num_tokens: Maximum number of words to append.
        temperature: Sampling temperature. A positive value samples the next
            word from the temperature-scaled distribution (lower values stay
            closer to the most likely word). Zero or a negative value picks the
            most likely word deterministically.

    Returns:
        The stripped prompt followed by the generated words, space-separated.
        Generation stops early if the chosen id has no word in the tokenizer.

    Note:
        Previously the scaled distribution was passed to ``argmax``, which made
        ``temperature`` a no-op; it now controls sampling. Draws come from
        NumPy's global RNG.
    """
    # Read the id -> word map from the tokenizer itself rather than from a
    # notebook-level variable that another cell may or may not have defined.
    id_to_word = tokenizer.index_word

    text = seed_text.strip()
    # Tokenize the prompt once and extend the id list directly, instead of
    # re-tokenizing the growing string on every step.
    token_ids = list(tokenizer.texts_to_sequences([text])[0])

    for _ in range(num_tokens):
        window = pad_sequences([token_ids[-SEQ_LEN:]], maxlen=SEQ_LEN, padding="pre")
        # float64 copy: avoids mutating the model output and keeps the
        # renormalized probabilities within np.random.choice's tolerance.
        probs = np.array(model.predict(window, verbose=0)[0], dtype=np.float64)

        logits = np.log(probs + 1e-12)
        logits[0] = -np.inf  # id 0 is the padding value, never a real word

        if temperature > 0:
            scaled = np.exp((logits - logits.max()) / temperature)
            scaled /= scaled.sum()
            next_id = int(np.random.choice(scaled.size, p=scaled))
        else:
            next_id = int(np.argmax(logits))

        next_word = id_to_word.get(next_id)
        if not next_word:
            break
        token_ids.append(next_id)
        text += " " + next_word
    return text

# 13) Try asking / prompting


In [None]:
# Prompt the model with an everyday question and print its continuation.
seed = "I had a really tough day at work, what should I do"
print("\n--- Generation example ---")
continuation = generate_text(seed, num_tokens=25, temperature=0.9)
print(continuation)

# 14) Save model & tokenizer

In [None]:
import json

# Write the trained model and the fitted tokenizer side by side in saved_model/.
os.makedirs("saved_model", exist_ok=True)
model.save("saved_model/convo_lm_keras.h5")

tokenizer_payload = tokenizer.to_json()
with open("saved_model/tokenizer.json", mode="w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer_payload)

print("\nSaved model to saved_model/convo_lm_keras.h5 and tokenizer to saved_model/tokenizer.json")