In [1]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda
from tensorflow.keras.models import Model
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import kagglehub
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Lambda
import numpy as np
import shutil
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_fpath = '../data/processed/IMDB_reviews_train_cleaned.json'
test_fpath = '../data/processed/IMDB_reviews_test.json'

In [3]:
train = pd.read_json(train_fpath)
test = pd.read_json(test_fpath)

In [4]:
train.head(5)

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating_x,review_summary,plot_summary,duration,genre,...,release_date,plot_synopsis,review_year,review_month,review_day,release_year,release_month,release_day,polarity,subjectivity
407943,1156377600000,tt0463985,ur10173727,False,i saw the i not impressed not a but the car ar...,5,not such a good movie,An American teenager named Sean Boswell is a l...,104,"[Action, Crime, Thriller]",...,1150416000000,"In rural Arizona, a young, 17-year-old, rednec...",2006,8,24,2006,6,16,0.446429,0.619444
171849,1430870400000,tt0790724,ur0453068,False,jack reacher solid action set piece when they ...,6,"good stuff, except...","In an innocent heartland city, five are shot d...",130,"[Action, Crime, Mystery]",...,1356048000000,"In Pittsburgh, Pennsylvania, a man drives a va...",2015,5,6,2012,12,21,0.195402,0.588697
557247,1132876800000,tt0160862,ur7395261,True,i saw not another teen watching although did n...,10,A good movie to pass the time,She's All That is your typical high school pro...,95,"[Comedy, Romance]",...,917568000000,,2005,11,25,1999,1,29,0.106347,0.621815
374693,1147737600000,tt0388795,ur8323774,False,absolutely the best film should have done at t...,10,Heart Wrenching,"Two young men, Ennis Del Mar and Jack Twist, m...",134,"[Drama, Romance]",...,1137110400000,"In the summer of 1963, two young men meet when...",2006,5,16,2006,1,13,0.340985,0.621667
546384,1324771200000,tt0120812,ur24340247,True,rush hour starting jackie chan chris tucker th...,10,The Best Action Comedy Movie Ever,Cultures clash and tempers flares as the two c...,98,"[Action, Comedy, Crime]",...,906076800000,When kidnappers grab a Chinese official's daug...,2011,12,25,1998,9,18,0.265476,0.472619


In [5]:
# download GoogleNews vectors negative300 for Mercari
path = kagglehub.dataset_download("leadbest/googlenewsvectorsnegative300")



In [6]:
saved_dir = Path("../data/external")
src = Path(path)

items = src.iterdir() if src.is_dir() else [src]
for p in items:
    shutil.copy2(p, saved_dir / p.name)

In [7]:
# config
# ---------- CONFIG ----------
TEXT_COL_A = "review_text"      # your review text column
TEXT_COL_B = "plot_synopsis"      # movie plot / synopsis column
LABEL_COL  = "is_spoiler"         # binary label {0,1}
MIN_FREQ   = 2
EMBEDDING_DIM = 300               
LSTM_HID   = 50
BATCH_SIZE = 64
EPOCHS     = 10

In [8]:
# model config
WORD2VEC_PATH = "../data/external/GoogleNews-vectors-negative300.bin.gz"
MAX_SEQUENCE_LENGTH = 300
VOCAB_SIZE = 20000
NUM_LSTM_UNITS = 128
DROPOUT_RATE = 0.2
LEARNING_RATE = 0.004
NUM_CLASSES = 2

In [9]:
# prepare tokenizer
y = train[LABEL_COL].astype(int).to_numpy()

tok = Tokenizer(num_words=VOCAB_SIZE, oov_token="<unk>", lower=False) # note: GoogleNews vectors are CASE-SENSITIVE → lower=False

tok.fit_on_texts(pd.concat([train[TEXT_COL_A].astype(str),
                            train[TEXT_COL_B].astype(str)], ignore_index=True))

In [10]:
def to_seq(series):
    return pad_sequences(
        tok.texts_to_sequences(series.astype(str)),
        maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post"
    )

In [11]:
X_a = to_seq(train[TEXT_COL_A])
X_b = to_seq(train[TEXT_COL_B])
Xte_a = to_seq(test[TEXT_COL_A]) if TEXT_COL_A in test else None
Xte_b = to_seq(test[TEXT_COL_B]) if TEXT_COL_B in test else None

In [12]:
# embed matrix (Word2Vec GoogleNews)
w2v = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
word_index = tok.word_index
vocab_size = min(VOCAB_SIZE, len(word_index) + 1)

emb_matrix = np.random.normal(0, 0.1, size=(vocab_size, EMBEDDING_DIM)).astype("float32")
emb_matrix[0] = 0.0  # pad idx 0

for w, idx in word_index.items():
    if idx >= vocab_size: 
        continue
    if w in w2v:
        emb_matrix[idx] = w2v[w]

In [13]:
def exponent_neg_manhattan_distance(inputs):
    x, y = inputs
    return K.exp(-K.sum(K.abs(x - y), axis=1, keepdims=True))

In [14]:
inp_a = Input(shape=(MAX_SEQUENCE_LENGTH,), name="sentence_A")
inp_b = Input(shape=(MAX_SEQUENCE_LENGTH,), name="sentence_B")

In [15]:
shared_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[emb_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    mask_zero=True,
    trainable=False,                 # freeze first (as in many tutorials)
    name="shared_embedding"
)



In [16]:
emb_a = shared_embedding(inp_a)
emb_b = shared_embedding(inp_b)

shared_lstm = LSTM(NUM_LSTM_UNITS, name="shared_lstm")
vec_a = shared_lstm(emb_a)
vec_b = shared_lstm(emb_b)

vec_a = Dropout(DROPOUT_RATE)(vec_a)
vec_b = Dropout(DROPOUT_RATE)(vec_b)

similarity = Lambda(exponent_neg_manhattan_distance, name="sim")([vec_a, vec_b])

In [17]:
model = Model([inp_a, inp_b], similarity)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss="binary_crossentropy",
    metrics=["AUC"]
)
model.summary()

In [18]:
# train and unfreeze 
history = model.fit(
    [X_a, X_b], y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1
)

# optional: fine-tune embeddings lightly
model.get_layer("shared_embedding").trainable = True
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss="binary_crossentropy", metrics=["AUC"])
model.fit([X_a, X_b], y, batch_size=BATCH_SIZE, epochs=2, validation_split=0.1, verbose=1)

Epoch 1/10
[1m6457/6457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3901s[0m 604ms/step - AUC: 0.6842 - loss: 0.5345 - val_AUC: 0.7609 - val_loss: 0.4899
Epoch 2/10
[1m6457/6457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4751s[0m 736ms/step - AUC: 0.7574 - loss: 0.4912 - val_AUC: 0.7708 - val_loss: 0.4861
Epoch 3/10
[1m6457/6457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3813s[0m 591ms/step - AUC: 0.7682 - loss: 0.4827 - val_AUC: 0.7732 - val_loss: 0.4802
Epoch 4/10
[1m6457/6457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3828s[0m 593ms/step - AUC: 0.7768 - loss: 0.4755 - val_AUC: 0.7767 - val_loss: 0.4762
Epoch 5/10
[1m6457/6457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22672s[0m 4s/step - AUC: 0.7861 - loss: 0.4680 - val_AUC: 0.7746 - val_loss: 0.4786
Epoch 6/10
[1m6457/6457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8716s[0m 1s/step - AUC: 0.7937 - loss: 0.4618 - val_AUC: 0.7742 - val_loss: 0.4802
Epoch 7/10
[1m6457/6457[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x84f7292a0>

In [None]:
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, precision_recall_curve
)

In [None]:
def eval_bin(model, Xa, Xb, y_true, threshold=0.5):
    """Return common binary metrics. y_true can be shape (n,) or (n,1)."""
    y_true = np.asarray(y_true).reshape(-1).astype(int)
    probs = model.predict([Xa, Xb], verbose=0).ravel()
    preds = (probs >= threshold).astype(int)
    return {
        "accuracy":  accuracy_score(y_true, preds),
        "f1":        f1_score(y_true, preds),
        "precision": precision_score(y_true, preds),
        "recall":    recall_score(y_true, preds),
        "auc":       roc_auc_score(y_true, probs),
        "cm":        confusion_matrix(y_true, preds)   # [[TN, FP], [FN, TP]]
    }, probs

n = len(y)
val_n = int(0.1 * n)  # same split Keras used (last 10% as validation)
Xa_train, Xb_train, y_train = X_a[:-val_n], X_b[:-val_n], y[:-val_n]
Xa_val,   Xb_val,   y_val   = X_a[-val_n:], X_b[-val_n:], y[-val_n:]

# evaluate at default threshold 0.5
train_metrics, _ = eval_bin(model, Xa_train, Xb_train, y_train, threshold=0.5)
val_metrics,   p_val = eval_bin(model, Xa_val,   Xb_val,   y_val,   threshold=0.5)

print("Train metrics @0.5:", {k:v for k,v in train_metrics.items() if k!='cm'})
print("Train CM:\n", train_metrics["cm"])
print("Val   metrics @0.5:", {k:v for k,v in val_metrics.items() if k!='cm'})
print("Val   CM:\n", val_metrics["cm"])

prec, rec, thr = precision_recall_curve(np.asarray(y_val).reshape(-1).astype(int), p_val)
f1s = 2 * prec * rec / (prec + rec + 1e-8)
best_idx = np.nanargmax(f1s)
best_threshold = thr[max(best_idx-1, 0)] if best_idx < len(thr) else 0.5  # map PR points to thresholds

print("Best F1 on val:", f1s[best_idx])
print("Best threshold:", best_threshold)

# re-evaluate at the best threshold
val_metrics_best, _ = eval_bin(model, Xa_val, Xb_val, y_val, threshold=best_threshold)
print("Val metrics @best-threshold:", {k:v for k,v in val_metrics_best.items() if k!='cm'})
print("Val CM @best-threshold:\n", val_metrics_best["cm"])

Train metrics @0.5: {'accuracy': 0.8454038120867738, 'f1': 0.656304306889409, 'precision': 0.7908713692946058, 'recall': 0.5608717642190445, 'auc': 0.9008652070482008}
Train CM:
 [[288339  16128]
 [ 47753  60992]]
Val   metrics @0.5: {'accuracy': 0.7699729918104199, 'f1': 0.4792663083674375, 'precision': 0.5957342485903407, 'recall': 0.40089086859688194, 'auc': 0.7601139771464638}
Val   CM:
 [[30491  3298]
 [ 7263  4860]]
Best F1 on val: 0.5461185857486093
Best threshold: 0.27818432
Val metrics @best-threshold: {'accuracy': 0.7238848231399199, 'f1': 0.5460990368434244, 'precision': 0.48247500949006705, 'recall': 0.6290522147983173, 'auc': 0.7601139771464638}
Val CM @best-threshold:
 [[25609  8180]
 [ 4497  7626]]
