In [None]:
!pip install faiss-cpu



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from retrieval import ReviewRetrieval

In [None]:
# Read the NEG_LONG export, keep only the review / response text columns,
# and discard every row in which either of the two is missing.
raw_pairs = pd.read_csv("neg_long.csv")
df = raw_pairs.loc[:, ["cleaned_review2", "cleaned_response2"]].dropna()

In [None]:
# ============================================================
# TRAIN / VAL / TEST SPLIT  (70 / 15 / 15)
# ============================================================
# Hold out 30% first, then cut that hold-out in half to get val and test.
# A fixed random_state on both calls keeps the split reproducible.
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)

# The splits come back as row-selections of `df`. Later cells add columns to
# each of them, which on such selections can raise pandas'
# SettingWithCopyWarning; explicit copies make every split an independent frame.
train_df = train_df.copy()
val_df = val_df.copy()
test_df = test_df.copy()

print("Train:", len(train_df))
print("Val:", len(val_df))
print("Test:", len(test_df))

Train: 682
Val: 146
Test: 147


In [None]:
# ============================================================
# BUILD RETRIEVER **ONLY ON TRAIN**
# ============================================================
# The index is built from train_df alone so that nothing from the validation
# or test rows can be retrieved as context.
retriever = ReviewRetrieval(
    df=train_df,                     # ← IMPORTANT: train rows only
    embed_col="cleaned_review2",     # column passed as the text to embed
    reply_col="cleaned_response2",   # column passed as the reply text
    model_name="sentence-transformers/all-mpnet-base-v2",
    # Request the GPU only when one is actually visible, mirroring the
    # `device` selection used for the T5 model later in this notebook.
    # NOTE(review): presumably use_gpu=False makes ReviewRetrieval encode on
    # CPU — confirm against retrieval.py.
    use_gpu=torch.cuda.is_available()
)

Loading model: sentence-transformers/all-mpnet-base-v2 ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Using GPU for embeddings.
Encoding all reviews into embeddings...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Building FAISS index...
FAISS index built. Number of items: 682


In [None]:
# ============================================================
# ATTACH RETRIEVED CONTEXT TO EVERY SPLIT
# ============================================================
def add_retrieval_context(row, top_k=3, exclude_self=False):
    """Return the retrieved texts for one row, joined by single spaces.

    Parameters
    ----------
    row : mapping / pandas row
        Must provide "cleaned_review2" (used as the query). When
        ``exclude_self`` is true it must also provide "cleaned_response2".
    top_k : int, default 3
        Number of retrieved texts to keep.
    exclude_self : bool, default False
        Set this for rows that are themselves part of the retriever's index
        (the training rows). One extra candidate is fetched and the first
        candidate that equals the row's own review or own response is
        removed, so the row's own entry is not fed back to the model as
        "context" (target leakage).

    Notes
    -----
    Assumes ``retriever.retrieve`` returns a sequence of strings, which is
    how its result is used (``" ".join``) elsewhere in this notebook.
    """
    query = row["cleaned_review2"]

    if not exclude_self:
        return " ".join(retriever.retrieve(query, top_k=top_k))

    # Over-fetch by one so that top_k items remain after dropping the self-hit.
    candidates = list(retriever.retrieve(query, top_k=top_k + 1))
    own_texts = (query, row["cleaned_response2"])
    for position, text in enumerate(candidates):
        if text in own_texts:
            del candidates[position]
            break
    return " ".join(candidates[:top_k])


print("Adding retrieval context to train / val / test ...")

# Train rows are in the index, so their own entry must be filtered out.
train_df = train_df.assign(
    retrieved_context=train_df.apply(add_retrieval_context, axis=1, exclude_self=True)
)

# Val / test rows are not in the index (it was built from train_df only), so
# plain retrieval is leak-free. It also gives evaluation inputs the same shape
# as the inputs built at inference time, where retrieval is always applied.
val_df = val_df.assign(
    retrieved_context=val_df.apply(add_retrieval_context, axis=1)
)
test_df = test_df.assign(
    retrieved_context=test_df.apply(add_retrieval_context, axis=1)
)

Adding retrieval to TRAIN only...


In [None]:
# ---------------------------------------------------
# BUILD MODEL INPUT + TARGET
# ---------------------------------------------------
# Every split gets the same prompt layout:
#   "sentiment: negative_long | review: <review> retrieved: <context>"
# and uses the cleaned response as the generation target.
for split_frame in (train_df, val_df, test_df):
    review_part = "sentiment: negative_long | review: " + split_frame["cleaned_review2"]
    context_part = " retrieved: " + split_frame["retrieved_context"]
    split_frame["model_input"] = review_part + context_part
    split_frame["model_target"] = split_frame["cleaned_response2"]

In [None]:
# ---------------------------------------------------
# MODEL = T5-BASE
# ---------------------------------------------------
# Token budgets applied when encoding prompts and target replies.
max_input = 512
max_target = 256

# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and seq2seq weights come from the same checkpoint; the model is
# moved to the chosen device right after loading.
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# ---------------------------------------------------
# DATASET
# ---------------------------------------------------
class ReplyDataset(Dataset):
    """Tokenised (prompt, reply) pairs for seq2seq fine-tuning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "model_input" and "model_target".
    tokenizer : callable, optional
        Tokenizer to use. When omitted, the notebook-level ``tokenizer`` is
        looked up each time an item is fetched.
    max_input_len, max_target_len : int, optional
        Padding / truncation lengths. When omitted, the notebook-level
        ``max_input`` / ``max_target`` are used.

    Each item is a dict with 1-D tensors "input_ids", "attention_mask" and
    "labels". Padded label positions are set to -100 so the loss skips them.
    """

    # Label value that the Hugging Face seq2seq loss ignores.
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer=None, max_input_len=None, max_target_len=None):
        self.inputs = df["model_input"].tolist()
        self.targets = df["model_target"].tolist()
        self._tokenizer = tokenizer
        self._max_input_len = max_input_len
        self._max_target_len = max_target_len

    def __len__(self):
        return len(self.inputs)

    def _encode(self, text, max_length):
        """Tokenise one string, padded / truncated to exactly ``max_length``."""
        # `tokenizer` here is the module-level name; it is only touched when
        # no tokenizer was handed to the constructor.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        return tok(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        src_len = self._max_input_len if self._max_input_len is not None else max_input
        tgt_len = self._max_target_len if self._max_target_len is not None else max_target

        src_enc = self._encode(self.inputs[idx], src_len)
        tgt_enc = self._encode(self.targets[idx], tgt_len)

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence axis.
        labels = tgt_enc["input_ids"].squeeze(0)

        # Padding must not contribute to the loss: mask every position the
        # tokenizer marked as padding in the target's attention mask.
        is_padding = tgt_enc["attention_mask"].squeeze(0) == 0
        labels = labels.masked_fill(is_padding, self.IGNORE_INDEX)

        return {
            "input_ids": src_enc["input_ids"].squeeze(0),
            "attention_mask": src_enc["attention_mask"].squeeze(0),
            "labels": labels
        }

In [None]:
# ---------------------------------------------------
# LOADERS
# ---------------------------------------------------
def _make_loader(frame, shuffle):
    """Wrap one split in a ReplyDataset and batch it four examples at a time."""
    return DataLoader(ReplyDataset(frame), batch_size=4, shuffle=shuffle)


# Only the training batches are shuffled; val / test keep their row order.
train_loader = _make_loader(train_df, shuffle=True)
val_loader = _make_loader(val_df, shuffle=False)
test_loader = _make_loader(test_df, shuffle=False)

In [None]:
# ---------------------------------------------------
# TRAINING LOOP
# ---------------------------------------------------
# Seed torch so that batch shuffling and dropout repeat from run to run
# (42 matches the random_state used for the data split).
torch.manual_seed(42)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
epochs = 3


def _to_device(batch):
    """Move the three tensors the model consumes onto `device`."""
    return {
        key: batch[key].to(device)
        for key in ("input_ids", "attention_mask", "labels")
    }


def _mean_loss(loader):
    """Return the average per-batch loss over `loader`, without gradients.

    Switches the model to eval mode and leaves it there; the caller is
    responsible for calling model.train() before training continues.
    """
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            running_loss += model(**_to_device(batch)).loss.item()
    # max(..., 1) avoids a ZeroDivisionError on an empty loader.
    return running_loss / max(len(loader), 1)


for epoch in range(epochs):
    print(f"\n===== EPOCH {epoch+1}/{epochs} =====")

    # (Re-)enable dropout at the start of every epoch; validation below
    # switches it off again.
    model.train()
    total_train_loss = 0.0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(**_to_device(batch))

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    print(f"Training Loss: {total_train_loss / max(len(train_loader), 1):.4f}")

    # VALIDATION — the model stays in eval mode after the final epoch, so the
    # following cells can evaluate / generate without dropout.
    print(f"Validation Loss: {_mean_loss(val_loader):.4f}")


===== EPOCH 1/3 =====


100%|██████████| 171/171 [02:38<00:00,  1.08it/s]


Training Loss: 1.4475
Validation Loss: 2.2591

===== EPOCH 2/3 =====


100%|██████████| 171/171 [02:37<00:00,  1.09it/s]


Training Loss: 0.1613
Validation Loss: 2.1306

===== EPOCH 3/3 =====


100%|██████████| 171/171 [02:36<00:00,  1.09it/s]


Training Loss: 0.1068
Validation Loss: 2.0734


In [None]:
# Append this notebook's test split to the shared test_data.csv.
#
# Re-running this cell must not duplicate rows, and the very first run (no
# file yet) must not crash, so:
#   * a missing file is treated as "nothing stored yet";
#   * only rows whose (review, response) pair is not already stored are added.
TEST_CSV_PATH = "/content/test_data.csv"
KEY_COLUMNS = ["cleaned_review2", "cleaned_response2"]

try:
    existing_test_df = pd.read_csv(TEST_CSV_PATH)
except FileNotFoundError:
    existing_test_df = None
    print("No existing test_data.csv found; a new file will be created.")

if existing_test_df is None:
    new_rows = test_df
    updated_test_df = test_df.reset_index(drop=True)
else:
    if set(KEY_COLUMNS) <= set(existing_test_df.columns):
        # Existing rows are never removed; only the incoming rows are filtered.
        stored_pairs = set(
            existing_test_df[KEY_COLUMNS].itertuples(index=False, name=None)
        )
        is_new = [
            pair not in stored_pairs
            for pair in test_df[KEY_COLUMNS].itertuples(index=False, name=None)
        ]
        new_rows = test_df[is_new]
    else:
        # The stored file has a different schema: fall back to a plain append.
        new_rows = test_df

    if len(new_rows) == 0:
        updated_test_df = existing_test_df.reset_index(drop=True)
    else:
        updated_test_df = pd.concat(
            [existing_test_df, new_rows], axis=0
        ).reset_index(drop=True)

# Save back into same file
updated_test_df.to_csv(TEST_CSV_PATH, index=False)

print("Appended new test_df into test_data.csv successfully!")
print("Rows added this run:", len(new_rows))
print("Updated test size:", len(updated_test_df))

In [None]:
# ---------------------------------------------------
# TEST LOSS
# ---------------------------------------------------
# Evaluate with dropout off and without building autograd graphs.
model.eval()
batch_losses = []

with torch.no_grad():
    for batch in test_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        gold = batch["labels"].to(device)
        outputs = model(input_ids=ids, attention_mask=mask, labels=gold)
        batch_losses.append(outputs.loss.item())

# Sum of per-batch losses; the reported figure is their mean over batches.
total_test_loss = sum(batch_losses, 0.0)

print("\nFinal Test Loss:", total_test_loss / len(test_loader))


Final Test Loss: 2.7870


In [None]:
# ============================================================
# SAVE MODEL
# ============================================================
# Model weights/config and tokenizer files go into the same directory so the
# pair can be reloaded together with from_pretrained(save_path).
save_path = "neg_long_generator_retrieval_new"

model.save_pretrained(save_directory=save_path)
# Kept as the cell's last expression so the written tokenizer files are shown.
tokenizer.save_pretrained(save_directory=save_path)

('neg_long_generator_retrieval_new/tokenizer_config.json',
 'neg_long_generator_retrieval_new/special_tokens_map.json',
 'neg_long_generator_retrieval_new/spiece.model',
 'neg_long_generator_retrieval_new/added_tokens.json',
 'neg_long_generator_retrieval_new/tokenizer.json')

In [None]:
# ---------------------------------------------------
# INFERENCE
# ---------------------------------------------------
def generate_neg_long_reply(review_text, top_k=3, max_len=250):
    """Generate a reply to one negative (long) review.

    Parameters
    ----------
    review_text : str
        The review to answer.
    top_k : int, default 3
        Number of texts fetched from ``retriever`` and appended as context.
    max_len : int, default 250
        ``max_length`` passed to ``model.generate``.

    Returns
    -------
    str
        The decoded first beam-search output, special tokens removed.

    Uses the notebook-level ``retriever``, ``tokenizer``, ``model``,
    ``device`` and ``max_input``.
    """
    # Generation must run with dropout disabled. Without this, calling the
    # function while the model is still in training mode gives noisy,
    # non-repeatable replies.
    model.eval()

    retrieved = retriever.retrieve(review_text, top_k=top_k)
    context = " ".join(retrieved)

    # Same prompt layout as the "model_input" column used for training.
    final_input = (
        "sentiment: negative_long | review: " +
        review_text + " retrieved: " + context
    )

    # Truncate to the same input budget the training examples were encoded with.
    inputs = tokenizer(
        final_input,
        return_tensors="pt",
        truncation=True,
        max_length=max_input
    ).to(device)

    output = model.generate(
        **inputs,
        max_length=max_len,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# ============================================================
# TEST ONE SAMPLE
# ============================================================
sample_review = "bad accommodation and not good food at all..very bad service"
# The inference helper defined in this notebook is generate_neg_long_reply;
# the previously used name generate_reply is not defined anywhere here and
# raises NameError on a fresh kernel.
print("\nGenerated Reply:\n", generate_neg_long_reply(sample_review))


Generated Reply:
 Dear Sir, Thanks for your valuable feedback for our HOTEL_NAME>. We are sorry to hear that you did not enjoy your stay with us. However we try to improve Our services. We would welcome you once again in near future. Thanks and


In [None]:
# COPY TO DRIVE
# Recursively copies the saved model directory into Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook, so confirm it is mounted before running.
!cp -r /content/neg_long_generator_retrieval_new "/content/drive/My Drive/"