In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [None]:
# Load existing test_data.csv
existing_test_df = pd.read_csv("/content/test_data.csv")

# Append new test_df into existing file
updated_test_df = pd.concat([existing_test_df, test_df], axis=0).reset_index(drop=True)

# Save back into same file
updated_test_df.to_csv("/content/test_data.csv", index=False)

print("Appended new test_df into test_data.csv successfully!")
print("Updated test size:", len(updated_test_df))

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from retrieval import ReviewRetrieval  # :contentReference[oaicite:0]{index=0}
import pandas as pd

In [None]:
# Load NEG_SHORT data
df = pd.read_csv("neg_short.csv")
df = df[["cleaned_review2", "cleaned_response2"]].dropna()

In [None]:
# ------# -------------------------
# 2. FIRST: Split the data
# -------------------------
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)

print("Train:", len(train_df))
print("Val:", len(val_df))
print("Test:", len(test_df))

Train: 368
Val: 79
Test: 79


In [None]:
# -------------------------
# 3. Retriever built on TRAIN only
# -------------------------
retriever = ReviewRetrieval(
    df=train_df,
    embed_col="cleaned_review2",
    reply_col="cleaned_response2",
    model_name="sentence-transformers/all-mpnet-base-v2",
    use_gpu=True
)

Loading model: sentence-transformers/all-mpnet-base-v2 ...
Using GPU for embeddings.
Encoding all reviews into embeddings...


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Building FAISS index...
FAISS index built. Number of items: 368


In [None]:
# -------------------------
# 4. Retrieval ONLY for train_df
# -------------------------
def add_retrieval_context(row):
    query = row["cleaned_review2"]
    retrieved = retriever.retrieve(query, top_k=3)
    return " ".join(retrieved)

print("Adding retrieval to TRAIN only...")
train_df["retrieved_context"] = train_df.apply(add_retrieval_context, axis=1)

Adding retrieval to TRAIN only...


In [None]:
# -------------------------
# 5. For val_df & test_df → NO retrieval
# -------------------------
val_df["retrieved_context"] = ""
test_df["retrieved_context"] = ""

In [None]:
# -------------------------------
# BUILD MODEL INPUT AND TARGET
# -------------------------------
for split in [train_df, val_df, test_df]:
    split["model_input"] = (
        "review: " + split["cleaned_review2"] +
        " retrieved: " + split["retrieved_context"]
    )
    split["model_target"] = split["cleaned_response2"]

In [None]:
test_df.to_csv("/content/test_data_new.csv", index=False)
print("Test data saved as test_data_new.csv")

Test data saved as test_data_new.csv


In [None]:
model_name = "t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Using device:", device)

Using device: cuda


In [None]:
max_input = 512
max_target = 64

class ReplyDataset(Dataset):
    def __init__(self, df):
        self.inputs = df["model_input"].tolist()
        self.targets = df["model_target"].tolist()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        src = self.inputs[idx]
        tgt = self.targets[idx]

        src_enc = tokenizer(
            src,
            max_length=max_input,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        tgt_enc = tokenizer(
            tgt,
            max_length=max_target,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        return {
            "input_ids": src_enc["input_ids"].squeeze(),
            "attention_mask": src_enc["attention_mask"].squeeze(),
            "labels": tgt_enc["input_ids"].squeeze()
        }


In [None]:
# -------------------------------
# LOADERS
# -------------------------------
train_dataset = ReplyDataset(train_df)
val_dataset   = ReplyDataset(val_df)
test_dataset  = ReplyDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [None]:
# -------------------------------
# TRAINING LOOP
# -------------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

epochs = 3
model.train()

for epoch in range(epochs):
    print(f"\n===== EPOCH {epoch+1}/{epochs} =====")

    total_train_loss = 0.0

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    print(f"Training Loss: {total_train_loss/len(train_loader):.4f}")

    # VALIDATION
    model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device)
            )
            total_val_loss += outputs.loss.item()

    print(f"Validation Loss: {total_val_loss/len(val_loader):.4f}")
    model.train()


===== EPOCH 1/3 =====


100%|██████████| 92/92 [00:53<00:00,  1.72it/s]


Training Loss: 0.6968
Validation Loss: 3.4912

===== EPOCH 2/3 =====


100%|██████████| 92/92 [00:54<00:00,  1.70it/s]


Training Loss: 0.2044
Validation Loss: 3.2353

===== EPOCH 3/3 =====


100%|██████████| 92/92 [00:55<00:00,  1.66it/s]


Training Loss: 0.1353
Validation Loss: 3.1387


In [None]:
# -------------------------------
# TEST LOSS
# -------------------------------
model.eval()
test_loss = 0.0

with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device)
        )
        test_loss += outputs.loss.item()

print("\nFinal Test Loss:", test_loss/len(test_loader))


Final Test Loss: 3.2434432983398436


In [None]:
save_path = "neg_short_generator_retrieval_new"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("NEG_SHORT retrieval generator saved at:", save_path)

NEG_SHORT retrieval generator saved at: neg_short_generator_retrieval_new


In [None]:
# -------------------------------
# INFERENCE
# -------------------------------
def generate_reply(review_text, top_k=3, max_len=150):
    retrieved = retriever.retrieve(review_text, top_k=top_k)
    context = " ".join(retrieved)

    final_input = "review: " + review_text + " retrieved: " + context

    inputs = tokenizer(final_input, return_tensors="pt", truncation=True, max_length=max_input).to(device)
    output = model.generate(
        **inputs,
        max_length=max_len,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
sample_review = "bad accommodation "
generated = generate_reply(sample_review)
print("REVIEW:")
print(sample_review)
print("\nGENERATED REPLY:")
print(generated)


REVIEW:
bad accommodation 

GENERATED REPLY:
Dear Guest, Thank you for choosing to stay with us at HOTEL NAME Jaipur and taking your precious time to rate us on Trip Advisor. We regret the fact the cleanliness didn't quiet stand up to your expectations. Also the fact you were not able to use the changing room near the swimming pool


In [None]:
!cp -r /content/neg_short_generator_retrieval_new "/content/drive/My Drive/"