In [None]:
!pip install faiss-cpu



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import  AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from retrieval import ReviewRetrieval
import pandas as pd

In [None]:
# Read the POS_LONG export, keep only the review/response text columns,
# and discard any row where either of the two is missing.
df = (
    pd.read_csv("pos_long.csv")
    .loc[:, ["cleaned_review2", "cleaned_response2"]]
    .dropna()
)

In [None]:
# Build the retrieval helper over the full cleaned frame.
# NOTE(review): this runs before the train/validation/test split further down,
# so the retriever is constructed from rows that later land in every split.
# presumably `embed_col` is the text that gets embedded and `reply_col` the text
# handed back by `retrieve()` — confirm against the `retrieval` module.
retriever = ReviewRetrieval(
    df=df,
    embed_col="cleaned_review2",
    reply_col="cleaned_response2",
    model_name="sentence-transformers/all-mpnet-base-v2",
    use_gpu=True  # requested; the recorded run below logged a fallback to CPU
)

Loading model: sentence-transformers/all-mpnet-base-v2 ...
GPU not available. Falling back to CPU.
Encoding all reviews into embeddings...


Batches:   0%|          | 0/127 [00:00<?, ?it/s]

Building FAISS index...
FAISS index built. Number of items: 4049


In [None]:
def add_retrieval_context(row, top_k=3, retrieval_index=None):
    """Build the retrieval context string for one row of the review frame.

    The retriever is constructed from the same frame this function is applied
    to, so a plain lookup can hand back the row's *own* reply — which is also
    the training target. To keep the target out of the model input, one extra
    hit is requested and any reply identical to the row's own reply is dropped
    before the first ``top_k`` survivors are joined.

    Args:
        row: Mapping-like row (pandas Series or dict) with a
            ``"cleaned_review2"`` entry and, optionally, a
            ``"cleaned_response2"`` entry.
        top_k: Maximum number of retrieved replies to keep in the context.
        retrieval_index: Object exposing ``retrieve(query, top_k=...)``.
            Defaults to the module-level ``retriever``.

    Returns:
        The kept replies joined by single spaces (may contain fewer than
        ``top_k`` replies if several hits equal the row's own reply).
    """
    if retrieval_index is None:
        retrieval_index = retriever

    query = row["cleaned_review2"]
    # ``.get`` works for both Series and dict; None means "nothing to exclude".
    own_reply = row.get("cleaned_response2")

    # Over-fetch by one so that removing the row's own reply still leaves
    # up to ``top_k`` candidates.
    # assumes retrieve() returns replies ordered best-first — TODO confirm
    candidates = retrieval_index.retrieve(query, top_k=top_k + 1)
    kept = [reply for reply in candidates if reply != own_reply][:top_k]
    return " ".join(kept)

# Compute the retrieval context once per row (row-wise apply) and store it
# as a new column next to the review and its reference reply.
df["retrieved_context"] = df.apply(add_retrieval_context, axis="columns")

# Quick visual check of review / context / reply side by side.
df.loc[:, ["cleaned_review2", "retrieved_context", "cleaned_response2"]].head()

In [None]:
# Assemble the seq2seq source text from two labelled segments:
# the guest review followed by the retrieved replies.
review_part = "review: " + df["cleaned_review2"]
context_part = " retrieved: " + df["retrieved_context"]
df["model_input"] = review_part + context_part

# The reference reply is the generation target.
df["model_target"] = df["cleaned_response2"]

df.loc[:, ["model_input", "model_target"]].head()

KeyError: 'retrieved_context'

In [None]:
# Stage 1: keep 70% for training, park the remaining 30% in a holdout frame.
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)

# Stage 2: cut the holdout in half -> 15% validation, 15% test overall.
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)

# Report how many rows ended up in each split.
for caption, part in (
    ("Train size:", train_df),
    ("Validation size:", val_df),
    ("Test size:", test_df),
):
    print(caption, len(part))

Train size: 2834
Validation size: 607
Test size: 608


In [None]:
model_name = "t5-small"

# Pick the accelerator first so the model can be moved as soon as it is loaded.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and seq2seq weights both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

print("Using device:", device)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Using device: cuda


In [None]:
max_input = 512
max_target = 128

class ReplyDataset(Dataset):
    """Torch dataset that tokenizes (model_input, model_target) text pairs.

    Each item is tokenized on access, padded/truncated to a fixed length, and
    returned as 1-D tensors. Padded positions in ``labels`` are set to -100 so
    that the model's loss skips them instead of being averaged over padding.

    Args:
        df: Frame with ``"model_input"`` and ``"model_target"`` columns.
        tokenizer: Optional tokenizer callable; when omitted the module-level
            ``tokenizer`` is looked up at item-access time.
        max_input_len: Optional source length; defaults to module-level
            ``max_input``.
        max_target_len: Optional target length; defaults to module-level
            ``max_target``.
    """

    def __init__(self, df, tokenizer=None, max_input_len=None, max_target_len=None):
        self.inputs = df["model_input"].tolist()
        self.targets = df["model_target"].tolist()
        self._tokenizer = tokenizer
        self._max_input_len = max_input_len
        self._max_target_len = max_target_len

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.inputs)

    def _encode(self, text, max_length):
        """Tokenize one string to a fixed-length, batch-of-one encoding."""
        # Resolved lazily so the dataset can be built before the global
        # tokenizer exists, matching the original access-time lookup.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        return tok(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and masked ``labels`` for one pair."""
        src_len = self._max_input_len if self._max_input_len is not None else max_input
        tgt_len = self._max_target_len if self._max_target_len is not None else max_target

        src_enc = self._encode(self.inputs[idx], src_len)
        tgt_enc = self._encode(self.targets[idx], tgt_len)

        # squeeze(0) drops only the batch-of-one dimension added by return_tensors="pt".
        labels = tgt_enc["input_ids"].squeeze(0).clone()
        # Mask padding via the attention mask (not by pad-id equality) so a
        # genuine token that happens to share the pad id is never masked.
        labels[tgt_enc["attention_mask"].squeeze(0) == 0] = -100

        return {
            "input_ids": src_enc["input_ids"].squeeze(0),
            "attention_mask": src_enc["attention_mask"].squeeze(0),
            "labels": labels
        }


In [None]:
# Wrap each split in a ReplyDataset (same order: train, validation, test).
train_dataset, val_dataset, test_dataset = (
    ReplyDataset(split_frame) for split_frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; evaluation loaders keep row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=4)
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=4)
    for eval_dataset in (val_dataset, test_dataset)
)

# Number of batches per loader.
tuple(len(loader) for loader in (train_loader, val_loader, test_loader))

(709, 152, 152)

In [None]:
# AdamW over every model parameter with a fixed learning rate of 3e-5;
# no learning-rate scheduler is set up in this notebook.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [None]:
epochs = 3
model.train()  # start in training mode; re-entered at the end of every epoch

for epoch in range(epochs):
    print(f"\n===== EPOCH {epoch+1}/{epochs} =====")

    # ---- optimisation pass over the training split ----
    total_train_loss = 0.0

    for batch in tqdm(train_loader):
        # Move exactly the three tensors the model consumes onto the device.
        model_inputs = {
            name: batch[name].to(device)
            for name in ("input_ids", "attention_mask", "labels")
        }

        optimizer.zero_grad()

        loss = model(**model_inputs).loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Training Loss: {avg_train_loss:.4f}")

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            model_inputs = {
                name: batch[name].to(device)
                for name in ("input_ids", "attention_mask", "labels")
            }
            total_val_loss += model(**model_inputs).loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Back to training mode for the next epoch.
    model.train()


===== EPOCH 1/3 =====


100%|██████████| 709/709 [02:18<00:00,  5.13it/s]


Training Loss: 0.4903
Validation Loss: 0.1332

===== EPOCH 2/3 =====


100%|██████████| 709/709 [02:26<00:00,  4.83it/s]


Training Loss: 0.1171
Validation Loss: 0.1110

===== EPOCH 3/3 =====


100%|██████████| 709/709 [02:19<00:00,  5.08it/s]


Training Loss: 0.0859
Validation Loss: 0.0998


In [None]:
# Held-out evaluation: eval mode, no gradients, mean of per-batch losses.
model.eval()
total_test_loss = 0.0

with torch.no_grad():
    for batch in test_loader:
        model_inputs = {
            name: batch[name].to(device)
            for name in ("input_ids", "attention_mask", "labels")
        }
        total_test_loss += model(**model_inputs).loss.item()

avg_test_loss = total_test_loss / len(test_loader)
print(f"\nFinal Test Loss: {avg_test_loss:.4f}")


Final Test Loss: 0.0735


In [None]:
save_path = "pos_long_generator_retrieval"

# Persist the fine-tuned weights first, then the tokenizer, into one directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print("POS_LONG retrieval-augmented generator saved at:", save_path)

POS_LONG retrieval-augmented generator saved at: pos_long_generator_retrieval


In [None]:
def generate_reply(review_text, top_k=3, max_len=150):
    """Generate a reply for a new review using retrieval-augmented input.

    The input is built in the same ``"review: ... retrieved: ..."`` format used
    for training, truncated to ``max_input`` tokens, and decoded with 5-beam
    search.

    The model is switched to eval mode for the duration of generation, because
    the training cell leaves it in train mode and dropout would otherwise stay
    active. The previous mode is restored afterwards, so the call does not
    change the model's state.

    Args:
        review_text: Raw review string to answer.
        top_k: Number of replies requested from the retriever for context.
        max_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded reply string with special tokens removed.
    """
    # 1) Retrieval context for this new review.
    retrieved_replies = retriever.retrieve(review_text, top_k=top_k)
    context = " ".join(retrieved_replies)

    # 2) Same input layout as the training data.
    final_input = "review: " + review_text + " retrieved: " + context

    # 3) Encode, then generate with dropout disabled.
    inputs = tokenizer(final_input, return_tensors="pt", truncation=True, max_length=max_input).to(device)

    was_training = model.training
    model.eval()
    try:
        output = model.generate(
            **inputs,
            max_length=max_len,
            num_beams=5,
            early_stopping=True
        )
    finally:
        # Leave the model in whatever mode the caller had it in.
        if was_training:
            model.train()

    reply = tokenizer.decode(output[0], skip_special_tokens=True)
    return reply

In [None]:
# Smoke test: run one short review through the full retrieve-and-generate path.
sample_review = "Exclusively for good accommodation "
generated = generate_reply(sample_review)

# One print call; the newline separator reproduces the original line layout.
print("REVIEW:", sample_review, "\nGENERATED REPLY:", generated, sep="\n")


REVIEW:
Exclusively for good accommodation 

GENERATED REPLY:
Dear Guest, Thank you for selecting HOTEL NAME, LOCATION> as your accommodation choice and for taking the time to share your compliments with us. It is heartening to note that you had an overall pleasant stay and appreciated the hotel and its facilities. Your feedback is most rewarding as it is our endeavor to ensure that our guests have memorable experiences each time they stay with us. We look forward to welcoming you again soon and hope that HOTEL_NAME> remains your preferred hotel on all your future visits to LOCATION>


In [None]:
# Copy the saved model directory into Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before relying on this cell.
!cp -r /content/pos_long_generator_retrieval "/content/drive/My Drive/"