In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Project root for all data/result files. Set the LLM_RECSYS_ROOT environment
# variable to run the notebook on another machine; when it is unset (or empty)
# the original absolute location is used, so existing runs are unaffected.
_DEFAULT_PROJECT_ROOT = "/Users/animchnlap314/Documents/llm_recsys_dissertation"
PROJECT_ROOT = (os.environ.get("LLM_RECSYS_ROOT") or _DEFAULT_PROJECT_ROOT).rstrip("/")

# Pre-processed review table read by the loading cell.
INPUT_PATH = f"{PROJECT_ROOT}/data/processed/movies_tv_processed.csv"
# Location of the TF-IDF metrics CSV.
TFIDF_RESULTS_PATH = f"{PROJECT_ROOT}/data/results/04_tfidf_metrics.csv"


In [2]:
# Load the pre-processed review table.
# ASINs are identifiers, not numbers, so read them as strings: with dtype
# inference a purely numeric ASIN can be parsed as an integer, which drops
# leading zeros and can leave the column with mixed int/str keys.
# NOTE(review): numeric-looking ASINs are displayed without leading zeros
# further down; confirm whether they were already stripped when the CSV was written.
df = pd.read_csv(INPUT_PATH, dtype={"asin": str})

print("Loaded:", df.shape)
df.head(2)


Loaded: (200000, 13)


Unnamed: 0,reviewerID,asin,overall,reviewText_clean,summary_clean,review_length,word_count,review_datetime,review_year,review_month,review_day,review_dayofweek,review_week
0,A220IS03GFT82A,B00G7M190U,5.0,this is indeed one of the best movies of the y...,enjoyed every minute of it,620,114,2014-02-05,2014,2,5,2,6
1,A38IFD1PGGWSRA,B00076ON28,5.0,seven people shipwrecked on a tropical island ...,not gilligan is island,796,143,2009-06-01,2009,6,1,0,23


In [3]:
# Build one free-text field per review: the summary first, then the review body.
# Missing parts are treated as empty strings so the concatenation never yields NaN.
summary_part = df["summary_clean"].fillna("")
review_part = df["reviewText_clean"].fillna("")
combined_text = (summary_part + " " + review_part).str.strip()

# keep prompts compact: cap every text at its first 512 characters
df["item_text"] = combined_text.str[:512]

df[["asin", "item_text"]].head(2)


Unnamed: 0,asin,item_text
0,B00G7M190U,enjoyed every minute of it this is indeed one ...
1,B00076ON28,not gilligan is island seven people shipwrecke...


In [4]:
# Collapse the review-level table to one row per item (asin):
#   item_text   - text of the first review seen for the item
#   overall     - mean star rating
#   num_reviews - number of reviews (count of reviewerID values)
item_df = (
    df.groupby("asin")
    .agg(
        item_text=("item_text", "first"),
        overall=("overall", "mean"),
        num_reviews=("reviewerID", "count"),
    )
    .reset_index()
)

item_df.head(3)


Unnamed: 0,asin,item_text,overall,num_reviews
0,143588,i wish she would release more dvds these are i...,5.0,2
1,1526863,this was a staple in our house when the kids t...,5.0,1
2,5000009,interesting but only loosely scriptural engagi...,4.4,5


In [6]:
# Keep only items with at least 10 reviews. The original index labels are kept,
# so downstream code must address rows positionally (iloc).
has_enough_reviews = item_df["num_reviews"] >= 10
item_df = item_df.loc[has_enough_reviews].copy()
print(f"Items retained: {item_df.shape}")


Items retained: (4230, 4)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorise the item texts: vocabulary capped at 15,000 terms, English stop words removed.
vectorizer_options = {"max_features": 15000, "stop_words": "english"}
tfidf = TfidfVectorizer(**vectorizer_options)

# Row i of tfidf_matrix corresponds to row i (positional) of item_df.
tfidf_matrix = tfidf.fit_transform(item_df["item_text"])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


TF-IDF matrix shape: (4230, 13177)


In [8]:
def recommend_items_tfidf(item_index, top_k=5):
    """Return the items most similar to one seed item by TF-IDF cosine similarity.

    Reads the notebook-level ``tfidf_matrix`` and ``item_df``; row ``i`` of the
    matrix is taken to describe row ``i`` (positional) of ``item_df``.

    Parameters
    ----------
    item_index : int
        Positional row of the seed item (used with ``iloc``, not the index label).
    top_k : int, default 5
        Number of neighbours to return. Values below 1 yield an empty frame and
        values larger than the number of other items are capped, so the seed
        item itself is never returned.

    Returns
    -------
    pandas.DataFrame
        ``asin``, ``overall`` and ``num_reviews`` of the neighbours, most
        similar first.
    """
    query_vec = tfidf_matrix[item_index]
    sims = cosine_similarity(query_vec, tfidf_matrix).flatten()

    sims[item_index] = -1  # exclude self

    # Clamp the request: a plain ``[-top_k:]`` slice returns *every* item when
    # top_k == 0, and asking for all n items would include the masked seed.
    k = max(0, min(top_k, len(sims) - 1))
    # Descending similarity, then keep the first k positions.
    top_idx = sims.argsort()[::-1][:k]

    return item_df.iloc[top_idx][["asin", "overall", "num_reviews"]]


In [9]:
# Smoke test: five nearest neighbours of the first retained item.
recommend_items_tfidf(item_index=0, top_k=5)


Unnamed: 0,asin,overall,num_reviews
1573,6301967275,4.709677,31
1122,6301175239,4.866667,15
34693,B00MP2FHNG,4.25,16
20069,B001AQR3LC,4.684211,19
37248,B017JOME6M,4.818182,11


In [10]:
def build_llm_prompt(seed_item, num_recommendations=3):
    """Build the recommendation prompt for one seed item.

    Parameters
    ----------
    seed_item : mapping
        Anything indexable by the keys ``'item_text'``, ``'overall'`` and
        ``'num_reviews'`` (for example a row of ``item_df`` or a plain dict).
    num_recommendations : int, default 3
        How many similar items the prompt asks for. The default reproduces the
        original fixed wording; pass the pipeline's ``top_k`` to keep the
        prompt and the retrieval stage in agreement.

    Returns
    -------
    str
        The prompt text with surrounding whitespace stripped.
    """
    prompt_lines = [
        "You are a recommender system for movies and TV shows.",
        "",
        "A user liked the following item:",
        "Title/Description:",
        f"{seed_item['item_text']}",
        "",
        f"Average Rating: {seed_item['overall']}",
        f"Number of Reviews: {seed_item['num_reviews']}",
        "",
        f"Recommend {num_recommendations} similar items and explain WHY each is relevant.",
        "Focus on themes, tone, and audience preferences.",
    ]
    return "\n".join(prompt_lines).strip()


In [11]:
# Preview the prompt generated for the first retained item (first 800 characters only).
first_item = item_df.iloc[0]
sample_prompt = build_llm_prompt(first_item)
print(sample_prompt[0:800])


You are a recommender system for movies and TV shows.

A user liked the following item:
Title/Description:
an american christmas carol actor henry winkler among the best of this beloved christmas story we enjoy henry winkler is twist on scrooge and the story line he makes for a very believable scrooge both pre and post redemption

Average Rating: 4.7
Number of Reviews: 20

Recommend 3 similar items and explain WHY each is relevant.
Focus on themes, tone, and audience preferences.


In [12]:
def simulated_llm_response(recommended_items):
    """Stand-in for an LLM call: attach the same canned explanation to every item.

    Parameters
    ----------
    recommended_items : pandas.DataFrame
        Candidate items; only the ``asin`` column is read.

    Returns
    -------
    list of dict
        One ``{"asin": ..., "explanation": ...}`` entry per input row, in row order.
    """
    canned_explanation = (
        "This item shares similar themes and narrative style, "
        "appealing to audiences who enjoyed the original content."
    )
    return [
        {"asin": record["asin"], "explanation": canned_explanation}
        for _, record in recommended_items.iterrows()
    ]


In [13]:
def llm_enhanced_recommendation(seed_index, top_k=3):
    """Two-stage pipeline: TF-IDF retrieval followed by simulated LLM explanations.

    Parameters
    ----------
    seed_index : int
        Positional row of the seed item, passed straight to ``recommend_items_tfidf``.
    top_k : int, default 3
        Number of candidates to retrieve and explain.

    Returns
    -------
    pandas.DataFrame
        The retrieved candidates with an extra ``llm_explanation`` column.
    """
    # Stage 1: Retrieve candidates
    candidates = recommend_items_tfidf(seed_index, top_k=top_k)

    # Stage 2: Explain using LLM-style reasoning
    explanations = simulated_llm_response(candidates)

    # Work on a copy so the retrieval result itself is not modified.
    output = candidates.copy()
    output["llm_explanation"] = [entry["explanation"] for entry in explanations]

    return output


In [14]:
# Smoke test: top-3 explained recommendations for the first retained item.
llm_enhanced_recommendation(seed_index=0, top_k=3)


Unnamed: 0,asin,overall,num_reviews,llm_explanation
1573,6301967275,4.709677,31,This item shares similar themes and narrative ...
1122,6301175239,4.866667,15,This item shares similar themes and narrative ...
34693,B00MP2FHNG,4.25,16,This item shares similar themes and narrative ...


In [15]:
# Planned parameter-efficient fine-tuning setup. This is a specification only:
# nothing in this cell trains or loads a model.
peft_plan = dict(
    base_model="Llama-3.1-8B-Instruct (or Mistral-7B-Instruct)",
    task="Explainable recommendation + re-ranking",
    training_data="Pairs of (seed_item_text, candidate_item_texts) -> ranked_list + explanation",
    method="PEFT (LoRA)",
    # typical LoRA targets in transformer attention
    target_modules=["q_proj", "v_proj"],
    rank_r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    batch_size=8,
    epochs=1,
    max_seq_len=512,
    why_peft="Reduce compute cost; update only small adapter weights; prevents full model retraining",
)

peft_plan


{'base_model': 'Llama-3.1-8B-Instruct (or Mistral-7B-Instruct)',
 'task': 'Explainable recommendation + re-ranking',
 'training_data': 'Pairs of (seed_item_text, candidate_item_texts) -> ranked_list + explanation',
 'method': 'PEFT (LoRA)',
 'target_modules': ['q_proj', 'v_proj'],
 'rank_r': 8,
 'lora_alpha': 16,
 'lora_dropout': 0.05,
 'batch_size': 8,
 'epochs': 1,
 'max_seq_len': 512,
 'why_peft': 'Reduce compute cost; update only small adapter weights; prevents full model retraining'}

In [16]:
import re

def diversity_at_k(recs):
    """Share of distinct entries in a recommendation list.

    Returns ``len(set(recs)) / len(recs)``, or 0.0 for an empty list (the
    denominator is floored at 1 to avoid dividing by zero).
    """
    n_unique = len(set(recs))
    n_total = len(recs)
    denominator = n_total if n_total > 1 else 1
    return n_unique / denominator

def explanation_length(exp):
    """Number of whitespace-separated tokens in ``exp``; 0 for anything that is not a string."""
    if not isinstance(exp, str):
        return 0
    tokens = exp.split()
    return len(tokens)

def keyword_overlap_score(seed_text, cand_text, explanation, top_n=20):
    """Heuristic faithfulness proxy based on keyword overlap.

    Extracts up to ``top_n`` of the most frequent alphabetic words (three or
    more letters, lower-cased) from each text and returns the share of the
    explanation's keywords that also occur among the seed or candidate keywords.

    Parameters
    ----------
    seed_text, cand_text, explanation : str or None
        Texts to compare. Non-string values (``None``, NaN, ...) contribute no
        keywords; previously they were stringified and produced bogus keywords
        such as "none" or "nan".
    top_n : int, default 20
        Maximum number of keywords kept per text.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when the explanation has no keywords.
    """
    def top_keywords(text):
        # Treat missing / non-text input as empty, consistent with explanation_length.
        if not isinstance(text, str):
            return set()
        words = re.findall(r"[a-zA-Z]{3,}", text.lower())
        if not words:
            return set()
        # value_counts() orders by descending frequency; head(top_n) keeps the
        # most frequent words. How ties at the cut-off are broken is left to pandas.
        freq = pd.Series(words).value_counts()
        return set(freq.head(top_n).index)

    seed_kw = top_keywords(seed_text)
    cand_kw = top_keywords(cand_text)
    exp_kw = top_keywords(explanation)

    # Explanation keywords that are grounded in either source text.
    shared = (seed_kw | cand_kw) & exp_kw
    return len(shared) / max(len(exp_kw), 1)


In [17]:
# Evaluate the (simulated) LLM-enhanced pipeline on a fixed random sample of 50 seed items.
# The legacy global seed is kept so the sampled seeds match earlier runs.
np.random.seed(42)
sample_indices = np.random.choice(len(item_df), size=50, replace=False)

# asin -> item_text lookup built once, instead of filtering item_df for every
# recommendation. item_df holds one row per asin (it comes from a groupby on asin).
item_text_by_asin = dict(zip(item_df["asin"], item_df["item_text"]))

rows = []

for idx in sample_indices:
    seed = item_df.iloc[idx]

    # The LLM-enhanced call already performs the TF-IDF retrieval internally,
    # so a single call per seed yields both the candidates and their explanations.
    out = llm_enhanced_recommendation(idx, top_k=3)
    llm_asins = out["asin"].tolist()

    # List-level metric: identical for every recommendation of this seed.
    diversity = diversity_at_k(llm_asins)

    # Evaluate explanations (simulated)
    for cand_asin, exp in zip(llm_asins, out["llm_explanation"]):
        cand_text = item_text_by_asin[cand_asin]

        rows.append({
            "seed_asin": seed["asin"],
            "rec_asin": cand_asin,
            "diversity@3": diversity,
            "explanation_len": explanation_length(exp),
            "faithfulness_proxy": keyword_overlap_score(seed["item_text"], cand_text, exp)
        })

eval_df = pd.DataFrame(rows)
eval_df.describe()


Unnamed: 0,diversity@3,explanation_len,faithfulness_proxy
count,150.0,150.0,150.0
mean,1.0,16.0,0.118667
std,0.0,0.0,0.083856
min,1.0,16.0,0.0
25%,1.0,16.0,0.066667
50%,1.0,16.0,0.133333
75%,1.0,16.0,0.2
max,1.0,16.0,0.333333


In [19]:
# Average each evaluation metric over all (seed, recommendation) pairs and save the result.
# One path variable is used for both writing and logging, so the message always
# reports the file that was actually written.
EVAL_SUMMARY_PATH = "/Users/animchnlap314/Documents/llm_recsys_dissertation/results/07_llm_eval_summary.csv"

eval_summary = (
    eval_df[["diversity@3", "explanation_len", "faithfulness_proxy"]]
    .mean()
    .to_frame("mean")
    .reset_index()
)
eval_summary.to_csv(EVAL_SUMMARY_PATH, index=False)
print("Saved:", EVAL_SUMMARY_PATH)
eval_summary


Saved: ../results/07_llm_eval_summary.csv


Unnamed: 0,index,mean
0,diversity@3,1.0
1,explanation_len,16.0
2,faithfulness_proxy,0.118667
