Pip Installs

In [None]:
# Install the third-party packages used by this notebook (quiet mode).
# NOTE(review): only lower bounds are given for two packages and the rest are unpinned,
# so a fresh run may resolve different versions than the ones that produced the saved outputs.
# scipy is imported further down but not listed here — presumably it arrives as a
# dependency of scikit-learn; confirm.
!pip install -q torch "sentence-transformers>=2.4" "transformers>=4.40" \
                 scikit-learn spacy vadersentiment textblob
# Download the NLTK corpora that TextBlob ships a downloader for.
!python -m textblob.download_corpora -q
# Download spaCy's small English model (en_core_web_sm).
# NOTE(review): spaCy and TextBlob are not referenced in the cells visible here.
!python -m spacy download en_core_web_sm -q


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the pac

Imports + Data Loading

In [None]:
import os, re, textwrap, random, warnings, numpy as np, pandas as pd, torch
from collections import Counter
warnings.filterwarnings("ignore")

# Input CSV of movie reviews, resolved relative to the notebook's working directory.
CSV_PATH = "Movies_27K_Reviews.csv"

# Fail fast with a real exception instead of `assert`: assertions are removed when
# Python runs with -O, which would let the notebook carry on without its data file.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at {CSV_PATH} – upload or fix the path!")
print("✓ Found file:", CSV_PATH)


✓ Found file: Movies_27K_Reviews.csv


In [None]:
# Load the raw review table; low_memory=False makes pandas read the file in one pass.
df_raw = pd.read_csv(CSV_PATH, low_memory=False)

# Normalise the headers one step at a time: trim, lower-case, drop colons,
# then collapse each run of whitespace into a single underscore.
tidy_cols = df_raw.columns.str.strip().str.lower()
tidy_cols = tidy_cols.str.replace(":", "", regex=False)
tidy_cols = tidy_cols.str.replace(r"\s+", "_", regex=True)
df_raw.columns = tidy_cols

print("✓ Loaded →", df_raw.shape)
first_dozen = sorted(df_raw.columns.tolist())[:12]
print("Columns:", first_dozen)


✓ Loaded → (26811, 18)
Columns: ['audience_review', 'audience_score', 'box_office', 'critic_consensus', 'critic_line', 'critic_score', 'critic_url', 'directed_by', 'genre', 'in_theaters', 'movie_name', 'on_disc/streaming']


Data Cleaning, Studio Filtering

In [None]:
# Case-insensitive, whole-string patterns mapped to the canonical studio label.
# Insertion order matters: the first matching pattern wins.
alias = {
    r"(?i)^(20th.*|fox.*)$": "20th Century",
    r"(?i)^(columbia.*|sony.*)$": "Columbia/Sony",
    r"(?i)^(walt disney.*|disney\+.*|pixar.*|marvel.*)$": "Disney",
    r"(?i)^paramount.*$": "Paramount",
    r"(?i)^universal.*$": "Universal",
    r"(?i)^(warner.*|new line.*)$": "Warner Brothers",
}


def map_studio(s):
    """Return the canonical studio label for ``s``.

    ``s`` is stringified before matching, so missing values simply fail to
    match. Returns ``np.nan`` when no pattern in ``alias`` matches.
    """
    label = str(s)
    matches = (canon for pat, canon in alias.items() if re.match(pat, label))
    return next(matches, np.nan)

# Columns carried forward into the analysis.
keep_cols = ["studio", "rating", "genre", "movie_name",
             "critic_score", "audience_score",
             "critic_line", "audience_review"]

# Replace the raw studio string with its canonical label; rows whose studio
# matches no alias become NaN and are dropped.
canonical_studio = df_raw["studio"].map(map_studio)
matched_rows = df_raw.assign(studio=canonical_studio).dropna(subset=["studio"])
df = matched_rows[keep_cols].reset_index(drop=True)

print("✓ After studio filter →", df.shape)


✓ After studio filter → (4871, 8)


In [None]:
def _percent_to_float(values):
    """Strip '%' and surrounding whitespace from a column, then cast it to float."""
    text = values.astype(str).str.replace("%", "", regex=False).str.strip()
    # Empty strings cannot be cast to float, so turn them into NaN first.
    return text.replace("", np.nan).astype(float)


score_cols = ["critic_score", "audience_score"]
for score_col in score_cols:
    df[score_col] = _percent_to_float(df[score_col])

print("✓ Scores cleaned – sample:", df[score_cols].head(3))


✓ Scores cleaned – sample:    critic_score  audience_score
0          75.0            51.0
1          98.0             NaN
2          78.0            50.0


In [None]:
# A row is a duplicate when studio, title and both review texts all repeat.
dedup_keys = ["studio", "movie_name", "critic_line", "audience_review"]
df_dedup = df.drop_duplicates(subset=dedup_keys)
print("✓ After de-dup →", df_dedup.shape)


✓ After de-dup → (1882, 8)


In [None]:
# One group per (studio, title) pair; sort=False keeps first-appearance order.
group_keys = ["studio", "movie_name"]
movie_groups = df_dedup.groupby(group_keys, sort=False)
n_movie_groups = len(movie_groups)
print("✓ Movie groups:", n_movie_groups)


✓ Movie groups: 1847


Summarizer Initialization

In [None]:
from transformers import pipeline

# transformers pipelines take an integer device: 0 = first GPU, -1 = CPU.
device_id = -1
if torch.cuda.is_available():
    device_id = 0

# t5-small is used for both the model weights and the tokenizer, on the PyTorch backend.
t5_options = dict(
    model="t5-small",
    tokenizer="t5-small",
    framework="pt",
    device=device_id,
)
summarizer = pipeline("summarization", **t5_options)

def summarise(text:str, max_words:int=150) -> str:
    """Summarise ``text`` with the module-level ``summarizer`` pipeline.

    Parameters
    ----------
    text : str
        Source text. Anything that is not a non-blank string yields ``""``.
        Only the first 4000 characters are used.
    max_words : int
        Rough target length. The generation budget is ``1.6 * max_words``
        tokens at most and ``0.4 * max_words`` tokens at least, and the final
        string is shortened to ``7 * max_words`` characters.

    Returns
    -------
    str
        Single-line summary, or ``""`` for empty or non-string input.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    clipped = text[:4000].strip()

    # Cap the generation budget by the size of the (truncated) input. A fixed
    # 240/60-token window forces the model to pad summaries of short inputs and
    # triggers a "max_length ... input_length" warning on nearly every call.
    # Inputs at least as long as the full budget behave exactly as before.
    n_tokens = len(summarizer.tokenizer(clipped, truncation=True)["input_ids"])
    max_len = max(2, min(int(max_words*1.6), n_tokens))
    min_len = min(int(max_words*0.4), max_len // 2)

    summary = summarizer(
        clipped,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True
    )[0]["summary_text"]
    # textwrap.shorten also collapses whitespace; "…" marks a cut-off summary.
    return textwrap.shorten(summary.replace("\n"," "),
                            width=max_words*7, placeholder="…")

# Report which backend the summarisation pipeline was placed on.
backend = "GPU" if device_id == 0 else "CPU"
print("✓ Summariser ready on", backend)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


✓ Summariser ready on GPU


One-Row-per-Movie DataFrame Builder

In [None]:
# Collapse every (studio, movie) group into one record: summarised critic and
# audience text, mean scores, and the most frequent genre label.
records = []
for (studio, movie), g in movie_groups:
    # Join the distinct, non-missing review texts and cap them at 4000 characters.
    critic_text   = " ".join(g["critic_line"].dropna().unique())[:4000]
    audience_text = " ".join(g["audience_review"].dropna().unique())[:4000]
    # Count only known genres: counting the raw column lets NaN win the vote
    # whenever missing values outnumber any single genre label.
    known_genres = g["genre"].dropna()
    records.append({
        "studio"          : studio,
        "movie"           : movie,
        "critic_summary"  : summarise(critic_text),
        "audience_summary": summarise(audience_text),
        "critic_score"    : g["critic_score"].mean(),
        "audience_score"  : g["audience_score"].mean(),
        "genre"           : Counter(known_genres).most_common(1)[0][0]
                           if not known_genres.empty else ""
    })

df_sum = pd.DataFrame(records)
print("✓ Summaries built →", df_sum.shape)
df_sum.head(2)


Your max_length is set to 240, but your input_length is only 141. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=70)
Your max_length is set to 240, but your input_length is only 202. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=101)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Your max_length is set to 240, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Your max_length is set to 240, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input a

✓ Summaries built → (1847, 7)


Unnamed: 0,studio,movie,critic_summary,audience_summary,critic_score,audience_score,genre
0,Disney,Mulan,"this movie is just like the original, while re...",the new Mulan 2020 looks like something I've b...,75.0,51.0,"Action & Adventure, Drama, Kids & Family"
1,Universal,Red Penguins,,,98.0,,Documentary


In [None]:
# Preview the first five rows of the per-movie summary table.
# NOTE(review): the saved output of this cell lists columns (similarity, sentiment and
# embedding columns) that are only created by cells further down, so it was last
# executed out of order — restart the kernel and run all cells to refresh it.
df_sum.head()

Unnamed: 0,studio,movie,critic_summary,audience_summary,critic_score,audience_score,genre,critic_summary_sim,aud_summary_sim,critic_sent_llm,critic_sent_vader,aud_sent_llm,aud_sent_vader,critic_emb,aud_emb
0,Disney,Mulan,"this movie is just like the original, while re...",the new Mulan 2020 looks like something I've b...,75.0,51.0,"Action & Adventure, Drama, Kids & Family",0.1851,0.281544,0.939161,0.7814,-0.998788,0.6808,"[tensor(-0.0716), tensor(0.0228), tensor(-0.00...","[tensor(-0.0946), tensor(-0.0043), tensor(0.08..."
1,Universal,Red Penguins,,,98.0,,Documentary,,,,,,,"[tensor(-0.1188), tensor(0.0483), tensor(-0.00...","[tensor(-0.1188), tensor(0.0483), tensor(-0.00..."
2,Warner Brothers,Deathstroke: Knights & Dragons,,this one is given a spin in that this one has ...,78.0,50.0,"Action & Adventure, Animation",,0.552863,,,-0.989071,0.7268,"[tensor(-0.1188), tensor(0.0483), tensor(-0.00...","[tensor(-0.0966), tensor(-0.0049), tensor(-0.0..."
3,Warner Brothers,Deep Blue Sea 3,,"a cast of shark bait that, now get this, all g...",69.0,37.0,Horror,,0.550154,,,0.967815,0.0,"[tensor(-0.1188), tensor(0.0483), tensor(-0.00...","[tensor(-0.0870), tensor(-0.0178), tensor(0.00..."
4,Universal,Viena and the Fantomes,,,20.0,25.0,Drama,,,,,,,"[tensor(-0.1188), tensor(0.0483), tensor(-0.00...","[tensor(-0.1188), tensor(0.0483), tensor(-0.00..."


Summary-vs-Source Similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Shared TF-IDF vectoriser: vocabulary capped at 20,000 terms, English stop words removed.
vec = TfidfVectorizer(max_features=20_000, stop_words="english")

def sim_quality(orig, summ):
    """Cosine similarity between the TF-IDF vectors of a source text and its summary.

    The shared vectoriser ``vec`` is refitted on just these two documents.
    Returns ``np.nan`` when either text is empty/falsy or when a ``ValueError``
    is raised while vectorising or comparing.
    """
    if not (orig and summ):
        return np.nan
    try:
        pair = vec.fit_transform([orig, summ])
        score = cosine_similarity(pair[0], pair[1])[0, 0]
    except ValueError:
        score = np.nan
    return score

def _summary_similarities(source_col, summary_col):
    """Score each movie's summary against the joined source reviews it came from.

    ``source_col`` names the review column in ``movie_groups``; ``summary_col``
    names the matching summary column in ``df_sum``.
    """
    scores = []
    for row in df_sum.itertuples():
        reviews = movie_groups.get_group((row.studio, row.movie))[source_col]
        source_text = " ".join(reviews.dropna().unique())
        scores.append(sim_quality(source_text, getattr(row, summary_col)))
    return scores


df_sum["critic_summary_sim"] = _summary_similarities("critic_line", "critic_summary")
df_sum["aud_summary_sim"] = _summary_similarities("audience_review", "audience_summary")

df_sum[["critic_summary_sim","aud_summary_sim"]].describe()


Unnamed: 0,critic_summary_sim,aud_summary_sim
count,1546.0,1744.0
mean,0.260045,0.38433
std,0.089453,0.13591
min,0.074324,0.079533
25%,0.204988,0.293979
50%,0.234342,0.366689
75%,0.280775,0.447327
max,0.856581,1.0


Dual Sentiment Extraction + Correlations

In [None]:
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.stats import pearsonr

# Two independent sentiment scorers: a DistilBERT SST-2 classifier and the
# lexicon-based VADER analyser.
sst2_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
sent_llm = pipeline("sentiment-analysis", model=sst2_checkpoint, device=device_id)
vader = SentimentIntensityAnalyzer()

def score_sent(text, kind="llm"):
    """Return a signed sentiment score for ``text``.

    ``kind="llm"`` scores the first 512 characters with ``sent_llm`` and signs
    the classifier confidence (positive for a POSITIVE label, negative
    otherwise). Any other ``kind`` returns VADER's compound score.
    Empty or falsy text yields ``np.nan``.
    """
    if not text:
        return np.nan
    if kind != "llm":
        return vader.polarity_scores(text)["compound"]
    top = sent_llm(text[:512])[0]
    sign = 1 if top["label"] == "POSITIVE" else -1
    return top["score"] * sign

# Score both summary columns with both scorers; the results land in
# <prefix>_sent_llm and <prefix>_sent_vader.
summary_sources = {"critic": "critic_summary", "aud": "audience_summary"}
for prefix, summary_col in summary_sources.items():
    summaries = df_sum[summary_col]
    df_sum[f"{prefix}_sent_llm"]   = summaries.map(lambda t: score_sent(t, "llm"))
    df_sum[f"{prefix}_sent_vader"] = summaries.map(lambda t: score_sent(t, "vader"))

# Pearson correlation between each sentiment score and the matching review score.
# Which review-score column belongs to which sentiment prefix:
score_column = {"critic": "critic_score", "aud": "audience_score"}

for c in ["critic", "aud"]:
    for m in ["llm", "vader"]:
        sent_col = f"{c}_sent_{m}"
        # Drop rows where EITHER value is missing. Filtering on the sentiment
        # column alone leaves NaN review scores in place, and one NaN is enough
        # to turn the whole correlation into NaN.
        paired = df_sum[[sent_col, score_column[c]]].dropna()
        if len(paired) < 2:
            # pearsonr needs at least two observations.
            print(f"{c.title()} ({m}) ↔ score  not enough paired rows (n={len(paired)})")
            continue
        r, p = pearsonr(paired[sent_col], paired[score_column[c]])
        print(f"{c.title()} ({m}) ↔ score  r={r:.3f}  p={p:.3g}")


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


Critic (llm) ↔ score  r=0.436  p=1.18e-72
Critic (vader) ↔ score  r=0.229  p=7.79e-20
Aud (llm) ↔ score  r=0.312  p=1.1e-40
Aud (vader) ↔ score  r=0.191  p=9.12e-16


Recommender System

In [None]:
# --------------Embeddings + recommender ---------------------------------------
from sentence_transformers import SentenceTransformer, util
import numpy as np, torch

# SentenceTransformer documents `device` as a torch device name ("cuda", "cpu", ...).
# `device_id` follows the transformers-pipeline convention instead (0 = GPU, -1 = CPU),
# so build the device string here rather than passing that integer through.
embed_device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer("all-MiniLM-L6-v2", device=embed_device)

# Encode every summary once and keep the result on the CPU so that later
# similarity look-ups can convert it to numpy directly.
critic_embs = embedder.encode(
    df_sum["critic_summary"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
).cpu()
aud_embs = embedder.encode(
    df_sum["audience_summary"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
).cpu()

# One tensor per row, so each movie carries its own embedding in the frame.
df_sum["critic_emb"] = list(critic_embs)
df_sum["aud_emb"]    = list(aud_embs)

# Matrix aliases used by the similarity search.
critic_mat = critic_embs
aud_mat    = aud_embs

def recommend(query: str, k: int = 3):
    """Return the ``k`` movies whose summaries are most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text description of the film being looked for.
    k : int
        Number of movies to return per channel. ``k <= 0`` yields empty frames.

    Returns
    -------
    (top_crit, top_aud) : tuple of pandas.DataFrame
        ``movie``/``studio``/``genre`` rows ranked by cosine similarity against
        the critic-summary and audience-summary embeddings respectively, best
        match first. Movies with an empty summary in a channel are never
        returned for that channel, so a frame may hold fewer than ``k`` rows.
    """
    shown_cols = ["movie", "studio", "genre"]
    if k <= 0:
        # `scores.argsort()[-0:]` is the WHOLE array, so k=0 used to return every row.
        nothing = df_sum.iloc[0:0][shown_cols]
        return nothing, nothing.copy()

    q = embedder.encode([query], convert_to_tensor=True).cpu()

    def _top_k(matrix, summary_col):
        sims = util.cos_sim(q, matrix)[0].numpy()
        # A movie without a summary was embedded from "" — that vector carries no
        # information about the film, so it must not compete for a slot.
        has_text = df_sum[summary_col].fillna("").str.strip().ne("").to_numpy()
        candidates = np.flatnonzero(has_text)
        best_first = candidates[np.argsort(sims[candidates])[::-1][:k]]
        return df_sum.iloc[best_first][shown_cols]

    top_crit = _top_k(critic_mat, "critic_summary")
    top_aud  = _top_k(aud_mat, "audience_summary")
    return top_crit, top_aud

# Smoke test: one query against both channels, using the default top-3.
demo_query = "heart-warming animated adventure"
crit, aud = recommend(demo_query)
print("Critic picks:\n", crit, "\n\nAudience picks:\n", aud)


Batches:   0%|          | 0/58 [00:00<?, ?it/s]

Batches:   0%|          | 0/58 [00:00<?, ?it/s]

Critic picks:
                       movie           studio  \
784   Rise of the Guardians        Paramount   
274             Arctic Tale        Paramount   
1517         Be Kind Rewind  Warner Brothers   

                                             genre  
784   Action & Adventure, Animation, Kids & Family  
274                  Documentary, Special Interest  
1517                                        Comedy   

Audience picks:
                             movie     studio  \
912      The Adventures of Tintin  Paramount   
1399  Madagascar: Escape 2 Africa  Paramount   
314                         Moana     Disney   

                                                  genre  
912        Action & Adventure, Animation, Kids & Family  
1399  Action & Adventure, Animation, Comedy, Kids & ...  
314        Action & Adventure, Animation, Kids & Family  


Evaluation

In [None]:

# Ten free-text queries spanning different genres and moods, used as the input to
# the evaluation below. They are plain strings only: no list of expected or
# relevant movies is attached to any of them.
test_queries = [
    "heart-warming animated family adventure",
    "dark psychological thriller with shocking twists",
    "fast-paced car-racing action movie",
    "romantic coming-of-age high-school drama",
    "epic sci-fi space opera with a hero’s journey",
    "quirky indie comedy about self-discovery",
    "gritty crime story set in the 1970s",
    "feel-good sports underdog victory",
    "slow-burn atmospheric horror",
    "historical biopic with political intrigue"
]

def evaluate(queries, k=3, relevant=None):
    """Run each query against both summary channels and tabulate the top-``k`` hits.

    Parameters
    ----------
    queries : iterable of str
        Free-text queries to embed and search with.
    k : int
        Number of recommendations kept per channel.
    relevant : mapping, optional
        ``query -> collection of movie titles`` judged relevant for that query.
        Reciprocal rank can only be computed against such labels.

    Returns
    -------
    pandas.DataFrame
        One row per query with the recommended titles, the best cosine
        similarity per channel, and the reciprocal rank per channel:
        ``1 / position`` of the first relevant title among the ``k`` hits,
        ``0.0`` when none of them is relevant, and ``NaN`` when no labels were
        supplied for the query.
    """
    columns = ["query", "critic_recs", "aud_recs",
               "best_critic_score", "best_audience_score",
               "recip_rank_critic", "recip_rank_audience"]

    def _reciprocal_rank(ranked_titles, gold_titles):
        # Looking up the top hit inside its own ranking always gives position 1
        # (reciprocal rank 1.0), so the rank has to be measured against labels.
        if gold_titles is None:
            return np.nan
        gold = set(gold_titles)
        for position, title in enumerate(ranked_titles, start=1):
            if title in gold:
                return 1.0 / position
        return 0.0

    rows = []
    for q in queries:
        q_emb = embedder.encode([q], convert_to_tensor=True).cpu()
        crit_sim = util.cos_sim(q_emb, critic_mat)[0].numpy()
        aud_sim  = util.cos_sim(q_emb,   aud_mat)[0].numpy()

        # Indices of the k highest similarities, best first.
        crit_top_idx = crit_sim.argsort()[-k:][::-1]
        aud_top_idx  = aud_sim.argsort()[-k:][::-1]

        crit_titles = df_sum.iloc[crit_top_idx]["movie"].tolist()
        aud_titles  = df_sum.iloc[aud_top_idx]["movie"].tolist()
        gold_titles = relevant.get(q) if relevant is not None else None

        rows.append({
            "query": q,
            "critic_recs": "; ".join(crit_titles),
            "aud_recs"   : "; ".join(aud_titles),
            "best_critic_score"  : crit_sim[crit_top_idx[0]],
            "best_audience_score": aud_sim [aud_top_idx [0]],
            "recip_rank_critic"  : _reciprocal_rank(crit_titles, gold_titles),
            "recip_rank_audience": _reciprocal_rank(aud_titles, gold_titles)
        })
    # Passing the column list keeps the schema stable even for an empty query list.
    return pd.DataFrame(rows, columns=columns)

eval_df = evaluate(test_queries)

# Mean reciprocal rank per channel, averaged over all test queries.
mrr_critic   = eval_df["recip_rank_critic"].mean()
mrr_audience = eval_df["recip_rank_audience"].mean()

# Only name a winner when there is one. A plain `>` comparison reports
# "audience" whenever the two scores are equal or not comparable (NaN).
if np.isnan(mrr_critic) or np.isnan(mrr_audience):
    better = "neither"
    verdict = "MRR is undefined for at least one channel, so the channels cannot be ranked."
elif np.isclose(mrr_critic, mrr_audience):
    better = "neither"
    verdict = "The critic and audience channels are tied on these test queries."
else:
    better = "critic" if mrr_critic > mrr_audience else "audience"
    verdict = f"Overall, the {better} channel produced the stronger matches on these test queries."

print(eval_df[["query","critic_recs","aud_recs",
               "best_critic_score","best_audience_score"]])

print(f"\nMean Reciprocal Rank – critic summaries:  {mrr_critic:.3f}")
print(f"Mean Reciprocal Rank – audience summaries: {mrr_audience:.3f}")
# The queries are an ad-hoc list, not a held-out split, so the message says so.
print(f"\n  {verdict}")


                                              query  \
0           heart-warming animated family adventure   
1  dark psychological thriller with shocking twists   
2                fast-paced car-racing action movie   
3          romantic coming-of-age high-school drama   
4     epic sci-fi space opera with a hero’s journey   
5          quirky indie comedy about self-discovery   
6               gritty crime story set in the 1970s   
7                 feel-good sports underdog victory   
8                      slow-burn atmospheric horror   
9         historical biopic with political intrigue   

                                         critic_recs  \
0                 Wreck-it Ralph; The Croods; Frozen   
1                      Chloe; Broken City; Deception   
2                 Need For Speed; Unknown; Fast Five   
3        Going the Distance; A Dangerous Method; Her   
4        The Revenant; The Comedian; Never Let Me Go   
5                Megamind; The LEGO Movie; The Watch   
6 

Exports

In [None]:
from pathlib import Path
import numpy as np

# Everything is written below ./processed, which is created on demand.
OUT_DIR = Path("processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# The tabular part goes to CSV; the embedding columns hold tensors and are
# saved separately as dense matrices.
tabular = df_sum.drop(columns=["critic_emb", "aud_emb"])
tabular.to_csv(OUT_DIR / "movies_cleaned_summaries.csv", index=False)

embedding_files = (
    ("critic_emb", "critic_embeddings.npy"),
    ("aud_emb", "aud_embeddings.npy"),
)
for emb_col, file_name in embedding_files:
    # Stack the per-row tensors into one (n_movies, dim) array.
    stacked = np.vstack([row_emb.numpy() for row_emb in df_sum[emb_col]])
    np.save(OUT_DIR / file_name, stacked)

print("Files written to", OUT_DIR.resolve())


✓ Files written to /content/processed
