In [4]:
import pandas as pd
import numpy as np

## Clean and prep data for fine-tuning

In [5]:
def clean_imdb_series_data(csv_path="./data/IMDB_series_data.csv"):
    """
    Cleans the IMDB_series_data.csv based on known column headers:
    
    Poster_Link
    Series_Title
    Runtime_of_Series
    Certificate
    Runtime_of_Episodes
    Genre
    IMDB_Rating
    Overview
    Star1
    Star2
    Star3
    Star4
    No_of_Votes
    """

    df = pd.read_csv(csv_path)
    df.rename(columns={
        "Series_Title": "title",
        "Certificate": "contentRating",
        "Genre": "genres",
        "Poster_Link": "posterLink",
        "Runtime_of_Series": "seriesRuntime",
        "Runtime_of_Episodes": "episodeRuntime_minutes",
        "IMDB_Rating": "imdbRating",
        "Overview": "overview",
        "Star1": "star1",
        "Star2": "star2",
        "Star3": "star3",
        "Star4": "star4",
        "No_of_Votes": "numVotes"
    }, inplace=True)

    # Convert numeric columns where appropriate
    numeric_cols = ["imdbRating", "numVotes"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    
    # print (df["episodeRuntime_minutes"].unique())
    # df["episodeRuntime_minutes"] = df["episodeRuntime_minutes"].apply(lambda row: [int(x.split()[0]) for x in row])
    df["episodeRuntime_minutes"] = df["episodeRuntime_minutes"].astype(str)                   # ensure it's string
    df["episodeRuntime_minutes"] = df["episodeRuntime_minutes"].str.replace(" min", "")       # remove trailing " min"
    df["episodeRuntime_minutes"] = pd.to_numeric(df["episodeRuntime_minutes"], errors="coerce")


    # Create an "actors" column by combining star1, star2, star3, star4. Filter out any missing or NaN values
    df["actors"] = df[["star1", "star2", "star3", "star4"]].values.tolist()
    df["actors"] = df["actors"].apply(lambda row: [actor for actor in row if isinstance(actor, str) and actor.strip()])

    # Clean up genres
    if "genres" in df.columns:
        df["genres"] = df["genres"].fillna("").apply(
            lambda x: [g.strip() for g in x.split(",") if g.strip()]
        )

    # Clean and unify the contentRating
    if "contentRating" in df.columns:
        df["contentRating"] = df["contentRating"].fillna("Not Rated")
        df["contentRating"] = df["contentRating"].astype(str)
        rating_age_map = {
            "A": 18,
            "18": 18,
            "18+": 18,
            "13+": 13,
            "15": 15,
            "UA": 12,
            "U": 0,
            "15+": 15,
            "16+": 16,
            "12+": 12,
            "13": 13,
            "16": 16,
            "7": 7,
            "7+": 7,
            "PG": 10,
            "All": 0,
            "Not Rated": 18,
            "R": 17,
            "NaN": 18
        }
        df["contentRating"] = df["contentRating"].replace(rating_age_map)
        df["contentRating"] = df["contentRating"].astype(int)

    df.drop(["seriesRuntime", "posterLink"], axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

cleaned_df = clean_imdb_series_data("./data/IMDB_series_data.csv")
print(cleaned_df[0:2])
print(f"Number of cleaned records: {len(cleaned_df)}")


             title  contentRating  episodeRuntime_minutes  \
0  Game of Thrones             18                    57.0   
1     Breaking Bad             18                    49.0   

                       genres  imdbRating  \
0  [Action, Adventure, Drama]         9.3   
1    [Crime, Drama, Thriller]         9.5   

                                            overview           star1  \
0  Nine noble families fight for control over the...   Emilia Clarke   
1  A high school chemistry teacher diagnosed with...  Bryan Cranston   

            star2          star3         star4  numVotes  \
0  Peter Dinklage  Kit Harington   Lena Headey   1773458   
1      Aaron Paul      Anna Gunn  Betsy Brandt   1468887   

                                              actors  
0  [Emilia Clarke, Peter Dinklage, Kit Harington,...  
1  [Bryan Cranston, Aaron Paul, Anna Gunn, Betsy ...  
Number of cleaned records: 2000


  df["contentRating"] = df["contentRating"].replace(rating_age_map)


In [None]:
# from transformers import pipeline, set_seed

# set_seed(42)  # for reproducibility
# gpt2_pipeline = pipeline(
#     "text-generation",
#     model="gpt2",
#     pad_token_id=50256,
#     max_new_tokens=128,
#     temperature=0.7,
#     top_k=50,
#     top_p=0.95,
#     repetition_penalty=1.2
# )

Device set to use mps:0


In [28]:
import openai
import os

# Ensure API key is set
openai.api_key = os.getenv("OPENAI_API_KEY")

def openai_generate(prompt: str, model: str = "gpt-4o-mini", temperature=0.7, max_tokens=200):
    """
    Calls the OpenAI API with the given model (GPT-4o Mini or GPT-4).
    """
    response = openai.ChatCompletion.create(
        model=model,  # Either "gpt-4o-mini" or "gpt-4"
        messages=[
            {"role": "system", "content": "You are a helpful TV show recommender."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    return response["choices"][0]["message"]["content"]


In [29]:
def generate_no_few_shot(model: str, user_query: str) -> str:
    """
    Basic recommendation request with no few-shot examples.
    """
    prompt = f"Recommend a TV show based on these preferences:\n{user_query}\nRecommendation:"
    return openai_generate(prompt, model=model)

In [30]:
def generate_few_shot(model: str, user_query: str) -> str:
    """
    Uses few-shot learning with examples before processing the user request.
    """
    example_1_in = "I want a crime drama with a tense storyline under 60 minutes."
    example_1_out = "I recommend 'Breaking Bad', which delivers suspense in ~47-min episodes."

    example_2_in = "Looking for a kids-friendly fantasy series around 30 minutes."
    example_2_out = "I recommend 'The Owl House', a fun fantasy cartoon with 22-min episodes."

    prompt = f"""
        Here are some examples of recommendations:

        Example 1:
        User: "{example_1_in}"
        Assistant: "{example_1_out}"

        Example 2:
        User: "{example_2_in}"
        Assistant: "{example_2_out}"

        Now the user wants an answer to this request:
        "{user_query}"
        Assistant:
        """
    return openai_generate(prompt, model=model)

In [32]:
def get_recommendations(user_query: str):
    """
    Returns recommendations for GPT-4o Mini and GPT-4, both pre and post few-shot prompting.
    """
    results = {}

    # GPT-4o Mini (No Few-Shot)
    results["gpt-4o-mini_no_fs"] = generate_no_few_shot("gpt-4o-mini", user_query)

    # GPT-4o Mini (Few-Shot)
    results["gpt-4o-mini_fs"] = generate_few_shot("gpt-4o-mini", user_query)

    # GPT-4 (No Few-Shot)
    results["gpt-4_no_fs"] = generate_no_few_shot("gpt-4", user_query)

    # GPT-4 (Few-Shot)
    results["gpt-4_fs"] = generate_few_shot("gpt-4", user_query)

    return results

# Example Usage
query_example = "I want a con-man show under 60 minutes for a 16-year-old."
outputs = get_recommendations(query_example)

# Print Results
for key, val in outputs.items():
    print(f"\n--- {key.upper()} ---\n{val}")



--- GPT-4O-MINI_NO_FS ---
I recommend **"Leverage."** This show features a team of con artists and thieves who use their skills to help those in need and take down corrupt individuals and organizations. Each episode is packed with clever schemes, twists, and a good dose of humor. The episodes are typically around 45 minutes long, making it perfect for your preference. It’s engaging, entertaining, and has a great mix of action and wit for a 16-year-old audience. Enjoy!

--- GPT-4O-MINI_FS ---
"I recommend 'Sneaky Pete', which follows a con artist who assumes the identity of his former cellmate. The episodes are around 50 minutes long and provide a mix of drama and clever schemes that are suitable for a 16-year-old."

--- GPT-4_NO_FS ---
"White Collar"

--- GPT-4_FS ---
"I recommend 'White Collar'. This show is about a con artist who works with an FBI agent to solve white-collar crimes. Each episode is approximately 40-50 minutes long and suitable for a 16-year-old."
