DATA LOADING AND CLEANING

In [1]:
import numpy as np
import pandas as pd
import zipfile

In [2]:
zip_path = "D:/LLM_Based_Recommendation_System/archive.zip"

# Load the first member of the archive into a DataFrame without extracting it to disk.
with zipfile.ZipFile(zip_path, 'r') as z:
    print(z.namelist())

    csv_filename = z.namelist()[0]
    # Open the member in its own context manager so its file handle is closed
    # as soon as pandas has finished reading, instead of lingering until GC.
    with z.open(csv_filename) as csv_file:
        df = pd.read_csv(csv_file)

['TMDB_movie_dataset_v11.csv']


In [3]:
# Peek at the first three rows of the loaded frame
df.head(n=3)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."


In [4]:
# Discard every row whose 'id' appears more than once
# (all copies of a repeated id are removed; none of them is kept).
id_counts = df['id'].value_counts()
repeated_ids = id_counts.loc[id_counts.gt(1)].index
is_repeated = df['id'].isin(repeated_ids)
df = df.loc[~is_repeated]

# Display the per-id counts of the filtered frame
df['id'].value_counts()

id
1434514    1
27205      1
157336     1
155        1
19995      1
          ..
11324      1
106646     1
99861      1
271110     1
49026      1
Name: count, Length: 1177042, dtype: int64

In [5]:
# Columns to discard from the frame
unwanted_columns = ['backdrop_path', 'homepage', 'imdb_id', 'poster_path', 'spoken_languages']
df = df.drop(labels=unwanted_columns, axis=1)

# Show the column names that remain
print(df.columns)

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'budget', 'original_language',
       'original_title', 'overview', 'popularity', 'tagline', 'genres',
       'production_companies', 'production_countries', 'keywords'],
      dtype='object')


In [6]:
import pandas as pd
import numpy as np

# Columns whose missing entries become the empty string
string_cols = [
    'title', 'original_title', 'overview', 'tagline', 'genres', 'keywords',
    'status', 'original_language', 'production_companies', 'production_countries',
    'release_date',
]

# Columns whose missing entries become 0
num_cols = ['vote_average', 'vote_count', 'runtime', 'budget', 'revenue', 'popularity']

# Apply each filler to its group of columns, string columns first
for column_group, filler in ((string_cols, ""), (num_cols, 0)):
    df[column_group] = df[column_group].fillna(filler)

In [7]:
# For each (title, release_date) pair keep the first row and drop the rest
dedup_keys = ['title', 'release_date']
df = df.drop_duplicates(subset=dedup_keys, keep='first')

In [8]:
# Number of missing values remaining in each column
df.isnull().sum()

id                      0
title                   0
vote_average            0
vote_count              0
status                  0
release_date            0
revenue                 0
runtime                 0
adult                   0
budget                  0
original_language       0
original_title          0
overview                0
popularity              0
tagline                 0
genres                  0
production_companies    0
production_countries    0
keywords                0
dtype: int64

In [9]:
# Restrict the frame to its first 10,000 rows
df = df.iloc[:10000]

In [10]:
import os

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Get API key
load_dotenv()  # Loads variables from .env into environment
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "movies-list"

# Create an index if it doesn't exist.
# list_indexes() returns index descriptions rather than plain name strings, so
# `index_name not in pc.list_indexes()` never matches an existing index and
# create_index would be attempted on every run. Compare against .names() instead.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=384,  # Must match the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Connect to the index
index = pc.Index(index_name)

# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Small & fast model


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def format_movie_data(row):
    """Build the text representation of one movie that is fed to the embedder.

    Args:
        row: Any object supporting ``row[key]`` lookups (e.g. a pandas Series
            or a dict) for the movie fields referenced below.

    Returns:
        A single string with one "Label: value" line per field, joined by
        newlines and without a trailing newline. The ``adult`` field is
        rendered as "Yes" when truthy and "No" otherwise.
    """
    lines = [
        f"Title: {row['title']}",
        f"Original Title: {row['original_title']}",
        f"Overview: {row['overview']}",
        f"Tagline: {row['tagline']}",
        f"Genres: {row['genres']}",
        f"Keywords: {row['keywords']}",
        f"Status: {row['status']}",
        f"Release Date: {row['release_date']}",
        f"Runtime: {row['runtime']} minutes",
        f"Budget: ${row['budget']}",
        f"Revenue: ${row['revenue']}",
        f"Vote Average: {row['vote_average']}",
        f"Vote Count: {row['vote_count']}",
        f"Popularity: {row['popularity']}",
        f"Original Language: {row['original_language']}",
        "Adult: " + ('Yes' if row['adult'] else 'No'),
        f"Production Companies: {row['production_companies']}",
        f"Production Countries: {row['production_countries']}",
    ]
    return "\n".join(lines)

# Convert each movie into an embedding and store in Pinecone.
# Work is done in batches: encoding and upserting one row at a time costs one
# model call and one network request per movie, which is needlessly slow for
# thousands of rows.
UPSERT_BATCH_SIZE = 100  # Number of movies encoded and sent per request

for batch_start in range(0, len(df), UPSERT_BATCH_SIZE):
    batch_rows = [
        row for _, row in df.iloc[batch_start:batch_start + UPSERT_BATCH_SIZE].iterrows()
    ]

    # Encode the whole batch in one call; the result has one vector per text
    batch_texts = [format_movie_data(row) for row in batch_rows]
    batch_embeddings = embedding_model.encode(batch_texts).tolist()  # Convert to lists

    vectors = [
        (
            str(row['id']),  # Unique ID for each movie
            embedding,
            {
                "title": row["title"],
                "original_title": row["original_title"],
                "overview": row["overview"],
                "tagline": row["tagline"],
                "genres": row["genres"],
                "keywords": row["keywords"],
                "release_date": row["release_date"],
                "vote_average": row["vote_average"],
                "popularity": row["popularity"],
            },
        )
        for row, embedding in zip(batch_rows, batch_embeddings)
    ]

    # Store the batch in Pinecone with a single request
    index.upsert(vectors=vectors)

KeyboardInterrupt: 