This program was used for data cleaning and preprocessing and experimental purposes.

In [2]:
# All project imports

import pandas as pd
from langchain_groq import ChatGroq
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

In [3]:
# Importing the database
movies = pd.read_csv('TMDB_movie_dataset_v11.csv')

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087924 entries, 0 to 1087923
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1087924 non-null  int64  
 1   title                 1087911 non-null  object 
 2   vote_average          1087924 non-null  float64
 3   vote_count            1087924 non-null  int64  
 4   status                1087924 non-null  object 
 5   release_date          927082 non-null   object 
 6   revenue               1087924 non-null  int64  
 7   runtime               1087924 non-null  int64  
 8   adult                 1087924 non-null  bool   
 9   backdrop_path         294442 non-null   object 
 10  budget                1087924 non-null  int64  
 11  homepage              116758 non-null   object 
 12  imdb_id               592323 non-null   object 
 13  original_language     1087924 non-null  object 
 14  original_title        1087911 non-

I will be using the following columns for this project:

1) title
2) status (only to remove the movies that have not been released, after that it will be dropped)
3) release date
4) adult
5) original_language
6) poster_path
7) genres

In [4]:
movies = movies[['title', 'status', 'release_date', 'adult', 'original_language', 'poster_path', 'genres']] # Extracting only the necessary columns
movies = movies[~movies['status'].isin(['In Production', 'Post Production', 'Planned', 'Rumored'])] # Remvoving all movies that are not released
movies = movies[['title', 'release_date', 'adult', 'original_language', 'poster_path', 'genres']] # Selecting the final columns that will be used

In [5]:
# Printing the number of null values
movies.isna().sum().sort_values()

adult                     0
original_language         0
title                    12
release_date         145477
poster_path          316231
genres               418120
dtype: int64

In [6]:
"""Removing all data entries that don't have a title, release data, 
poster path and genre as that can cause issues when generating recommendations
"""
movies = movies.dropna(subset=['title', 'release_date', 'poster_path', 'genres'])

In [7]:
movies.isna().sum().sort_values()

title                0
release_date         0
adult                0
original_language    0
poster_path          0
genres               0
dtype: int64

In [9]:
# Convert the 'release_date' column from string to datetime
movies['release_date'] = pd.to_datetime(movies['release_date'], format='%Y-%m-%d', errors='coerce')

# Extract the year and create a new column 'release_year'
movies['release_year'] = movies['release_date'].dt.year

# Drop the original 'release_date' column as it is not needed
movies = movies.drop(columns=['release_date'])

In [19]:
# Filtering movies from 1960 to 2024
movies = movies[(movies['release_year'] >= 1960) & (movies['release_year'] <= 2024)]

In [21]:
movies.shape # Total number of data entries we'll be working with

(448784, 6)

In [20]:
# The final dataset that we will be using

movies

Unnamed: 0,title,adult,original_language,poster_path,genres,release_year
0,Inception,False,en,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,"Action, Science Fiction, Adventure",2010
1,Interstellar,False,en,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,"Adventure, Drama, Science Fiction",2014
2,The Dark Knight,False,en,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,"Drama, Action, Crime, Thriller",2008
3,Avatar,False,en,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,"Action, Adventure, Fantasy, Science Fiction",2009
4,The Avengers,False,en,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,"Science Fiction, Action, Adventure",2012
...,...,...,...,...,...,...
1087904,No Name and Dynamite,False,en,/7MVENxKbO925bTm6IUb4xtZ65NZ.jpg,Western,2022
1087906,Flower Goddess,False,ja,/ma325mGyL2rH20ebhGDlJpQByEg.jpg,"Animation, Fantasy",2020
1087908,Happy Zoo,False,ja,/eUIAPOSjGqfqzbvhDrubykYKOgd.jpg,Animation,2019
1087916,Friend,False,ja,/vOpxXL1hgyCQ5luxrX5UtVMQSQD.jpg,Animation,2019


In [23]:
# Exporting the final dataset

movies.to_csv('cleaned_movies.csv', index=False)

In [63]:
# Initializing the LLM

llm = ChatGroq(
    temperature=0.5,
    api_key='YOUR_API_KEY', # type: ignore
    model='llama3-70b-8192',
    stop_sequences=None,
    max_tokens=8000
)

In [64]:
# Creating an agent to interact with the pandas DataFrame
def filter_movies(genres="any", year="any year", original_language="any", adult="False"):
    # Instantiate the agent
    agent = create_pandas_dataframe_agent(llm, 
                                        movies,
                                        verbose=False, 
                                        allow_dangerous_code=True)

    # Set the query based on user filters
    if genres.lower() == "any" and year == "any year" and original_language == "any" and adult == "False":   
        query = "Select up to 5 random movies with 'adult' set to 'False' and return their 'title' and 'poster_path' in a JSON format with no explaination. NO PREAMBLE."
    elif genres.lower() == "any" and year == "any year" and original_language == "any" and adult == "True":
        query = "Select up to 5 random movies with 'adult' set to 'True' and return their 'title' and 'poster_path' in a JSON format with no explaination. NO PREAMBLE."
    else:
        query = f"Select up to 5 movies and return their 'title' and 'poster_path' in a JSON format with no preamble where {genres} are in the 'genres', 'release_year' is between {year}, 'original_language' is {original_language} and 'adult' is set to {adult}."

    result = agent.run(query)
    return result

In [35]:
output1 = filter_movies()
print(output1)

[{"title":"Homecoming","poster_path":"\/lDKLiRiE0ucuZs5SZJKZIt14A7L.jpg"},{"title":"D is for Division","poster_path":"\/cEbBDYXOtHuUOce8mOAFNF2kFKr.jpg"},{"title":"Explosive Mission","poster_path":"\/zMhNN4DQjV7AZgNUf6xa1DVj66d.jpg"},{"title":"Sharks vs. the World","poster_path":"\/eHRv27VB1XPqQe311MEP0S13EXu.jpg"},{"title":"D' Muetter wott nur s' Bescht","poster_path":"\/l2impqw9iN435DaaFONOICBCnkk.jpg"}]


In [36]:
output2 = filter_movies(adult="True")
print(output2)

[{"title":"The Velvet Edge","poster_path":"\/4ofS2l3CZPTNPt3VEnzymC5bZB1.jpg"},{"title":"I Could See Her Nipples!! A Married Woman From the Neighborhood","poster_path":"\/olhPdh1IoyiBdBwyjw8OWDQijCE.jpg"},{"title":"Not Airplane XXX: Cockpit Cuties","poster_path":"\/1NTDEHWaTaLdD1pAsoiu5CGPvGz.jpg"},{"title":"Rekindling the Flame","poster_path":"\/2319jELyXtqHDPEZig4PibntuI8.jpg"},{"title":"She's On The Cheerleading Squad At A Prestigious University! Four Years Of Competition, Ranked 8th In The Country! This College Girl's So Beautiful It's Painful - A Real Life Athlete Makes Her Porn Debut With Her Legs Spread Impossibly Wide!","poster_path":"\/kviYOuQsfPZgljvArrZdzuMulaj.jpg"}]


In [65]:
output3 = filter_movies(genres="Action, Drama, Thriller", year="2010-2020", original_language="en", adult="True")
print(output3)

[{"title":"The Dark Knight Rises","poster_path":"\/hr0L2aueqlP2BYUblTTjmtn0hw4.jpg"},{"title":"World War Z","poster_path":"\/1SWBSYJsnyhdNRfLI1T6RsCxAQ4.jpg"},{"title":"Rise of the Planet of the Apes","poster_path":"\/cjLsuP75UDlRdJVMXzXg3TJ4umX.jpg"},{"title":"Dawn of the Planet of the Apes","poster_path":"\/kScdQEwS9jPEdnO23XjGAtaoRcT.jpg"},{"title":"Elysium","poster_path":"\/aRjuJuPXHtVs6YegfeeQWXGRs1E.jpg"}]
