In [1]:
import pandas as pd
from thefuzz import process

def merging_dataset():
    """Fuzzy-match crew results to IMDb rows by title and save the merged table.

    Reads ``results_with_crew.csv`` and ``imdb_movies.csv`` from the working
    directory, pairs each ``primaryTitle`` with its best-scoring ``names``
    entry (thefuzz ``extractOne`` with ``score_cutoff=80``), left-joins the
    IMDb columns onto the crew results, and writes the outcome to
    ``merged_results.csv``.

    Returns:
        None. The result is only written to disk.
    """
    results_with_crew = pd.read_csv("results_with_crew.csv")
    imdb_movies = pd.read_csv("imdb_movies.csv")

    # Nameless IMDb rows can never be a match target, and they must also stay
    # out of the join: pandas matches null keys against each other, so every
    # title without a fuzzy match (matched_name is null) would otherwise be
    # joined to each of those nameless rows.
    imdb_named = imdb_movies.dropna(subset=['names'])
    imdb_titles = imdb_named['names'].unique().tolist()

    def best_match(title):
        # A missing title has nothing to compare; treat it as "no match"
        # instead of handing a null value to the fuzzy matcher.
        if pd.isna(title):
            return None
        return process.extractOne(title, imdb_titles, score_cutoff=80)

    matches = results_with_crew['primaryTitle'].apply(best_match)

    # extractOne yields a (name, score) pair, or None when nothing clears the cutoff.
    results_with_crew['matched_name'] = matches.apply(lambda x: x[0] if x else None)
    results_with_crew['match_score'] = matches.apply(lambda x: x[1] if x else None)

    merged = results_with_crew.merge(imdb_named, left_on='matched_name', right_on='names', how='left')

    merged.to_csv("merged_results.csv", index=False)

In [2]:
# Load the merged table from disk; the cells below all operate on `df`.
df = pd.read_csv('merged_results.csv')

In [3]:
import http.client
import json
from urllib.parse import quote

def add_new_movie(movie_id, current_df, api_key=None):
    """Fetch one title from the imdb236 RapidAPI endpoint and append it to a DataFrame.

    Args:
        movie_id: Identifier placed in the request path after URL-quoting,
            e.g. ``tt0133093``.
        current_df: DataFrame to extend. It is not modified in place.
        api_key: RapidAPI key. When omitted, the ``RAPIDAPI_KEY`` environment
            variable is used so the secret stays out of the notebook.

    Returns:
        A new DataFrame with the fetched record appended (``id`` renamed to
        ``tconst`` and ``url`` to ``IMDbLink``), or ``current_df`` itself when
        the lookup fails.

    Raises:
        RuntimeError: If no key is passed and ``RAPIDAPI_KEY`` is not set.
    """
    import os  # local import: the notebook's import cells do not bring in os

    # The key used to be hardcoded here. Anything committed with the notebook
    # must be treated as leaked, so read it from the caller or the environment.
    api_key = api_key or os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError(
            "No RapidAPI key available: pass api_key=... or set the RAPIDAPI_KEY environment variable."
        )

    conn = http.client.HTTPSConnection("imdb236.p.rapidapi.com")
    headers = {
        'x-rapidapi-key': api_key,
        'x-rapidapi-host': "imdb236.p.rapidapi.com"
    }

    # Always release the socket, including when the request itself fails.
    try:
        conn.request("GET", f"/api/imdb/{quote(movie_id)}", headers=headers)
        res = conn.getresponse()
        status = res.status
        data = res.read()
    finally:
        conn.close()

    # An error page is not guaranteed to be JSON; JSONDecodeError and
    # UnicodeDecodeError are both ValueError subclasses.
    try:
        response_data = json.loads(data.decode("utf-8"))
    except ValueError:
        print(f"Error from API: HTTP {status} with a non-JSON body")
        print(f"Could not find movie with ID: {movie_id}. DataFrame not changed.")
        return current_df

    has_error_field = isinstance(response_data, dict) and 'error' in response_data
    if has_error_field or status != 200:
        # Without the status check, a rejected request whose JSON body lacks an
        # 'error' field would be appended to the DataFrame as if it were a movie.
        detail = response_data.get('error') if has_error_field else f"HTTP {status}"
        print(f"Error from API: {detail}")
        print(f"Could not find movie with ID: {movie_id}. DataFrame not changed.")
        return current_df

    new_df = pd.json_normalize(response_data)
    print("Movie found, adding to DataFrame...")

    # Align the API's column names with the ones the local table uses.
    updated_df = pd.concat([current_df, new_df.rename(columns={'id': 'tconst', 'url': 'IMDbLink'})], ignore_index=True)
    return updated_df


In [4]:
# Look the requested title up locally; fall back to the API when it is missing.
mov_id = input("Enter your movie ID (e.g., tt0133093): ").strip()
searched = df[df['tconst'] == mov_id]
if searched.empty:
    print(f"Movie with ID {mov_id} not found in DataFrame. Adding it now...")
    df = add_new_movie(mov_id, df)
    # `searched` was computed before the row was appended and is still empty,
    # so indexing it would raise IndexError. Query the updated frame instead.
    searched = df[df['tconst'] == mov_id]
    if searched.empty:
        # add_new_movie returns the frame unchanged when the lookup fails.
        movie_title_string = None
        print(f"Movie with ID {mov_id} could not be added; no title is available.")
    else:
        movie_title_string = searched['primaryTitle'].iloc[0]
else:
    movie_title_string = searched['primaryTitle'].iloc[0]
    print(f"Movie '{movie_title_string}' already exists in the DataFrame.")

Movie 'The Matrix' already exists in the DataFrame.


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Replace missing values in the text feature columns with empty strings so the
# string handling in the following cells never sees NaN.
for _col in ('genres', 'directors', 'writers'):
    df[_col] = df[_col].fillna('')

In [6]:
def clean_names(x):
    """Turn a list of names, or a comma-separated string of names, into tokens.

    Each entry has its spaces removed and is lower-cased, so a multi-word name
    becomes a single token. Any input that is neither a list nor a string
    yields an empty list.
    """
    if isinstance(x, str):
        parts = x.split(',')
    elif isinstance(x, list):
        parts = x
    else:
        return []
    return [part.replace(" ", "").lower() for part in parts]

In [7]:
# Tokenise each text column into a list of space-free, lower-case entries.
# astype(str) means clean_names always receives a string here, so its
# comma-splitting branch does the same work the inline genres lambda did.
for _source_col, _clean_col in (
    ('directors', 'directors_clean'),
    ('writers', 'writers_clean'),
    ('genres', 'genres_clean'),
):
    df[_clean_col] = df[_source_col].astype(str).apply(clean_names)

def create_soup(x):
    """Join a row's writer, director and genre tokens into one space-separated string."""
    token_groups = (x['writers_clean'], x['directors_clean'], x['genres_clean'])
    return ' '.join(' '.join(group) for group in token_groups)

# Build the per-row text that the TF-IDF vectoriser consumes.
df['features_soup'] = df.apply(create_soup, axis='columns')

In [8]:
# Vectorise the feature soup: one row per movie, one column per token.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features_soup'])
print("Shape of TF-IDF Matrix:", tfidf_matrix.shape, sep="\n")

Shape of TF-IDF Matrix:
(5787, 8415)


In [9]:
# Pairwise similarity between every pair of movies (rows of the TF-IDF matrix).
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)
print("Shape of Cosine Similarity Matrix:", cosine_sim.shape, sep="\n")

Shape of Cosine Similarity Matrix:
(5787, 5787)


In [10]:
# Map each title to its row label, keeping the first row for a repeated title.
# Series.drop_duplicates() compares the *values* (the row labels), so it never
# removed a repeated title; deduplicating on the index is what does that.
indices = pd.Series(df.index, index=df['primaryTitle'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, data=df, top_n=10):
    """Return the titles most similar to ``title``.

    Args:
        title: Value looked up in the module-level ``indices`` Series.
        cosine_sim: Square similarity matrix; row ``i`` holds the scores of
            row ``i`` of ``data`` against every other row.
        data: DataFrame with a ``primaryTitle`` column, positionally aligned
            with ``cosine_sim``.
        top_n: How many recommendations to return (10 by default).

    Returns:
        A Series of ``primaryTitle`` values ordered from most to least
        similar, or an error string when ``title`` is not in ``indices``.
    """
    if title not in indices:
        return f"Error: Movie '{title}' not found in the dataset."

    # A title that occurs more than once yields a Series; use its first row.
    match = indices[title]
    idx = match.iloc[0] if isinstance(match, pd.Series) else match

    # Exclude the query row by position rather than assuming it sorts first.
    # When another row ties with it at the top (or its feature vector is all
    # zeros), slicing off element 0 can drop a real neighbour and return the
    # query movie itself.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sim_scores[:top_n]]
    return data['primaryTitle'].iloc[movie_indices]

In [11]:
# Print a header followed by the recommendations for the selected movie.
print(
    f"Recommendations for '{movie_title_string}':",
    get_recommendations(movie_title_string),
    sep="\n",
)

Recommendations for 'The Matrix':
2488              The Matrix Reloaded
4720           The Matrix Revolutions
2218                            Bound
1598                      Cloud Atlas
166                    V for Vendetta
3329                      Cloverfield
117                     Jurassic Park
5170    The Lost World: Jurassic Park
4627                       Rollerball
4628                       Rollerball
Name: primaryTitle, dtype: object
