# Movie Recomendation System

In [1]:
# Import libraries
import numpy as np # For mathematical
import pandas as pd # For data visusalization

import ast # Import the 'ast' module, which provides tools to work with Python's Abstract Syntax Trees.

from sklearn.feature_extraction.text import CountVectorizer # For vectorization
from sklearn.metrics.pairwise import cosine_similarity # Importing cosine_similarity to compute the similarity between two vectors (e.g., two documents)

import nltk # Importing the Natural Language Toolkit library, commonly used for text preprocessing tasks
from nltk.stem.porter import PorterStemmer # Importing the PorterStemmer from nltk, which reduces words to their root form (e.g., "running" -> "run")

In [2]:
df = pd.read_csv('Movie/tmdb_5000_movies.csv')
df2 = pd.read_csv('Movie/tmdb_5000_credits.csv')

# Data PreProcessing

In [3]:
# Merging two data frame into a one data frame
df = df.merge(df2, on='title') # Two data frame merging into the basis of 'title' column

In [4]:
# Remove ecessary except genres, movie_id, keywords, title, overviwe, cast, crew
df = df[['movie_id', 'title',  'overview', 'genres', 'keywords', 'cast', 'crew']]

In [5]:
# Dropping the missing value
df.dropna(inplace=True)

In [6]:
# PreProcessing of 'genres' column
# only evaluates safe literals like strings, numbers, lists, dicts, tuples, booleans, and
# Changing format
def convert(obj):
    return [i['name'] for i in ast.literal_eval(obj)]

# Taking the 1st 3 name info
def convert3(obj):
    return [i['name'] for i in ast.literal_eval(obj)[:3]]

# Taking the director
def fetch_director(obj):
    return [i['name'] for i in ast.literal_eval(obj) if i.get('job') == 'Director'][:1]

# The output of converted columns
df['genres'] = df['genres'].apply(convert)
df['keywords'] = df['keywords'].apply(convert)
df['cast'] = df['cast'].apply(convert3)
df['crew'] = df['crew'].apply(fetch_director)

# Overview column which is in string format converting a list format
df['overview'] = df['overview'].apply(lambda x:x.split())

In [7]:
# Removing white space
df['genres'] = df['genres'].apply(lambda x:[i.replace(' ', '') for i in x])
df['keywords'] = df['keywords'].apply(lambda x:[i.replace(' ', '') for i in x])
df['cast'] = df['cast'].apply(lambda x:[i.replace(' ', '') for i in x])
df['crew'] = df['crew'].apply(lambda x:[i.replace(' ', '') for i in x])

In [8]:
# Making a new column where we concatinate 'overview', 'genres', 'keywords', 'cast', 'crew' this columns
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']

In [9]:
# Removing columns
df.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'], inplace=True)

In [10]:
# Converting a list into a strings
df['tags'] = df['tags'].apply(lambda x:' '.join(x))

In [11]:
# Converting all charecters in lowercase
df['tags'] = df['tags'].apply(lambda x:x.lower())

# Vectorization

In [12]:
# Vectorization
cv = CountVectorizer(max_features=5000, stop_words='english')

In [13]:
vectors = cv.fit_transform(df['tags']).toarray()

In [14]:
# Creating a obj
ps = PorterStemmer()

In [15]:
# Creating a helper function
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return ' '.join(y)

In [16]:
df['tags'] = df['tags'].apply(stem)

In [17]:
# Passing the vectors in cosine_similarity
similarity = cosine_similarity(vectors)

In [18]:
def recommend(movie):
    movie_index = df[df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(df.iloc[i[0]].title)

In [19]:
recommend("Avatar")

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem


# This project builds a content-based movie recommendation system using the TMDB dataset. It analyzes features such as genres, cast, crew, and keywords to compute similarity between movies. By combining these features into a single "tags" column and applying TF-IDF vectorization with cosine similarity, the system recommends movies that are most similar to a given title. The project includes data preprocessing, feature engineering, and saving the model components for reuse.