In [None]:
# importing numpy and pandas
import numpy as np
import pandas as pd
import sklearn as sk

In [None]:
# reading both csv's data and saving in a variables named movie and credit
movies = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
# printing top 5 rows of movie csv
movies.head()

In [None]:
# printing top 5 rows of credit 
credit.head()

In [None]:
# merging both db based on title
movies = movies.merge(credit, on='title')

In [None]:
type(movies)


In [None]:
print(movies.columns)


In [None]:
movies = movies[['genres', 'title', 'overview', 'keywords', 'crew', 'cast', 'movie_id']]

In [None]:
#check missing data

movies.isnull().sum()

In [None]:
movies.dropna(inplace=True)

In [None]:
# check duplicate data
movies.duplicated().sum()

In [None]:
print(type(movies['genres'].iloc[0]))
print(movies['genres'].iloc[0])


In [None]:
# data cleaning of genres
movies.iloc[0].genres

In [None]:
# this is helper function to clean genre column and get only genre name 
# ast.literal_eval is used for changing string indices in integer
import ast

def convert(obj):
    if isinstance(obj, str):
        try:
            data = ast.literal_eval(obj)
            return [d['name'] for d in data]  # extract "name" field
        except:
            return []
    elif isinstance(obj, list):
        return [d['name'] if isinstance(d, dict) else d for d in obj]
    else:
        return []
  

In [None]:
movies['genres'] = movies['genres'].apply(convert)

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
# helper function fot top 3 cast

def convert_cast(obj):
    if isinstance(obj, str):
        try:
            data = ast.literal_eval(obj)
            return [d['name'] for d in data[:3]] 
        except:
            return []
    elif isinstance(obj, list):
        return [d['name'] if isinstance(d, dict) else d for d in obj[:3]]
    else:
        return []
  


In [None]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [None]:
# helper function to extractt director name from crew

def director(obj):
    try:
        data = ast.literal_eval(obj) if isinstance(obj, str) else obj
        for d in data:
            if d.get('job') == 'Director':
                return d['name']
        return None
    except Exception as e:
        return None




In [None]:
movies['crew'] = movies['crew'].apply(director)

In [None]:
# change overview colunm from string to list
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
#removing spaces
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
# movies['crew'] = movies['crew'].apply(lambda x: ["".join(x).replace(" ", "")])
movies['crew'] = movies['crew'].apply(
    lambda x: x.replace(" ", "") if isinstance(x, str) else ""
)



In [None]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['cast'] + movies['crew'] 

In [None]:
new_df = movies[['title', 'tags', 'movie_id']]

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

In [None]:
new_df['tags'][2]

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [None]:
new_df.head()

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)    

In [None]:
new_df['tags'] = new_df['tags'].apply(stem)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
cv.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [None]:
similarity[1]

In [187]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [193]:
recommend('Iron Man')

Iron Man 3
Iron Man 2
Avengers: Age of Ultron
The Helix... Loaded
Teenage Mutant Ninja Turtles II: The Secret of the Ooze


In [184]:
new_df.iloc[942].title

'The Book of Life'

In [195]:
import pickle

In [196]:
pickle.dump(new_df, open('movies.pkl','wb'))

In [197]:
pickle.dump(similarity,open('similarity.pkl','wb'))