In [2]:
import numpy as np
import pandas as pd

movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# combining the two datasets
movies = movies.merge(credits, on = 'title')

# Choosing required columns - eliminated all numeric columns and columns that dont contribute to recommending on the basis of content
# genres, id, keywords, title, overview, cast, crew
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

movies.isnull().sum()
movies.dropna(inplace=True)
movies.duplicated().sum()

#we use this function to extract the ‘name’ value from dictionaries present in ‘genres’ and ‘keywords’
import ast
def convert(obj):
  L = []
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L


movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
# for extracting the first 3 names of cast
def convert3(obj):
  L = []
  counter = 0
  for i in ast.literal_eval(obj):
    if counter!= 3:
      L.append(i['name'])
      counter = counter + 1
    else:
      break
  return L
movies['cast'] = movies['cast'].apply(convert3)

# for extracting the ‘director’ value from ‘crew’ attribute
def fetch_director(obj):
  L = []
  for i in ast.literal_eval(obj):
    if(i['job'] == 'Director'):
      L.append(i['name'])
      break
  return L
movies['crew'] = movies['crew'].apply(fetch_director)

# performing final pre processing
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

# creating ‘tags’ and new_df

movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new_df = movies[['movie_id', 'title', 'tags']]
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


# Stemming

In [3]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

new_df['tags'] = new_df['tags'].apply(stem)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


# Text Vectorization

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()
cv.get_feature_names_out()


array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

# Cosine Similarity

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

# recommendation function
def recommend(movie):
  movie_index = new_df[new_df['title'] == movie].index[0]
  distances = similarity[movie_index]
  movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x:x[1])[1:6]

  for i in movies_list:
    print(new_df.iloc[i[0]].title)


# Transferring using pickle

In [6]:
import pickle

#because to use pickle we cant directly transfer pandas dataframe
new_df = new_df.to_dict()
pickle.dump(new_df, open('movie_dict.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

