## Data Processing & Model Building

#### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import ast # To safely evaluate string literals as Python objects   

#### Load and Merge Data

In [3]:
# Load the datasets
movies = pd.read_csv('../tmdb_5000_movies.csv')
credits = pd.read_csv('../tmdb_5000_credits.csv')

# Merge them on the 'title' column
movies = movies.merge(credits, on='title')

# Display the first few rows
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


#### Data Cleaning and Preprocessing

In [4]:
# Select relevant columns for the recommender
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values
movies.dropna(inplace=True)

#### Helper Functions to Process JSON-like Columns

In [5]:
# Helper function to extract names from the JSON-like strings
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

# Helper function to get the top 3 cast members
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

# Helper function to fetch the director's name
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

#### Apply Helper Functions

In [6]:
# Apply the functions to the respective columns
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert3)
movies['crew'] = movies['crew'].apply(fetch_director)

# Display the processed columns
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


#### Create the 'tags' Column

In [7]:
# Remove spaces from the names
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# Convert overview from string to list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Create the 'tags' column by concatenating the processed columns
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new DataFrame with just the essential columns
new_df = movies[['movie_id', 'title', 'tags']]

# Convert the 'tags' list back to a string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Convert tags to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


#### Text Vectorization

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer
# max_features: use the 5000 most frequent words
# stop_words: remove common English words (like 'the', 'a', 'in')
cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit and transform the tags into a vector matrix
vectors = cv.fit_transform(new_df['tags']).toarray()

#### Calculate Cosine Similarity

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity matrix
similarity = cosine_similarity(vectors)

#### Create the Recommendation Function

In [10]:
def recommend(movie):
    # Find the index of the movie
    movie_index = new_df[new_df['title'] == movie].index[0]
    
    # Get the similarity scores for that movie
    distances = similarity[movie_index]
    
    # Sort the movies based on similarity, keeping the index
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    # Print the titles of the top 5 movies
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

# Test the function
recommend('Avatar')

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem


#### Export the Model and Data

In [11]:
import pickle

# Create an 'artifacts' folder if it doesn't exist
import os
if not os.path.exists('../artifacts'):
    os.makedirs('../artifacts')

# Save the DataFrame and the similarity matrix
pickle.dump(new_df.to_dict(), open('../artifacts/movie_dict.pkl', 'wb'))
pickle.dump(similarity, open('../artifacts/similarity.pkl', 'wb'))

print("Files exported successfully!")

Files exported successfully!
