# Collaborative Filtering

## 1. Exploratory Data Analysis

In [1]:
import os

import pandas as pd

In [2]:
# Movie metadata: movieId, title (with release year suffix), and pipe-separated genres.
movie_df = pd.read_csv('data/movie.csv')

In [3]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Rating records: userId, movieId, rating, and the timestamp of the rating.
rating_df = pd.read_csv('data/rating.csv')

In [5]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
# Tag records: userId, movieId, the tag text, and a timestamp.
tags_df = pd.read_csv('data/tag.csv')

In [7]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [8]:
# Cross-reference table: movieId alongside the matching imdbId and tmdbId.
link_df = pd.read_csv('data/link.csv')

In [9]:
link_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [10]:
# Genome tag lookup: tagId and the tag text it stands for.
genome_tags_df = pd.read_csv('data/genome_tags.csv')

In [11]:
genome_tags_df.head(10)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
5,6,1950s
6,7,1960s
7,8,1970s
8,9,1980s
9,10,19th century


In [12]:
genome_tags_df = pd.read_csv('data/genome_scores.csv')

In [13]:
genome_tags_df.head(10)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675
5,1,6,0.217
6,1,7,0.067
7,1,8,0.26275
8,1,9,0.262
9,1,10,0.032


In [14]:
# TITLE ANALYSIS.
movie_df['title'].unique()[:10]

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)',
       'Heat (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)',
       'Sudden Death (1995)', 'GoldenEye (1995)'], dtype=object)

## 2. Data Preprocessing

In [15]:
# Attach each rating to its movie's metadata, matching rows on movieId.
df = movie_df.merge(rating_df, on='movieId')

In [16]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41


In [17]:
# Bring in the external ids (imdbId, tmdbId) for every rated movie.
df = df.merge(link_df, on='movieId')

In [18]:
# Rows lacking a TMDB id are discarded; with no gaps left, the id column
# can be stored as integers instead of floats.
df = df.dropna(subset=['tmdbId']).astype({'tmdbId': int})

In [19]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47,114709,862
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52,114709,862
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51,114709,862
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47,114709,862
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41,114709,862


In [20]:
# DATA CLEANING: the rating timestamp and the IMDb id are removed from the frame.
unused_columns = ['timestamp', 'imdbId']
df = df.drop(columns=unused_columns)

In [21]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,862
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,862
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,862
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,862
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,862


In [22]:
# REPLACE '|' WITH ' '
# The vectorised string accessor leaves missing values untouched instead of
# raising on them; regex=False makes '|' a literal character rather than the
# regular-expression alternation operator.
df['genres'] = df['genres'].str.replace('|', ' ', regex=False)

In [23]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tmdbId
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3,4.0,862
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,6,5.0,862
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,8,4.0,862
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,10,4.0,862
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,11,4.5,862


In [24]:
df['genres'].unique()[:50]

array(['Adventure Animation Children Comedy Fantasy',
       'Adventure Children Fantasy', 'Comedy Romance',
       'Comedy Drama Romance', 'Comedy', 'Action Crime Thriller',
       'Adventure Children', 'Action', 'Action Adventure Thriller',
       'Comedy Horror', 'Adventure Animation Children', 'Drama',
       'Action Adventure Romance', 'Crime Drama', 'Drama Romance',
       'Action Comedy Crime Drama Thriller', 'Comedy Crime Thriller',
       'Crime Drama Horror Mystery Thriller', 'Drama Sci-Fi',
       'Children Drama', 'Adventure Drama Fantasy Mystery Sci-Fi',
       'Mystery Sci-Fi Thriller', 'Adventure Romance IMAX',
       'Documentary IMAX', 'Children Comedy', 'Drama War',
       'Action Crime Drama', 'Action Adventure Fantasy',
       'Comedy Drama Thriller', 'Mystery Thriller',
       'Animation Children Drama Musical Romance',
       'Crime Mystery Thriller', 'Action Drama Thriller',
       'Adventure Drama', 'Drama Mystery', 'Drama Thriller',
       'Comedy Crime', 'Acti

In [25]:
# STORE DATA(AS DATA-FRAME) IN A PICKLE-FILE.
import pickle

# open() does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('artifacts', exist_ok=True)

# OPEN THE FILE IN WRITE-BINARY MODE (pickle emits bytes).
with open('artifacts/rating_data.pkl', 'wb') as f:
    pickle.dump(df, f)

In [26]:
import ast
# FUNCTION TO EXTRACT GENRES AND KEYWORD NAMES
def extract_name(obj):
    """Return the ``'name'`` value of every dictionary in a list.

    Parameters
    ----------
    obj : str, list of dict, or missing value
        Either a list of dictionaries or the string form of one, which is
        parsed with ``ast.literal_eval``. ``None``, NaN, blank strings and
        empty lists are all treated as "no entries".

    Returns
    -------
    list
        The ``'name'`` entries in their original order, or ``[]`` when there
        is nothing to extract.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is a non-blank string that is not a valid Python literal.
    KeyError
        If an entry does not contain a ``'name'`` key.
    """
    # NaN is truthy and not iterable, so it must be caught before the generic
    # emptiness check below. It is the only float that is not equal to itself.
    if isinstance(obj, float) and obj != obj:
        return []

    if isinstance(obj, str):
        # A blank string has nothing to parse; literal_eval would raise on it.
        if not obj.strip():
            return []
        # literal_eval only accepts Python literals, so no code is executed.
        obj = ast.literal_eval(obj)

    # Covers None and empty containers.
    if not obj:
        return []

    return [entry['name'] for entry in obj]


In [27]:
# PREPARE DATA FOR GENRES.
# Read the TMDB metadata, keep the id, original title and genres columns,
# and turn each genres cell into a list of genre names via extract_name.
tmdb_columns = ['id', 'original_title', 'genres']
movies_df = (
    pd.read_csv('data/tmdb_5000_movies.csv')[tmdb_columns]
    .assign(genres=lambda frame: frame['genres'].apply(extract_name))
)

In [28]:
# Keep only the TMDB id and the title. The frame still has one row per
# rating at this point, so each (tmdbId, title) pair repeats.
# NOTE(review): consider de-duplicating here if downstream code only needs one row per movie.
new_df = df.loc[:, ['tmdbId', 'title']]

In [29]:
# Match rated movies to their TMDB metadata: tmdbId on the left, id on the right.
genres_df = new_df.merge(movies_df, left_on='tmdbId', right_on='id')

In [30]:
genres_df.head()

Unnamed: 0,tmdbId,title,id,original_title,genres
0,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
1,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
2,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
3,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
4,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"


In [31]:
# CONSIDER ONE GENRE FROM LIST OF GENRES.
def _primary_genre(value):
    """Return the first genre of a list, or None when there is none.

    A plain string is returned unchanged so that running this cell a second
    time keeps the genre chosen on the first run instead of replacing it
    with None.
    """
    if isinstance(value, list):
        return value[0] if value else None
    if isinstance(value, str):
        return value
    return None


genres_df['genres'] = genres_df['genres'].apply(_primary_genre)

In [32]:
# Keep the TMDB id, the single genre, and the original title, in that order.
genres_df = genres_df.loc[:, ['id', 'genres', 'original_title']]

In [33]:
genres_df.head()

Unnamed: 0,id,genres,original_title
0,862,Animation,Toy Story
1,862,Animation,Toy Story
2,862,Animation,Toy Story
3,862,Animation,Toy Story
4,862,Animation,Toy Story


In [34]:
genres_df['genres'].unique()

array(['Animation', 'Adventure', 'Comedy', 'History', 'Action', 'Drama',
       'Crime', 'Fantasy', 'Music', 'Horror', 'Thriller', 'Romance',
       'Science Fiction', 'Documentary', 'Family', 'War', 'Western',
       'Mystery', None, 'TV Movie', 'Foreign'], dtype=object)

In [36]:
genres_df['original_title'].unique()

array(['Toy Story', 'GoldenEye', 'The American President', ...,
       'The Gunman', 'Escobar: Paradise Lost', 'Get Hard'],
      shape=(4219,), dtype=object)

In [35]:
# STORE GENRES-DATA
# open() does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('artifacts', exist_ok=True)

with open('artifacts/genres_data.pkl', 'wb') as f:
    pickle.dump(genres_df, f)