In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [2]:
movies = pd.read_csv("datasets/tmdb_movie_v11.csv")

In [3]:
movies.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [4]:
movies = movies[movies["original_language"] == "en"]

In [5]:
movies = movies[movies["spoken_languages"].str.contains("English", na=False, case=False)]

In [6]:
movies = movies[movies["poster_path"] != ""]

In [7]:
df = movies.drop(columns=["vote_count", "status", "budget", "homepage", "original_language", "imdb_id", "title", "overview", "backdrop_path", "tagline", "spoken_languages", "production_countries"])

In [8]:
df.shape[0]

257095

In [9]:
df.head()

Unnamed: 0,id,vote_average,release_date,revenue,runtime,adult,original_title,popularity,poster_path,genres,production_companies,keywords
0,27205,8.364,2010-07-15,825532764,148,False,Inception,83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","rescue, mission, dream, airplane, paris, franc..."
1,157336,8.417,2014-11-05,701729206,169,False,Interstellar,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","rescue, future, spacecraft, race against time,..."
2,155,8.512,2008-07-16,1004558444,152,False,The Dark Knight,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","joker, sadism, chaos, secret identity, crime f..."
3,19995,7.573,2009-12-15,2923706026,162,False,Avatar,79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","future, society, culture clash, space travel, ..."
4,24428,7.71,2012-04-25,1518815515,143,False,The Avengers,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,"Science Fiction, Action, Adventure",Marvel Studios,"new york city, superhero, shield, based on com..."


In [10]:
def split_string(x):
    if isinstance(x, str):
        return x.split(",")
    return []

In [11]:
df["genres"] = df["genres"].apply(split_string)
df["keywords"] = df["keywords"].apply(split_string)
df["production_companies"] = df["production_companies"].apply(split_string)

In [12]:
mlb = MultiLabelBinarizer(sparse_output=True)

genres_binary = mlb.fit_transform(df["genres"])

df["keywords_str"] = df["keywords"].apply(lambda x: " ".join(x))
tfidf = TfidfVectorizer(max_features=5000)
keywords_tfidf = tfidf.fit_transform(df["keywords_str"])

production_companies_binary = mlb.fit_transform(df["production_companies"])

In [13]:
scaler = MinMaxScaler()

runtime_filled = df["runtime"].fillna(df["runtime"].median()).values.reshape(-1, 1)
runtime_scaled = scaler.fit_transform(runtime_filled)

release_date = pd.to_datetime(df["release_date"], errors="coerce")
release_year = release_date.dt.year
median_year = release_year.median()
release_year_filled = release_year.fillna(median_year).astype("int16")
release_year_scaled = scaler.fit_transform(release_year_filled.values.reshape(-1, 1))

popularity_filled = df["popularity"].fillna(df["popularity"].median()).values.reshape(-1, 1)
popularity_scaled = scaler.fit_transform(popularity_filled)

revenue_scaled = scaler.fit_transform(df["revenue"].values.reshape(-1, 1))

vote_average_scaled = scaler.fit_transform(df["vote_average"].values.reshape(-1, 1))

adult_scaled = scaler.fit_transform(df["adult"].astype(np.float32).values.reshape(-1, 1))

In [14]:
numerical_features = [
    popularity_scaled,
    revenue_scaled,
    runtime_scaled,
    vote_average_scaled,
    adult_scaled,
    release_year_scaled,
]
numerical_features_sparse = [csr_matrix(feat) for feat in numerical_features]

In [15]:
feature_matrix = hstack(
    [genres_binary, keywords_tfidf, production_companies_binary] + numerical_features_sparse
)

In [16]:
nn_cosine = NearestNeighbors(metric="cosine", n_neighbors=20).fit(feature_matrix)

In [17]:
joblib.dump((df, feature_matrix, nn_cosine), "models/mrs_model_v11.pkl.gz", compress=("gzip", 3))

['models/mrs_model_v11.pkl.gz']