In [1]:
import pandas as pd

# Load the movie features dataset
movie_features = pd.read_csv("movie_features.csv")

# Display the first few rows
movie_features.head()

Unnamed: 0,tmdbId,title,year,average_rating,rating_count,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,862,Toy Story,1995,3.897,68997.0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,8844,Jumanji,1995,3.276,28904.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,15602,Grumpier Old Men,1995,3.139,13134.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,31357,Waiting to Exhale,1995,2.845,2806.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,11862,Father of the Bride Part II,1995,3.06,13154.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Remove rows with null values
movie_features.dropna(inplace=True)

In [3]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

numeric_features = ['year', 'average_rating', 'rating_count']

movie_features_without_name = movie_features.drop(columns=['title', 'tmdbId'])

ct = make_column_transformer(
    (StandardScaler(), numeric_features),
    remainder='passthrough'
)

In [4]:
from sklearn.neighbors import NearestNeighbors
pipe = make_pipeline(
    ct,
    NearestNeighbors(n_neighbors=11, metric='euclidean', p=1, radius=10.0, algorithm='auto', leaf_size=3)  # Using Euclidean distance
)

pipe.fit(movie_features_without_name)

0,1,2
,steps,"[('columntransformer', ...), ('nearestneighbors', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('standardscaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,11
,radius,10.0
,algorithm,'auto'
,leaf_size,3
,metric,'euclidean'
,p,1
,metric_params,
,n_jobs,


In [5]:
import joblib
joblib.dump(pipe, 'movie_recommender_model.pkl')

['movie_recommender_model.pkl']

In [6]:
model = joblib.load('movie_recommender_model.pkl')

def find_similar_movies(movie_id, num_neighbors=5):
    # Get the movie row based on tmdbId from movie_features
    movie_row = movie_features[movie_features['tmdbId'] == movie_id]
    if movie_row.empty:
        print("Movie id not found.")
        return None

    # Prepare query features by dropping 'title' and 'tmdbId'
    query_features = movie_row.drop(columns=['title', 'tmdbId'])
    
    # Transform the query features using the pipeline's preprocessing step
    query_transformed = model.named_steps['columntransformer'].transform(query_features)
    
    # Retrieve neighbors using the NearestNeighbors model inside the pipeline
    distances, indices = model.named_steps['nearestneighbors'].kneighbors(query_transformed)

    # Remove the queried movie from the neighbors if it's present
    query_index = movie_row.index[0]
    neighbor_indices = [idx for idx in indices[0] if idx != query_index]
    neighbor_indices = neighbor_indices[:num_neighbors]
    
    # Return the similar movies as a list of tuples (tmdbId, title)
    similar_movies = movie_features.iloc[neighbor_indices][['tmdbId', 'title']]
    return list(similar_movies.itertuples(index=False, name=None))

In [8]:
find_similar_movies(movie_id=259720, num_neighbors=10)  # Example movie_id

[(84281, 'Hello I Must Be Going'),
 (284286, 'Learning to Drive'),
 (214075, 'Words and Pictures'),
 (253283, 'Take Care'),
 (127370, 'Unfinished Song (Song for Marion)'),
 (75969, 'Männerherzen... und die ganz ganz große Liebe'),
 (501902, 'Gewoon Vrienden'),
 (53256, 'Three (a.k.a. 3)'),
 (39285, 'Beauty & the Briefcase'),
 (461957, 'Catching Feelings')]