In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import joblib

In [3]:
# Load the dataset
data = pd.read_csv('movie_metadata.csv')

In [4]:
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [5]:
data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [6]:
data.shape

(5043, 28)

In [7]:
data.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      14
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [8]:
# Preprocessing
data.dropna(inplace=True)  # Remove rows with missing 'country' values
data.drop_duplicates(inplace=True)  # Remove duplicate rows

In [9]:
data.isnull().sum()

color                        0
director_name                0
num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
gross                        0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
movie_imdb_link              0
num_user_for_reviews         0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

In [10]:
# Select relevant columns for recommendation
selected_columns = ['movie_title', 'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'country']
data = data[selected_columns]

In [11]:
# Combine selected columns into a single text column for TF-IDF vectorization
data['combined_features'] = data.apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

In [12]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_features'])

In [13]:
# Train the recommendation model
knn_model = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
knn_model.fit(tfidf_matrix)

In [14]:
# Save the model and vectorizer
joblib.dump(knn_model, 'geographic_recommendation_model.pkl')
joblib.dump(tfidf_vectorizer, 'geographic_tfidf_vectorizer.pkl')

['geographic_tfidf_vectorizer.pkl']

In [16]:
def recommend_movies(movie_title, num_recommendations=5):
    # Load the trained model and TF-IDF vectorizer
    knn_model = joblib.load('geographic_recommendation_model.pkl')
    tfidf_vectorizer = joblib.load('geographic_tfidf_vectorizer.pkl')
    
    # Transform the input movie title into TF-IDF vector
    input_vector = tfidf_vectorizer.transform([movie_title])
    
    # Find the nearest neighbors
    distances, indices = knn_model.kneighbors(input_vector, n_neighbors=num_recommendations+1)
    
    # Get the indices of recommended movies (excluding the input movie)
    recommended_indices = indices.flatten()[1:]
    
    # Get the movie titles of recommended movies
    recommended_movies = data.iloc[recommended_indices]['movie_title'].tolist()
    
    return recommended_movies

# Example usage:
movie_title = "Rush Hour"
recommended_movies = recommend_movies(movie_title)
print("Recommended Movies:")
for i, movie in enumerate(recommended_movies, start=1):
    print(f"{i}. {movie}")


Recommended Movies:
1. Rush Hour 3 
2. Everything Must Go 
3. One Hour Photo 
4. 25th Hour 
5. Rush 
