In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
movies=pd.read_csv('merged_movie.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2648 entries, 0 to 2647
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                2648 non-null   int64  
 1   keywords              2546 non-null   object 
 2   overview              2648 non-null   object 
 3   popularity            2648 non-null   float64
 4   production_companies  2588 non-null   object 
 5   production_countries  2624 non-null   object 
 6   release_date          2648 non-null   object 
 7   revenue               2648 non-null   int64  
 8   runtime               2648 non-null   float64
 9   status                2648 non-null   object 
 10  tagline               2394 non-null   object 
 11  title                 2648 non-null   object 
 12  vote_average          2648 non-null   float64
 13  vote_count            2648 non-null   int64  
 14  movie_id              2648 non-null   float64
 15  cast_names           

# unsupervised-learning recommendation

In [3]:
# content-based recommendation
# step 1: select 'genre','keywords','cast_names','crew_names' to combine as a feature.
movies['combined_features'] = (
    movies['genres'].fillna('') + ' ' + 
    movies['cast_names'].fillna('') + ' ' +
    movies['keywords'].fillna('')
     
)
# step 2: calculate the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['combined_features'])

# step 3: calculate the similarity of cosine
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# step 4: recommendation based on the movie title
def recommend_movies(movie_title, num_recommendations):
    # get the target movie index. Here it locates the first movie.
    movie_idx = movies[movies['title'] == movie_title].index[0]
    
    #  get the similarity score with this movie
    sim_scores = list(enumerate(cosine_sim[movie_idx]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # recommendate similar movies
    sim_scores = sim_scores[1:num_recommendations + 1]
    movie_indices = [idx for idx, score in sim_scores]
    suggest_idx = [score[0] for score in sim_scores]
    suggest_score = [score[1] for score in sim_scores]
    recommend_movies = movies.iloc[suggest_idx]['title']
    return pd.DataFrame({'Title': recommend_movies, 'Similarity Score': suggest_score})

# sample
recommendations = recommend_movies('Avatar', 5)
recommendations

Unnamed: 0,Title,Similarity Score
2023,Alien,0.141062
2520,Silent Running,0.1386
218,Planet of the Apes,0.137064
1603,Aliens,0.136016
190,Gravity,0.128836


In [4]:
# KNN recommendation

# step 1: select features
movie_features = movies[['vote_average', 'vote_count', 'popularity']].fillna(0)

# step 2: standardize the feature
scaler = StandardScaler()
scaled_features = scaler.fit_transform(movie_features)

# step 3: use cosine value to define the neighbours
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(scaled_features)

# step 4 : define the function
def knn_recommend(movie_title, n_neighbors=5):
    movie_idx = movies[movies['title'] == movie_title].index[0]
    distances, indices = knn.kneighbors([scaled_features[movie_idx]], n_neighbors=n_neighbors + 1)
    recommended_movies = movies.iloc[indices[0][1:]][['title', 'vote_average', 'vote_count', 'popularity']]
    return recommended_movies

# sample
knn_recommend('Avatar', 5)


Unnamed: 0,title,vote_average,vote_count,popularity
35,World War Z,6.7,5560,81.834855
1007,Resident Evil,6.4,2065,40.715623
113,White House Down,6.4,1891,39.004588
104,Thor,6.6,6525,86.493424
21,Skyfall,6.9,7604,93.004993


In [5]:
# score-genres based filter
# calculate the weighted score
m = movies['vote_count'].quantile(0.75)  # use the 0.75 quantile as the minimum
C = movies['vote_average'].mean()  

def weighted_rating(row, m=m, C=C):
    v = row['vote_count']
    R = row['vote_average']
    return (v / (v + m) * R) + (m / (v + m) * C)

movies['weighted_rating'] = movies.apply(weighted_rating, axis=1)


def get_score_based_recommendations(genre=None, data=movies, top_n=10):
    if genre:
        filtered_data = data[movies['genres'].str.contains(genre, case=False, na=False)]
    else:
        filtered_data = data
    top_movies = filtered_data.sort_values('weighted_rating', ascending=False).head(top_n)
    return top_movies[['title', 'vote_average', 'vote_count', 'weighted_rating']]

# sample
print("Top 10 Weighted_rating:")
print(get_score_based_recommendations())
print("\nTop 10 Action Movies:")
print(get_score_based_recommendations('Action'))

Top 10 Weighted_rating:
                        title  vote_average  vote_count  weighted_rating
501                Fight Club           8.3        9413         8.096091
2054             Pulp Fiction           8.3        8428         8.074926
609              Forrest Gump           8.2        7927         7.974196
78                  Inception           8.1       13752         7.970153
77               Interstellar           8.1       10867         7.938750
1281         Schindler's List           8.3        4329         7.903793
2346                 Whiplash           8.3        4254         7.898203
61                 Inside Out           8.0        6560         7.761565
1538       Back to the Future           8.0        6079         7.745505
76    Guardians of the Galaxy           7.9        9742         7.741619

Top 10 Action Movies:
                        title  vote_average  vote_count  weighted_rating
501                Fight Club           8.3        9413         8.096091
78  

In [6]:
# context and weighted-rating based recommendation
data=movies
def context_rating_recommend(user_language=None, user_country=None, runtime_range=None, release_year=None, data=data, top_n=5):
    filtered_data = data.copy()
    
    # based on language
    if user_language:
        filtered_data = filtered_data[filtered_data['language'].str.contains(user_language, na=False)]
    
    # based on production_countries
    if user_country:
        filtered_data = filtered_data[filtered_data['production_countries'].str.contains(user_country, na=False)]
    
    # based on run time
    if runtime_range:
        min_runtime, max_runtime = runtime_range
        filtered_data = filtered_data[(filtered_data['runtime'] >= min_runtime) & (filtered_data['runtime'] <= max_runtime)]
    
    # based on release year
    if release_year:
        filtered_data['release_year'] = pd.to_datetime(filtered_data['release_date'], format='%m/%d/%y', errors='coerce').dt.year
        filtered_data = filtered_data[filtered_data['release_year'] == release_year]
    
    # based on weighted_rating
    return filtered_data.sort_values(by='weighted_rating', ascending=False)['title'].head(top_n).tolist()


print("Context Recommendations (English, USA, 90-150 min, 2015):")
print(context_rating_recommend(
    user_language='English',
    user_country='United States of America',
    runtime_range=(90, 150),
    release_year=2015,
    data=movies
))

Context Recommendations (English, USA, 90-150 min, 2015):
['Inside Out', 'The Martian', 'Spotlight', 'Avengers: Age of Ultron', 'Mad Max: Fury Road']


# supervised-learning recommendation

## use different models to predict 'vote average', then recommendate high 'vote average' movies

In [7]:
data = movies

In [8]:
data['text'] = (
    data['genres'].fillna('') + ' ' +
    data['keywords'].fillna('') + ' ' +
    data['cast_names'].fillna('').str.replace(',', ' ') + ' ' +
    data['crew_names'].fillna('').str.replace(',', ' ')
)
data['release_year'] = pd.to_datetime(data['release_date'], format='%m/%d/%y', errors='coerce').dt.year
x = data[['text', 'language', 'budget', 'runtime', 'popularity', 'release_year']]
y = data['vote_average']

In [9]:
# data preprocess
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=5000, stop_words='english'), 'text'),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['language']),
        ('num', StandardScaler(), ['budget', 'runtime', 'popularity', 'release_year'])
    ])

### use linear regression

In [10]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

In [13]:
sample_movie = x_test.iloc[0:1]
sample_index = sample_movie.index[0]
predicted_rating = pipeline.predict(sample_movie)[0]
actual_rating = y_test.loc[sample_index]

print(f"\nPredicted rating for '{data.loc[sample_index, 'title']}': {predicted_rating:.2f}")
print(f"Actual rating: {actual_rating:.2f}")
print(f"Difference: {abs(predicted_rating - actual_rating):.2f}")


Predicted rating for 'End of Watch': 7.32
Actual rating: 7.20
Difference: 0.12


In [14]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (Linear Regression): {mse}")

Mean Squared Error (Linear Regression): 0.6503718206749078


### use random forest

In [15]:
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regress', RandomForestRegressor(n_estimators=100, random_state=42))
])

pipeline_rf.fit(x_train, y_train)
y_pred_rf = pipeline_rf.predict(x_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)

In [16]:
sample_movie = x_test.iloc[0:1]
sample_index = sample_movie.index[0]
predicted_rating_rf = pipeline_rf.predict(sample_movie)[0]
actual_rating = y_test.loc[sample_index]

print(f"\nPredicted rating for '{data.loc[sample_index, 'title']}': {predicted_rating_rf:.2f}")
print(f"Actual rating: {actual_rating:.2f}")
print(f"Difference: {abs(predicted_rating_rf - actual_rating):.2f}")


Predicted rating for 'End of Watch': 7.02
Actual rating: 7.20
Difference: 0.18


In [17]:
print("\nRandom Forest Regression Metrics:")
print(f"MSE: {mse_rf:.4f}")


Random Forest Regression Metrics:
MSE: 0.3990


### use Gradient Boosting Regression

In [18]:
pipeline_gb = Pipeline([
    ('preprocessor', preprocessor),
    ('regress', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

pipeline_gb.fit(x_train, y_train)
y_pred_gb = pipeline_gb.predict(x_test)

mse_gb = mean_squared_error(y_test, y_pred_gb)

In [19]:
sample_movie = x_test.iloc[0:1]
sample_index = sample_movie.index[0]
predicted_rating_gb = pipeline_gb.predict(sample_movie)[0]
actual_rating = y_test.loc[sample_index]

print(f"\nPredicted rating for '{data.loc[sample_index, 'title']}': {predicted_rating_gb:.2f}")
print(f"Actual rating: {actual_rating:.2f}")
print(f"Difference: {abs(predicted_rating_gb - actual_rating):.2f}")


Predicted rating for 'End of Watch': 7.21
Actual rating: 7.20
Difference: 0.01


In [20]:
print("\nGradient Boosting Regression Metrics:")
print(f"MSE: {mse_gb:.4f}")


Gradient Boosting Regression Metrics:
MSE: 0.3873


## establish user_likes content to recommend

In [21]:
# establish a new column, if user like action or adventure movies, and the vote_average is higher than 7, then user likes it.
data['user_likes'] = ((data['genres'].str.contains('Action|Adventure', na=False)) & (data['vote_average'] > 7)).astype(int)
y_user = data['user_likes']
x_train_user, x_test_user, y_train_user, y_test_user = train_test_split(x, y_user, test_size=0.2, random_state=42)


pipeline_user = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline_user.fit(x_train_user, y_train_user)
like_probabilities = pipeline_user.predict_proba(x)[:, 1]
recommendations = data.iloc[like_probabilities.argsort()[::-1][:5]]['title'].tolist()
print(like_probabilities)
print("Top 5 Recommended Movies for User (Action/Adventure):", recommendations)

[0.68 0.01 0.09 ... 0.   0.01 0.05]
Top 5 Recommended Movies for User (Action/Adventure): ['Back to the Future Part III', 'Monsters, Inc.', 'Harry Potter and the Half-Blood Prince', 'Harry Potter and the Goblet of Fire', 'Harry Potter and the Order of the Phoenix']


In [23]:
y_pred_user = pipeline_user.predict(x_test_user)
print("Random Forest Classification Report:")
print(classification_report(y_test_user, y_pred_user))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       497
           1       0.00      0.00      0.00        33

    accuracy                           0.94       530
   macro avg       0.47      0.50      0.48       530
weighted avg       0.88      0.94      0.91       530



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
