In [None]:
import pandas as pd
# Load the four CSV tables from the project's GitHub repository (needs network access).
# `movies` supplies movieId/title/genres, `ratings` supplies userId/movieId/rating/timestamp,
# and `links` supplies imdbId/tmdbId; these columns are the ones the cells below rely on.
movies = pd.read_csv("https://raw.githubusercontent.com/angelahuangg/Movie-Recommendation/main/data/movies.csv")
links = pd.read_csv("https://raw.githubusercontent.com/angelahuangg/Movie-Recommendation/main/data/links.csv")
ratings = pd.read_csv("https://raw.githubusercontent.com/angelahuangg/Movie-Recommendation/main/data/ratings.csv")
tags = pd.read_csv("https://raw.githubusercontent.com/angelahuangg/Movie-Recommendation/main/data/tags.csv")

In [None]:
# Build one row per rating, enriched with the movie's title/genres and its IMDb id.
# Everything is derived from the raw frames in a single chain (no in-place mutation),
# so re-running this cell always yields the same `df`.
df = (
    ratings
    # Default inner joins: a rating whose movieId is absent from `movies` or `links` is dropped.
    .merge(movies, on="movieId")
    .merge(links, on="movieId")
    .drop(columns=["tmdbId"])
    .drop_duplicates()
    .set_index("movieId")
    # Keep movieId as a regular column as well as the index; the training cell
    # selects df[['userId', 'movieId', 'rating']].
    .assign(movieId=lambda frame: frame.index)
)
df

Unnamed: 0_level_0,userId,rating,timestamp,title,genres,imdbId,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,1
1,5,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,1
1,7,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,1
1,15,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,1
1,17,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,1
...,...,...,...,...,...,...,...
160341,610,2.5,1479545749,Bloodmoon (1997),Action|Thriller,118745,160341
160527,610,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama,66806,160527
160836,610,3.0,1493844794,Hazard (2005),Action|Drama|Thriller,798722,160836
163937,610,3.5,1493848789,Blair Witch (2016),Horror|Thriller,1540011,163937


In [None]:
# Pinned to the release this notebook was run against; `%pip` installs into the
# environment of the running kernel.
%pip install scikit-surprise==1.1.3

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/772.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m757.8/772.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163002 sha256=48b32141a4eaa59adc6a9fdc24146b772029b0b90ef16ea67b3fe19da102cbff
  Stored in directory: /root/.cache/pip/wheels

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over movie titles, using both single words and two-word sequences (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on every title and keep the resulting matrix: `search` compares a query vector
# against it and uses the resulting positions to index `movies` with `.iloc`.
tfidf = vectorizer.fit_transform(movies["title"])

In [None]:
import pickle

# Persist every table and the fitted TF-IDF objects into the current working directory.

# Data frames are written with pandas' own pickle writer.
for target, frame in (
    ("./df.pkl", df),
    ("./movies.pkl", movies),
    ("./links.pkl", links),
    ("./ratings.pkl", ratings),
    ("./tags.pkl", tags),
):
    frame.to_pickle(target)

# The vectorizer and the TF-IDF matrix go through the pickle module directly,
# using the highest protocol available.
for target, obj in (("vectorizer.pickle", vectorizer), ("tfidf.pickle", tfidf)):
    with open(target, "wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Find searched movie using cosine similarity
def search(title, n=5):
    """Return the rows of `movies` whose titles best match `title`, best match first.

    The query is vectorized with the module-level `vectorizer` and compared against
    the module-level `tfidf` matrix by cosine similarity.

    Args:
        title: Free-text title to look up.
        n: Maximum number of rows to return (default 5). If the catalogue has fewer
            rows than `n`, all rows are returned.

    Returns:
        A slice of `movies` ordered by decreasing similarity. It is empty when `n`
        is not positive or there is nothing to compare against.
    """
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    top_n = min(n, similarity.size)
    if top_n <= 0:
        return movies.iloc[0:0]

    # argpartition selects the top_n scores cheaply but leaves them in arbitrary
    # order, so the candidates are sorted explicitly before being returned.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    best_first = np.argsort(-similarity[candidates], kind="stable")

    return movies.iloc[candidates[best_first]]

In [None]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNBasic
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import ipywidgets as widgets
import pickle
from surprise import accuracy

# When loading from a DataFrame, Surprise only reads `rating_scale` from the Reader.
# The scale is taken from the data rather than assumed to be (1, 5): estimates are
# clipped to this range, so it has to cover every rating that actually occurs.
reader = Reader(rating_scale=(float(df['rating'].min()), float(df['rating'].max())))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)

# Use SVD model, similar to matrix factorization
algo = SVD()

raw_ratings = data.raw_ratings
train_size = int(0.8 * len(raw_ratings))

# Shuffle once with a fixed seed so the 80/20 split below is reproducible.
np.random.seed(42)
np.random.shuffle(raw_ratings)

# Split: first 80% for training, remaining 20% held out for evaluation
train_raw_ratings = raw_ratings[:train_size]
test_raw_ratings = raw_ratings[train_size:]
test_df = pd.DataFrame(test_raw_ratings, columns=['userId', 'movieId', 'rating', 'timestamp'])

# Restrict the dataset to the training portion before building the trainset.
data.raw_ratings = train_raw_ratings
test_data = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader)

# Train model
trainset = data.build_full_trainset()
algo.fit(trainset)

# Test model on the held-out ratings
testset = test_data.build_full_trainset().build_testset()
predictions = algo.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)

# Round-trip the fitted model through disk. The context managers guarantee the
# file is flushed and closed before it is read back, and that no handle is leaked.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(algo, model_file)
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

RMSE: 0.8728
RMSE: 0.87282421167397


In [None]:
# Get similar movies
def get_top_n_recommendations(model, movie_id, n=10):
    """Return the titles of the `n` best-scoring movies that share `movie_id`'s genres.

    Candidates come from `find_similar_movies_cf_with_genre`. Each candidate is scored
    with `model.predict` for the placeholder user id 'dummy_user', and the candidates
    are ranked by the prediction's `est` value, highest first.

    Args:
        model: Object exposing `predict(uid=..., iid=...)` whose result has `est`
            and `iid` attributes.
        movie_id: Id of the seed movie, as found in `movies['movieId']`.
        n: Maximum number of titles to return.

    Returns:
        A list of at most `n` movie titles. The seed movie itself is never included.
    """
    # The genre filter also matches the seed movie; drop it so the user is not
    # recommended the very movie they started from.
    candidate_ids = [
        candidate_id
        for candidate_id in find_similar_movies_cf_with_genre(movie_id)
        if candidate_id != movie_id
    ]

    # Predict ratings
    predictions = [
        model.predict(uid='dummy_user', iid=candidate_id)
        for candidate_id in candidate_ids
    ]

    # Sort predictions by estimated rating in descending order
    ranked = sorted(predictions, key=lambda prediction: prediction.est, reverse=True)

    # Build the id -> title lookup once instead of scanning `movies` per result.
    # Keeping the first row per id matches a plain "first match" lookup.
    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']

    return [title_by_id.loc[prediction.iid] for prediction in ranked[:n]]

In [None]:
# Find movies with similar genre
def find_similar_movies_cf_with_genre(movie_id, n=10):
    """Return the ids of all movies whose genre string contains the given movie's genre string.

    Matching is a literal substring test on the raw `genres` value, so the seed
    movie always matches itself and is part of the result.

    NOTE(review): because this is a substring test, a movie tagged
    'Action|Crime|Thriller' does not match a seed tagged 'Action|Thriller'.
    Confirm whether set-based genre matching was intended.

    Args:
        movie_id: Id of the seed movie, as found in `movies['movieId']`.
        n: Currently unused; kept so existing calls that pass it keep working.

    Returns:
        A list of movieId values, in the row order of `movies`.

    Raises:
        IndexError: If `movie_id` does not occur in `movies`.
    """
    input_movie_genres = movies.loc[movies['movieId'] == movie_id, 'genres'].iloc[0]

    # One vectorized pass over the table instead of re-scanning `movies` for every row.
    # regex=False keeps '|' a literal character; na=False treats missing genres as no match.
    genre_match = movies['genres'].str.contains(input_movie_genres, regex=False, na=False)

    return movies.loc[genre_match, 'movieId'].tolist()

In [None]:
# Quick test
# End-to-end check: search for a title, use the first returned row as the seed movie,
# and display the recommendations produced for it by the in-memory `algo` model.
results = search("No Game No Life: Zero")
if not results.empty:
    movie_id = results.iloc[0]["movieId"]
    display(get_top_n_recommendations(algo, movie_id))

['Fight Club (1999)',
 'Apocalypse Now (1979)',
 'Princess Bride, The (1987)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Dark Knight, The (2008)',
 'Evil Dead II (Dead by Dawn) (1987)',
 'Great Escape, The (1963)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Outlaw Josey Wales, The (1976)',
 'Boot, Das (Boat, The) (1981)']