# Hybrid System

In this work, the content-based recommender is combined with the SVD++ rating predictor: the content-based step selects movies similar to a given title, and SVD++ then ranks those candidates by the rating a particular user is predicted to give them.

In [1]:
import pandas as pd
import numpy as np
import joblib
from surprise import Reader, Dataset, SVDpp
from surprise.model_selection import KFold
from surprise import accuracy

## Load the Saved SVD++ Model and the Similarity Matrix and Metadata Created by the Content-Based Recommender System

In [2]:
# Reuse a previously saved SVD++ model when one exists. The file is only written by the
# "Train the Entire Dataset and Save Model" cell further down, so it can be missing on a
# first run; in that case carry on and let the training cells create `svd`.
try:
    with open("hybrid/svdpp.joblib", "rb") as f:
        svd = joblib.load(f)
except FileNotFoundError:
    svd = None
    print("No saved SVD++ model found; run the training cells below to create it.")

# Similarity matrix and metadata frame produced by the content-based recommender
dists = np.load("contentSim/metaSim.npy")
md = pd.read_csv("contentSim/metaFeatures.csv")
md.head()

Unnamed: 0,id,genres,title,cast,keywords,director,desc
0,862,"['animation', 'comedy', 'family']",Toy Story,"['tomhanks', 'timallen', 'donrickles']","['jealousi', 'toy', 'boy', 'friendship', 'frie...",['johnlasseter'],animation comedy family tomhanks timallen donr...
1,8844,"['adventure', 'fantasy', 'family']",Jumanji,"['robinwilliams', 'jonathanhyde', 'kirstendunst']","['disappear', 'basedonchildrensbook']",['joejohnston'],adventure fantasy family robinwilliams jonatha...
2,15602,"['romance', 'comedy']",Grumpier Old Men,"['waltermatthau', 'jacklemmon', 'annmargret']","['fish', 'bestfriend', 'duringcreditssting']",['howarddeutch'],romance comedy waltermatthau jacklemmon annmar...
3,31357,"['comedy', 'drama', 'romance']",Waiting to Exhale,"['whitneyhouston', 'angelabassett', 'lorettade...","['basedonnovel', 'interracialrelationship', 's...",['forestwhitaker'],comedy drama romance whitneyhouston angelabass...
4,11862,['comedy'],Father of the Bride Part II,"['stevemartin', 'dianekeaton', 'martinshort']","['babi', 'midlifecrisi', 'confid', 'age', 'dau...",['charlesshyer'],comedy stevemartin dianekeaton martinshort bab...


## Load Rating Data

In [3]:
ratings = pd.read_csv("../data/movies/ratings_small.csv")

# Surprise clips every prediction to `rating_scale`, so take the bounds from the data
# instead of hard-coding them: a fixed (1, 5) would be wrong if the file contains
# ratings below 1 (e.g. half-star ratings).
ratingScale = (float(ratings["rating"].min()), float(ratings["rating"].max()))
reader = Reader(rating_scale=ratingScale)
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

## Train a Rating Predictor Using SVD++

In [4]:
# Fixed seed so the fold assignment and the SVD++ factor initialisation are the same
# on every run, which makes the reported errors reproducible.
SEED = 42

svd = SVDpp(n_factors=10, n_epochs=20, random_state=SEED, verbose=True)
kf = KFold(n_splits=3, random_state=SEED, shuffle=True)

# One MSE per fold; `accuracy.mse` prints the value and also returns it.
foldMse = []
for k, (trainSet, validSet) in enumerate(kf.split(data)):
    print(f"\nFold {k+1}")
    svd.fit(trainSet)
    foldMse.append(accuracy.mse(svd.test(validSet)))

print(f"\nMean MSE over {len(foldMse)} folds: {np.mean(foldMse):.4f}")


Fold 1
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
MSE: 0.8085

Fold 2
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
MSE: 0.7898

Fold 3
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing 

## Train on the Entire Dataset and Save the Model

In [6]:
# Refit the SVD++ model on every available rating (no hold-out split this time).
trainSet = data.build_full_trainset()
svd.fit(trainSet)

# Persist the fitted model so it can be reloaded without retraining.
with open("hybrid/svdpp.joblib", "wb") as modelFile:
    joblib.dump(svd, modelFile)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


## Link `tmdbId` with `movieId`
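The rating predictor was trained with `movieId` as the item identifier. The similarity matrix, however, follows the row order of the metadata dataframe, which is keyed by the TMDB `id`. We therefore use the links table to attach the matching `movieId` to every metadata row.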

In [7]:
# Lookup table from TMDB id (named `id`, like the metadata column) to the `movieId`
# used in the ratings data.
links = pd.read_csv("../data/movies/links_small.csv")[["movieId", "tmdbId"]]
links = (
    links.dropna(subset=["tmdbId"])
    .astype({"movieId": "int", "tmdbId": "int"})
    .rename(columns={"tmdbId": "id"})
    # Keep a single movieId per TMDB id so the merge below cannot duplicate metadata rows
    .drop_duplicates(subset="id")
)

# Merge using `id`, so we have integrated `movieId` column to the metadata.
# Dropping an existing `movieId` first keeps the cell safe to re-run (otherwise a second
# run would produce `movieId_x` / `movieId_y`).
mdLinked = md.drop(columns="movieId", errors="ignore").merge(
    links, on="id", validate="many_to_one"
)

# Rows of the similarity matrix are matched to the metadata by position, so the merge
# must not drop any row; fail loudly instead of silently misaligning the two.
assert len(mdLinked) == len(md), (
    f"{len(md) - len(mdLinked)} metadata rows have no movieId in links_small.csv; "
    "the metadata would no longer line up with the similarity matrix"
)
md = mdLinked
md.head()

Unnamed: 0,id,genres,title,cast,keywords,director,desc,movieId
0,862,"['animation', 'comedy', 'family']",Toy Story,"['tomhanks', 'timallen', 'donrickles']","['jealousi', 'toy', 'boy', 'friendship', 'frie...",['johnlasseter'],animation comedy family tomhanks timallen donr...,1
1,8844,"['adventure', 'fantasy', 'family']",Jumanji,"['robinwilliams', 'jonathanhyde', 'kirstendunst']","['disappear', 'basedonchildrensbook']",['joejohnston'],adventure fantasy family robinwilliams jonatha...,2
2,15602,"['romance', 'comedy']",Grumpier Old Men,"['waltermatthau', 'jacklemmon', 'annmargret']","['fish', 'bestfriend', 'duringcreditssting']",['howarddeutch'],romance comedy waltermatthau jacklemmon annmar...,3
3,31357,"['comedy', 'drama', 'romance']",Waiting to Exhale,"['whitneyhouston', 'angelabassett', 'lorettade...","['basedonnovel', 'interracialrelationship', 's...",['forestwhitaker'],comedy drama romance whitneyhouston angelabass...,4
4,11862,['comedy'],Father of the Bride Part II,"['stevemartin', 'dianekeaton', 'martinshort']","['babi', 'midlifecrisi', 'confid', 'age', 'dau...",['charlesshyer'],comedy stevemartin dianekeaton martinshort bab...,5


## Make Recommendations
First, we find the movies most similar to the given `title`. For these candidate movies and the particular `userId`, we then predict the ratings the user would give. Finally, the candidates are sorted by predicted rating in descending order and the top k are chosen. `userId=1` and `userId=100` receive different predicted ratings for the same seed title, so the recommendations are personalised to each user.

In [13]:
def getHybridTopkRecommendations(userId, title, metadataDf, similarities, simTopk=25, topk=5, model=None):
    """Print the `topk` movies similar to `title` that `userId` is predicted to rate highest.

    The `simTopk` movies most similar to `title` (excluding the movie itself) are taken
    from `similarities`, a rating is predicted for each of them, and the best `topk`
    are printed in descending order of predicted rating.

    Parameters
    ----------
    userId : user identifier passed to the rating model.
    title : title of the seed movie; the first row with this title is used.
    metadataDf : DataFrame with at least the columns `id`, `title` and `movieId`.
        Its rows must be in the same order as the rows of `similarities`.
    similarities : 2-D array where `similarities[i, j]` is the similarity between the
        movies in rows `i` and `j` of `metadataDf`. It is not modified.
    simTopk : number of most-similar candidates to score (capped at the number of
        other movies available).
    topk : number of recommendations to print.
    model : object exposing `predict(userId, movieId).est`. Defaults to the
        notebook-level `svd` model.

    Raises
    ------
    ValueError
        If no row of `metadataDf` has the given `title`.
    """
    if model is None:
        model = svd

    # Work with row positions: both `similarities` and `.iloc` below are positional,
    # whereas index labels need not be 0..n-1.
    matches = np.flatnonzero((metadataDf["title"] == title).to_numpy())
    if len(matches) == 0:
        raise ValueError("Title not found!")
    # Choose 1st item and its similarity arr
    idx = matches[0]
    # Copy the row: indexing a NumPy matrix returns a view, and writing to it would
    # permanently alter the caller's similarity matrix.
    sim = np.array(similarities[idx], dtype=float)
    # Exclude the seed movie itself from the candidates
    sim[idx] = -np.inf

    # Unordered selection of the `numCandidates` most similar movies; the final order
    # is decided by the predicted rating, so a full sort is unnecessary.
    numCandidates = min(simTopk, len(sim) - 1)
    if numCandidates > 0:
        indices = np.argpartition(-sim, numCandidates - 1)[:numCandidates]
    else:
        indices = np.array([], dtype=int)

    # `.copy()` so the new column is added to an independent frame, not to a slice
    movies = metadataDf.iloc[indices][["id", "title", "movieId"]].copy()
    # Predict ratings and desc sort
    movies["est_rating"] = movies["movieId"].apply(lambda movieId: model.predict(userId, movieId).est)
    movies = movies.sort_values("est_rating", ascending=False).iloc[:topk]

    print(f"\nUser {userId} liked {title}. He/She may like...")
    # Iterate the rows directly (not via a dict) so duplicate titles are not collapsed
    for rank, (recTitle, rating) in enumerate(zip(movies["title"], movies["est_rating"]), start=1):
        print(f"Top{rank}: {recTitle} {rating:.3f}")
    return

In [14]:
# Show the recommendations two different users receive for the same seed titles.
for userId in (1, 100):
    for seedTitle in ("The Shawshank Redemption", "The Truman Show", "The Godfather"):
        getHybridTopkRecommendations(userId, seedTitle, md, dists)


User 1 liked The Shawshank Redemption. He/She may like...
Top1: The Godfather 3.214
Top2: In Bruges 3.084
Top3: Once Upon a Time in America 3.037
Top4: Witness 3.023
Top5: Murder in the First 3.014

User 1 liked The Truman Show. He/She may like...
Top1: Naked 3.080
Top2: The Decline of the American Empire 2.820
Top3: Swimming to Cambodia 2.765
Top4: The Man in the White Suit 2.750
Top5: The Lobster 2.723

User 1 liked The Godfather. He/She may like...
Top1: The Shawshank Redemption 3.390
Top2: On the Waterfront 3.282
Top3: In Bruges 3.084
Top4: Murder in the First 3.014
Top5: Rounders 2.942

User 100 liked The Shawshank Redemption. He/She may like...
Top1: The Godfather 4.320
Top2: Once Upon a Time in America 4.027
Top3: In Bruges 4.022
Top4: Witness 3.878
Top5: Murder in the First 3.808

User 100 liked The Truman Show. He/She may like...
Top1: Naked 3.935
Top2: The Decline of the American Empire 3.696
Top3: Swimming to Cambodia 3.677
Top4: The Man in the White Suit 3.592
Top5: Monste