# Hybrid Recommender

A simple weighted hybrid recommendation system built on the [MovieLens](https://grouplens.org/datasets/movielens/100k/) dataset.

    Model 1 - Content-based model with XGBoost 
    Model 2 - Item-based Collaborative Filtering 

In [None]:
#! pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial.distance import pdist, squareform
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

### Dataset

The MovieLens dataset is hosted on the [GroupLens](https://grouplens.org/datasets/movielens/) website. Several versions are available; we will use the MovieLens 100K dataset (Herlocker et al., 1999). This dataset comprises 100,000 ratings, ranging from 1 to 5 stars, from 943 users on 1,682 movies.

It has been cleaned up so that each user has rated at least 20 movies. Simple demographic information about the users (such as age and gender) and genre information about the items are also available. We can download [ml-100k.zip](http://files.grouplens.org/datasets/movielens/ml-100k.zip) and extract the `u.data` file, which contains all 100,000 ratings in a tab-separated format. This notebook loads the predefined `u1.base`/`u1.test` train/test split from the same folder. There are many other files in the folder; a detailed description of each file can be found in the [README](http://files.grouplens.org/datasets/movielens/ml-100k-README.txt) file of the dataset.

In [None]:
# Locations of the MovieLens 100K folder and of the u1 train/test split files.
data_path = 'data/ml-100k/'
train_path = f'{data_path}u1.base'
test_path = f'{data_path}u1.test'

# Both split files are read as tab-separated text without a header row,
# so the column names are supplied explicitly.
ratings_train, ratings_test = (
    pd.read_csv(split_path, sep='\t', names=['userid', 'itemid', 'rating', 'timestamp'])
    for split_path in (train_path, test_path)
)

In [None]:
# Preview the first rows of the training ratings.
ratings_train.head()

In [None]:
# Summary statistics for the columns of the training ratings.
ratings_train.describe()

In [None]:
# Bar chart of how many training rows carry each rating value.
temp_df = ratings_train.groupby('rating').size()
plt.figure(figsize=(3, 3))
plt.bar(temp_df.index, temp_df.values, label="Ratings")
plt.gca().set(title='Ratings', xlabel='Ratings', ylabel='Count')
plt.show()

In [None]:
# load user and genre data
user = pd.read_csv(data_path+'u.user', delimiter = '|', names = ['id', 'age', 'gender', 'occupation', 'zip'])[['id', 'age', 'gender', 'occupation']]
genre = pd.read_csv(data_path+'u.genre', delimiter = '|', names = ['genre', 'id']).genre.to_list()

In [None]:
# Column dtypes and non-null counts of the user table.
user.info()

In [None]:
# Preview the first rows of the user table.
user.head()

In [None]:
# Display the list of genre names read from u.genre.
genre

In [None]:
# load the movie data
item_col_names = ['movie_id','movie_title','release_date','video_release_date','imdb_url'] + genre
item = pd.read_csv(data_path+'u.item', delimiter = '|', names = item_col_names, encoding = 'latin-1')
item.head()

In [None]:
# Column dtypes and non-null counts of the movie table.
item.info()

In [None]:
# Sum each genre column over all movies and order the totals from largest
# to smallest before plotting them as bars.
genere_counts = item[genre].sum().sort_values(ascending=False)

plt.figure(figsize=(10, 4))
sns.barplot(x=genere_counts.index, y=genere_counts.values)
plt.xticks(rotation=60)  # tilt the genre names so they do not overlap
plt.show()

## Data Transformation 

In [None]:
# Occupation names listed in u.occupation; they become the labels of the
# one-hot columns added to the user table.
occupation_col_names = pd.read_csv(data_path+'u.occupation', delimiter = '|', names = ['occupation'])['occupation'].to_list()

# One-hot encode the user's occupation column.
# The category order is pinned to occupation_col_names so that output
# column k is guaranteed to describe occupation_col_names[k]. Left to
# itself the encoder chooses its own category order, which matches the
# labels only if the file happens to be in that same order and every
# listed occupation occurs among the users.
user[occupation_col_names] = preprocessing.OneHotEncoder(
    categories = [occupation_col_names],
    sparse_output = False,
).fit_transform(user.occupation.to_numpy().reshape(-1,1))

# Label-encode the gender column into integer codes.
user['gender'] = preprocessing.LabelEncoder().fit_transform(user.gender)

# The raw occupation text is now redundant with the one-hot columns.
user = user.drop(['occupation'], axis = 1)

In [None]:
# Model inputs start out as the (userid, itemid) pairs; the target is the
# rating. y_train is kept as a one-column DataFrame, y_test as a Series.
x_train = ratings_train.loc[:, ['userid', 'itemid']]
y_train = ratings_train.loc[:, ['rating']]
x_test = ratings_test.loc[:, ['userid', 'itemid']]
y_test = ratings_test.loc[:, 'rating']

In [None]:
# Keep only the movie id and the genre columns of the movie table.
item = item[['movie_id', *genre]]

# Attach the user columns (matched on userid) and the genre columns
# (matched on itemid) to every row of the train and test inputs.
x_train, x_test = (
    frame.join(user.set_index('id'), on='userid').join(item.set_index('movie_id'), on='itemid')
    for frame in (x_train, x_test)
)

In [None]:
# Shapes of the training inputs and targets.
x_train.shape, y_train.shape

In [None]:
# Preview the first rows of the merged training inputs.
x_train.head()

In [None]:
# Preview the first rows of the training targets.
y_train.head()

### Similarity Matrix

In [None]:
#Utility Matrix
utility = ratings_train.pivot(index = 'itemid', columns = 'userid', values = 'rating')
utility = utility.fillna(0)
utility.head()

In [None]:
#use cosine distance between movie ratings
distance_matrix = squareform(pdist(utility, 'cosine'))
similarity_matrix = 1 - distance_matrix

plt.imshow(similarity_matrix, cmap = 'Blues')

## Recommender Models

In [None]:
def single_prediction(userid, itemid, similarity_matrix, utility):
    """Predict one rating as a similarity-weighted average of the user's ratings.

    Users and items are looked up by label in ``utility`` rather than by
    ``id - 1`` position. The utility matrix only contains ids that occur in
    the data it was built from, so any gap in the ids would shift every later
    row and silently pair a request with the wrong item (or index out of
    bounds).

    Args:
        userid: Label of the user's column in ``utility``.
        itemid: Label of the item's row in ``utility``.
        similarity_matrix: Square item-item similarity array whose rows and
            columns follow the row order of ``utility``.
        utility: DataFrame indexed by item id with one column per user id;
            a value of 0 means "not rated".

    Returns:
        The predicted rating. Falls back to the user's mean observed rating
        when the item is unknown or no rated item has any similarity to it,
        and to the mean of all observed ratings when the user is unknown.
        The result is NaN when the fallback has no ratings to average.
    """
    if userid not in utility.columns:
        # Cold-start user: nothing personal to go on, use the global mean.
        observed = utility.to_numpy()
        observed = observed[observed > 0]
        return observed.mean() if observed.size else np.nan

    user_rating = utility.loc[:, userid].to_numpy()
    rated = user_rating > 0
    user_mean = user_rating[rated].mean() if rated.any() else np.nan

    if itemid not in utility.index:
        # Cold-start item: it has no row in the similarity matrix.
        return user_mean

    item_row = utility.index.get_loc(itemid)
    item_similarity = np.asarray(similarity_matrix)[item_row]

    # Unrated items contribute 0 to the numerator, so both sums effectively
    # run over the items this user has rated.
    numerator = np.dot(user_rating, item_similarity)
    denominator = item_similarity[rated].sum()

    if numerator == 0 or denominator == 0:
        return user_mean

    return numerator / denominator

def full_prediction(test_set, pred_func, similarity_matrix, utility, **kwargs):
    """Apply ``pred_func`` to every row of ``test_set`` and collect the results.

    Args:
        test_set: Iterable of rows whose first element is the user id and
            whose second element is the item id.
        pred_func: Callable invoked with the keyword arguments ``userid``,
            ``itemid``, ``similarity_matrix`` and ``utility``, plus any
            extra ``kwargs``.
        similarity_matrix: Forwarded unchanged to ``pred_func``.
        utility: Forwarded unchanged to ``pred_func``.
        **kwargs: Additional keyword arguments forwarded to ``pred_func``.

    Returns:
        A list with one prediction per row, in the order of ``test_set``.
    """
    return [
        pred_func(
            userid=row[0],
            itemid=row[1],
            similarity_matrix=similarity_matrix,
            utility=utility,
            **kwargs,
        )
        for row in test_set
    ]

### Prediction

In [None]:
# Model 1 - Content Based
model1 = xgb.XGBRegressor(objective='reg:squarederror')
model1.fit(x_train, y_train)

pred1 = model1.predict(x_test)
rmse = np.sqrt(np.mean((pred1 - y_test.to_numpy())**2))
print(f'Content-based RMSE = {rmse}')

In [None]:
# Model 2 - Item Based Collaborative Filter
pred2 = full_prediction(ratings_test[['userid', 'itemid']].to_numpy(),
                      single_prediction,
                      similarity_matrix,
                      utility)
pred2 = np.array(pred2)

rmse = np.sqrt(np.mean((pred2 - y_test.to_numpy())**2))
print(f'Item-item collaborative filtering RMSE = {rmse}')


Build the weighted hybrid by linearly combining the predictions of the two models, which can perform better than either model on its own. As the plot under "Hybrid Model Weights" below shows, the RMSE is lowest when the weight given to the content-based model is around 0.83 on our dataset.

In [None]:
# Predict the rating of user2 on item 3
user_to_predict = 2
item_to_predict = 3

prediction = single_prediction (user_to_predict, item_to_predict,similarity_matrix,utility)
print(prediction)

### Hybrid Model Weights

In [None]:
# (weight, rmse) pairs for the curve plotted below.
chart_val = []

# Best blend seen so far. The RMSE starts at +inf so the first candidate
# always replaces it; a finite starting value such as 1 would leave the
# marker at a made-up point whenever every blend has a larger RMSE.
lowest_x = 0
lowest_y = np.inf

# Candidate weights for the content-based model (pred1); the collaborative
# filter (pred2) receives the complementary weight 1 - i.
w = np.linspace(0,1,21)

# Loop-invariant: convert the targets once instead of on every iteration.
y_true = y_test.to_numpy()

for i in w:
    pred4 = pred1*i + pred2*(1-i)
    rmse = np.sqrt(np.mean((pred4 - y_true)**2))
    chart_val.append([i, rmse])
    if rmse <= lowest_y:
        lowest_x = i
        lowest_y = rmse

# RMSE as a function of the weight, with the best blend marked.
chart_val_np = np.array(chart_val)
plt.plot(chart_val_np[:, 0], chart_val_np[:,1])
plt.plot(lowest_x, lowest_y, marker="o", markersize=5, markeredgecolor="red")
plt.xlabel('Weight of the content-based model')
plt.ylabel('RMSE')
plt.show()