# Hybrid Recommender Systems

In [1]:
import numpy as np
import pandas as pd 
import surprise as sp

In [2]:
# data parsing
parsed_data = pd.read_csv("wine-reviews/winemag-data-130k-v2.csv")
filtered_data = parsed_data[['country','province','region_1','variety','price','taster_name','points']]
cleaned_data = filtered_data.rename(columns={'region_1': 'region'}).dropna(subset=['country','province','region','variety','taster_name','points'])

# group all wines from a region that have the same variety, assign mean price
wines_all = cleaned_data.groupby(['country', 'province', 'region', 'variety']).agg({'price': 'mean'}).reset_index()
wines_all = wines_all.assign(id=pd.Series(range(1, wines_all.shape[0]+1), dtype=int, index=wines_all.index))
wines_all = wines_all[['id', 'country', 'province', 'region', 'variety', 'price']]

users_all = cleaned_data.groupby('taster_name').count().reset_index()[['taster_name']]
users_all = users_all.assign(id=pd.Series(range(1, users_all.shape[0]+1), dtype=int, index=users_all.index))

# link ratings to wines and users via id
wine_id_translator = {(row['country'], row['province'], row['region'], row['variety']): row['id'] for index, row in wines_all.iterrows()}
user_id_translator = {row['taster_name']: row['id'] for index, row in users_all.iterrows()}
def get_wine_id_series(data_frame):
    return pd.Series((wine_id_translator[(row['country'], row['province'], row['region'], row['variety'])] for _, row in data_frame.iterrows()), index=data_frame.index)
def get_user_id_series(data_frame):
    return pd.Series((user_id_translator[row['taster_name']] for _, row in data_frame.iterrows()), index=data_frame.index)

# aggregate average points of all ratings from a user for a wine
ratings_all = cleaned_data.assign(wine_id=get_wine_id_series, user_id=get_user_id_series)[['taster_name', 'user_id', 'wine_id', 'points']].groupby(['user_id', 'taster_name', 'wine_id']).mean().reset_index()

# only include wines that have 3 or more ratings
most_rated_wines = list(ratings_all.groupby(['wine_id']).count()[lambda x: x['points'] >= 3].reset_index()['wine_id'].values)

ratings = ratings_all.loc[ratings_all['wine_id'].isin(most_rated_wines)].astype({'wine_id': int, 'user_id': int}).reset_index(drop=True)
wines = wines_all.loc[wines_all['id'].isin(most_rated_wines)].astype({'id': int}).reset_index(drop=True)
users = users_all.loc[users_all['id'].isin(ratings['user_id'].values)].astype({'id': int}).reset_index(drop=True)

In [4]:
wines.head()

Unnamed: 0,id,country,province,region,variety,price
0,739,Canada,Ontario,Niagara Peninsula,Riesling,42.423077
1,741,Canada,Ontario,Niagara Peninsula,Vidal Blanc,62.615385
2,757,France,Alsace,Alsace,Gewürztraminer,34.206897
3,760,France,Alsace,Alsace,Pinot Blanc,17.622047
4,778,France,Alsace,Crémant d'Alsace,Sparkling Blend,24.886256


In [5]:
ratings.head()

Unnamed: 0,user_id,taster_name,wine_id,points
0,1,Alexander Peartree,5069,87.666667
1,1,Alexander Peartree,5737,89.0
2,1,Alexander Peartree,5738,86.75
3,1,Alexander Peartree,5741,86.25
4,1,Alexander Peartree,5743,88.0


In [6]:
users.head()

Unnamed: 0,taster_name,id
0,Alexander Peartree,1
1,Anna Lee C. Iijima,2
2,Anne Krebiehl MW,3
3,Carrie Dykes,4
4,Christina Pickard,5


## Collaborative Filtering

In [7]:
# Collaborative Filtering

def predict_cf(ratings, taster_name, wine_id):
    is_target = (ratings['taster_name'] == taster_name) & (ratings['wine_id'] == wine_id)
    target = ratings[is_target].iloc[0]
    
    train_set = sp.Dataset.load_from_df(
        ratings[~is_target][['user_id', 'wine_id', 'points']], 
        sp.Reader(rating_scale=(0, 100))
    ).build_full_trainset()

    algo = sp.KNNBasic(verbose=False)
    algo.fit(train_set)
    prediction = algo.predict(target['user_id'], target['wine_id'], verbose=False)
    return prediction.est, prediction.est - target['points'], target['points']

## Content-Based

In [8]:
# Content-Based

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

def predict_cn(ratings, wines, taster_name, wine_id):
    user_ratings = ratings[ratings['taster_name'] == taster_name].join(wines.set_index('id'), on='wine_id')
    is_target = (user_ratings['wine_id'] == wine_id)
    
    features = pd.get_dummies(user_ratings.drop(columns=['points']))
    train_features = features[~is_target]
    target_features = features[is_target]
    
    encoder = LabelEncoder()
    train_labels = encoder.fit_transform(user_ratings[~is_target]['points'])
    target_label = user_ratings[is_target]['points'].iloc[0]

    clf = KNeighborsClassifier(n_neighbors=1)
    clf.fit(train_features, train_labels)
    prediction = encoder.inverse_transform(clf.predict(target_features))[0]
    return prediction, prediction - target_label, target_label

## Testing the Recommenders

In [9]:
def test_classifier(taster_name, wine_id):
    pred_cf, error_cf, truth = predict_cf(ratings, taster_name, wine_id)
    pred_cn, error_cn, truth = predict_cn(ratings, wines, taster_name, wine_id)
    print("Results for {} on wine with id {}:".format(taster_name, wine_id))
    print("Collaborative Filtering: \t prediction: {:.5f} \t error: {:.5f}".format(pred_cf, error_cf))
    print("Content-Based: \t\t\t prediction: {:.5f} \t error: {:.5f}".format(pred_cn, error_cn))

In [10]:
test_classifier(taster_name='Anna Lee C. Iijima', wine_id=741)

Results for Anna Lee C. Iijima on wine with id 741:
Collaborative Filtering: 	 prediction: 89.65560 	 error: -0.01107
Content-Based: 			 prediction: 89.50000 	 error: -0.16667


In [11]:
test_classifier(taster_name='Virginie Boone', wine_id=4147)

Results for Virginie Boone on wine with id 4147:
Collaborative Filtering: 	 prediction: 87.93883 	 error: 2.83883
Content-Based: 			 prediction: 85.50000 	 error: 0.40000


In [12]:
def predict_weighted(ratings, wines, taster_name, wine_id):
    prediction_cf, _, truth = predict_cf(ratings, taster_name, wine_id)
    prediction_cn, _, truth = predict_cn(ratings, wines, taster_name, wine_id)
    
    # Weights can be chosen differently, depending on 
    # the (assumed) quality of the recommenders
    prediction = 0.5 * prediction_cf + 0.5 * prediction_cn
    error = prediction - truth
    return prediction, error, truth


pred_weighted, error_weighted, truth = predict_weighted(ratings, wines, taster_name='Anna Lee C. Iijima', wine_id=741)
print("Weighted Hybrid: \t prediction: {:.5f} \t error: {:.5f}".format(pred_weighted, error_weighted))

Weighted Hybrid: 	 prediction: 89.57780 	 error: -0.08887


In [13]:
ratings_matrix = ratings.pivot(index='wine_id', columns='user_id', values='points')
# NaN can be filld with the mean of either 
# the user's or the item's ratings.
ratings_matrix = ratings_matrix.fillna(ratings_matrix.mean())

from sklearn.decomposition import NMF
model = NMF(n_components=3, init='random', random_state=0)
W = model.fit_transform(ratings_matrix)
wines_plus = pd.concat([wines, pd.DataFrame(W)], axis=1)

pred_weighted, error_weighted, truth = predict_cn(ratings, wines_plus, taster_name='Anna Lee C. Iijima', wine_id=741)
print("Weighted Hybrid: \t prediction: {:.5f} \t error: {:.5f}".format(pred_weighted, error_weighted))

Weighted Hybrid: 	 prediction: 89.50000 	 error: -0.16667




In [14]:
def predict_switching(ratings, wines, taster_name, wine_id):
    # The selection of the recommender is done based on the 
    # number of ratings that have been recorded for the item.
    num_ratings = len(ratings[ratings['wine_id'] == wine_id])
    if num_ratings > 3:
        print('Using Collaborative Filtering recommender')
        return predict_cf(ratings, taster_name, wine_id)
    else:
        print('Using Content-Based recommender')
        return predict_cn(ratings, wines, taster_name, wine_id)


pred, error, truth = predict_switching(ratings, wines, taster_name='Anna Lee C. Iijima', wine_id=741)
print("Switching Hybrid: \t prediction: {:.5f} \t error: {:.5f}".format(pred, error))

pred, error, truth = predict_switching(ratings, wines, taster_name='Virginie Boone', wine_id=4147)
print("Switching Hybrid: \t prediction: {:.5f} \t error: {:.5f}".format(pred, error))

Using Collaborative Filtering recommender
Switching Hybrid: 	 prediction: 89.65560 	 error: -0.01107
Using Content-Based recommender
Switching Hybrid: 	 prediction: 85.50000 	 error: 0.40000
