In [2]:
import pandas as pd
import numpy as np

In [3]:
#read text file with beer data
beer_m = utility = pd.read_csv('beer_matrix.txt', sep='\t')
beer_m.head()

Unnamed: 0.1,Unnamed: 0,beer_name,abv,brewery,type
0,0,Darkness - High West Rye Whiskey Barrel Aged (...,12.0,Surly Brewing Company,Russian Imperial Stout
1,1,"Carnegie Porter 5,5%",5.5,Carlsberg Sverige AB,Baltic Porter
2,2,London Porter,6.5,Meantime Brewing Company Limited,English Porter
3,3,Port Barrel Aged Abduction,13.0,Pipeworks Brewing Company,American Imperial Stout
4,4,Echoes From The Well,8.5,LIC Beer Project,American Imperial IPA


In [4]:
#find the duplicate beers
beer_counts = {}
for beer in list(beer_m['beer_name']):
    if not beer in beer_counts:
        beer_counts[beer] = 1
    else:
        beer_counts[beer] += 1

duplicated = []
for name, count in beer_counts.items():    # for name, age in dictionary.iteritems():  (for Python 2.x)
    if count > 1 or count <1:
        duplicated.append(name)

In [5]:
#drop rows with duplicated beers from the beer matrix and reset the index
beer_m = beer_m[~beer_m['beer_name'].isin(duplicated)]
beer_m = beer_m.reset_index()

In [6]:
#get rid of ambiguous column
beer_m = beer_m.iloc[:, 1:]

In [7]:
#read text file with rating data
utility = pd.read_csv('um.txt', sep='\t')
#drop columns with duplicated beer names
utility = utility.drop(columns= duplicated).iloc[:,1:]

In [8]:
#make a copy of the utility dataframe

utility_copy = utility.copy()
utility_copy = utility_copy.iloc[:, 1:]
utility_copy['UID'] = list(range(1, len(utility)+1))
# utility_copy = utility_copy.drop(['user_name'], axis=1)

utility_copy_features = utility_copy.drop(columns=['UID', 'user_name'], axis=1)
utility_copy_features.index.name = 'BID'

In [9]:
#fill null values with zero to prepare for normalization
utility_copy_features = utility_copy_features.fillna(0)
#normalize all values
#convert to float64 datatypes
utility_copy_features = utility_copy_features.astype('float64')
um_vals1 = utility_copy_features.values
#get mean ratings for each row
ratings_mean1 = np.mean(um_vals1, axis=1)
#normalize by subtracting mean from each rating
um_norm1 = um_vals1 - ratings_mean1.reshape(-1, 1)
um_norm1

array([[ 4.17406583,  4.07406583,  4.07406583, ..., -0.00593417,
        -0.00593417, -0.00593417],
       [-0.00644   , -0.00644   , -0.00644   , ..., -0.00644   ,
        -0.00644   , -0.00644   ],
       [-0.00557215, -0.00557215, -0.00557215, ..., -0.00557215,
        -0.00557215, -0.00557215],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00542276, -0.00542276, -0.00542276, ..., -0.00542276,
        -0.00542276, -0.00542276],
       [-0.00566948, -0.00566948, -0.00566948, ...,  4.28433052,
         3.65433052,  4.15433052]])

In [10]:
#make new dataframe with normalized ratings corresponding to user ID
df = pd.DataFrame(um_norm1, columns=list(utility_copy_features.columns))
df['UID'] = list(range(1, len(df)+1))

In [11]:
#rotate dataframe so that every individual rating is its own row
long_matrix1 = pd.melt(df, 
        id_vars='UID', 
        value_vars=utility_copy_features,
        var_name='beer_name',
        value_name='rating')

long_matrix1 = long_matrix1.dropna()
long_matrix1.head()

Unnamed: 0,UID,beer_name,rating
0,1,97 Feet,4.174066
1,2,97 Feet,-0.00644
2,3,97 Feet,-0.005572
3,4,97 Feet,-0.004263
4,5,97 Feet,-0.00679


In [12]:
from surprise import Dataset, Reader
reader = Reader(rating_scale=(0, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(long_matrix1[['UID', 'beer_name', 'rating']], reader)

In [None]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=.25)

In [None]:
from surprise import SVD

algo = SVD()
algo.train(trainset)

uid = "user0"

pred = algo.predict(uid=uid, iid="", verbose=True)
print(pred)

In [None]:
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

In [None]:
from surprise import SVD, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import cross_validate
from tqdm import tqdm

benchmark = []
# Iterate over all algorithms
for algorithm in tqdm([SVD(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]):
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=4, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

In [None]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df2 = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df2['Iu'] = df2.uid.apply(get_Iu)
df2['Ui'] = df2.iid.apply(get_Ui)
df2['err'] = abs(df2.est - df2.rui)
best_predictions = df2.sort_values(by='err')[:10]
worst_predictions = df2.sort_values(by='err')[-10:]

In [None]:
df2.head()

In [None]:
def beer_recs(beer, num_recs=5):
    id = df[beer].idxmax()
    user_row = int(df.iloc[id][beer])
    rated, recommended = recommender(predictions_df, user_row, beer_matrix, long_matrix1, num_recs)
    return recommended

In [None]:
beer_recs('90 Minute IPA')

In [None]:
# Recommending top movies not yet rated by user
def recommender(predictions_df, UID, unique_list, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row = UID - 1 # UID starts at 1, not 0
    sorted_predictions = predictions_df.iloc[user_row].sort_values(ascending=False) 
    
    # Get the original user data and merge in the movie information 
    user_data = original_ratings_df[original_ratings_df.UID == (UID)]
    user_full = user_data.merge(unique_list, how = 'left', left_on = 'beer_name', right_on = 'beer_name').sort_values(['rating'], ascending=False)

#     print(sorted_predictions)
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = unique_list[~unique_list['beer_name'].isin(user_full['beer_name'])] \
                      .merge(pd.DataFrame(sorted_predictions).reset_index(), 
                             how = 'left', left_on = 'beer_name', right_on = 'BID') \
                      .rename(columns = {user_row: 'Predictions'}) \
                      .sort_values('Predictions', ascending = False) \
                      .iloc[:num_recommendations, :-1]
                    
#     print(recommendations)
    print ('User {0} has already rated {1} beers.'.format(UID, user_full.shape[0]))
    print ('Recommending highest {0} predicted ratings beers not already rated.'.format(num_recommendations))
    return user_full, recommendations

In [None]:
rated, recommended = recommender(predictions_df, 1, beer_matrix, long_matrix, 5)