In [None]:
import pandas as pd
import numpy as np
import scipy.stats
# Visualization
import seaborn as sns
# Similarity
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df_ratings=pd.read_csv('/content/drive/Shareddrives/rs assignment1/interactions_train.csv')
# Take a look at the data

df_ratings.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [None]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698901 entries, 0 to 698900
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    698901 non-null  int64  
 1   recipe_id  698901 non-null  int64  
 2   date       698901 non-null  object 
 3   rating     698901 non-null  float64
 4   u          698901 non-null  int64  
 5   i          698901 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 32.0+ MB


In [None]:
print('The ratings dataset has', df_ratings['user_id'].nunique(), 'unique users')
# Number of recipes
print('The ratings dataset has', df_ratings['recipe_id'].nunique(), 'unique recipes')
# Number of ratings
print('The ratings dataset has', df_ratings['rating'].nunique(), 'unique ratings')
# List of unique ratings
print('The unique ratings are', sorted(df_ratings['rating'].unique()))

The ratings dataset has 25076 unique users
The ratings dataset has 160901 unique recipes
The ratings dataset has 6 unique ratings
The unique ratings are [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]


In [None]:
# Read in data
df_recipes = pd.read_csv('/content/drive/Shareddrives/rs assignment1/RAW_recipes.csv')
# Take a look at the data
df_recipes.rename(columns = {'id':'recipe_id'}, inplace = True)


In [None]:
new_df = df_ratings.merge(df_recipes,on = 'recipe_id')
small_df = new_df.sample(frac = 0.02)

In [None]:
ratings = small_df[['user_id','recipe_id','rating']]
recipes = small_df[['recipe_id','name']]

In [None]:
from sklearn.preprocessing import LabelEncoder
def ids_encoder(ratings):
    users = sorted(ratings['user_id'].unique())
    items = sorted(ratings['recipe_id'].unique())
    uencoder = LabelEncoder()
    iencoder = LabelEncoder()
    # fit users and items ids to the corresponding encoder
    uencoder.fit(users)
    iencoder.fit(items)
    # encode userids and itemids
    ratings.user_id = uencoder.transform(ratings.user_id.tolist())
    ratings.recipe_id = iencoder.transform(ratings.recipe_id.tolist())

    return ratings, uencoder, iencoder

In [None]:
ratings, uencoder, iencoder = ids_encoder(ratings)
ratings.head()
ratings.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


(13978, 3)

In [None]:
def normalize():
    # compute mean rating for each user
    mean = ratings.groupby(by='user_id', as_index=False)['rating'].mean()
    norm_ratings = pd.merge(ratings, mean, suffixes=('','_mean'), on='user_id')

    # normalize each rating by substracting the mean rating of the corresponding user
    norm_ratings['norm_rating'] = norm_ratings['rating'] - norm_ratings['rating_mean']
    return mean.to_numpy()[:, 1], norm_ratings

In [None]:
mean, norm_ratings = normalize()
np_ratings = norm_ratings.to_numpy()
norm_ratings.head()

Unnamed: 0,user_id,recipe_id,rating,rating_mean,norm_rating
0,1871,533,5.0,5.0,0.0
1,1871,7062,5.0,5.0,0.0
2,2045,3083,5.0,5.0,0.0
3,4095,6194,5.0,5.0,0.0
4,1365,5371,5.0,5.0,0.0


In [None]:
def item_representation(ratings):
    return csr_matrix(
        pd.crosstab(ratings.recipe_id, ratings.user_id, ratings.norm_rating, aggfunc=sum).fillna(0).values
    )

In [None]:
R = item_representation(norm_ratings)

In [None]:
def create_model(rating_matrix, k=20, metric="cosine"):
    """
    :param R : numpy array of item representations
    :param k : number of nearest neighbors to return
    :return model : our knn model
    """
    model = NearestNeighbors(metric=metric, n_neighbors=k+1, algorithm='brute')
    model.fit(rating_matrix)
    return model

In [None]:
def nearest_neighbors(rating_matrix, model):
    """
    compute the top n similar items for each item.
    :param rating_matrix : items representations
    :param model : nearest neighbors model
    :return similarities, neighbors
    """
    similarities, neighbors = model.kneighbors(rating_matrix)
    return similarities[:,1:], neighbors[:,1:]

In [None]:
import sys
def cosine(x, y):
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def adjusted_cosine(np_ratings, nb_items):
    similarities = np.zeros(shape=(nb_items, nb_items))
    similarities.fill(-1)
    def _progress(count):
        sys.stdout.write('\rComputing similarities. Progress status : %.1f%%' % (float(count / nb_items)*100.0))
        sys.stdout.flush()

    items = sorted(ratings.recipe_id.unique())
    for i in items[:-1]:
        for j in items[i+1:]:
            scores = np_ratings[(np_ratings[:, 1] == i) | (np_ratings[:, 1] == j), :]
            vals, count = np.unique(scores[:,0], return_counts = True)
            scores = scores[np.isin(scores[:,0], vals[count > 1]),:]

            if scores.shape[0] > 2:
                x = scores[scores[:, 1].astype('int') == i, 4]
                y = scores[scores[:, 1].astype('int') == j, 4]
                w = cosine(x, y)

                similarities[i, j] = w
                similarities[j, i] = w
        _progress(i)
    _progress(nb_items)
    # get neighbors by their neighbors in decreasing order of similarities
    neighbors = np.flip(np.argsort(similarities), axis=1)

    # sort similarities in decreasing order
    similarities = np.flip(np.sort(similarities), axis=1)

    # save similarities to disk
    #save_similarities(similarities, neighbors, dataset_name=dataset_name)

    return similarities, neighbors

In [None]:
metric = 'cosine'

if metric == 'adjusted_cosine':
    nb_items = ratings.recipe_id.nunique()
    similarities, neighbors = adjusted_cosine(np_ratings, nb_items=nb_items)
else:
    model = create_model(R, k=21, metric=metric)
    similarities, neighbors = nearest_neighbors(R, model)

In [None]:
print('neighbors shape : ', neighbors.shape)
print('similarities shape : ', similarities.shape)

neighbors shape :  (11392, 21)
similarities shape :  (11392, 21)


In [None]:
def candidate_items(userid):
    """
    :param userid : user id for which we wish to find candidate items
    :return : I_u, candidates
    """

    # 1. Finding the set I_u of items already rated by user userid
    I_u = np_ratings[np_ratings[:, 0] == userid]
    I_u = I_u[:, 1].astype('int')

    # 2. Taking the union of similar items for all items in I_u to form the set of candidate items
    c = set()

    for iid in I_u:
        # add the neighbors of item iid in the set of candidate items
        c.update(neighbors[iid])

    c = list(c)
    # 3. exclude from the set C all items in I_u.
    candidates = np.setdiff1d(c, I_u, assume_unique=True)

    return I_u, candidates

In [None]:
test_user = ratings['user_id'].iloc[0]
#test_user = uencoder.transform(ratings.user_id.tolist())[0]
print(test_user)
i_u, u_candidates = candidate_items(test_user)

1871


In [None]:
print('number of items purchased by test user : ',len(i_u))
print('number of candidate items for test user : ',len(u_candidates))

number of items purchased by test user :  2
number of candidate items for test user :  22


In [None]:
def similarity_with_Iu(c, I_u):
    """
    compute similarity between an item c and a set of items I_u. For each item i in I_u, get similarity between
    i and c, if c exists in the set of items similar to itemid.
    :param c : itemid of a candidate item
    :param I_u : set of items already purchased by a given user
    :return w : similarity between c and I_u
    """
    w = 0
    for iid in I_u :
        # get similarity between itemid and c, if c is one of the k nearest neighbors of itemid
        if c in neighbors[iid] :
            w = w + similarities[iid, neighbors[iid] == c][0]
    return w

In [None]:
def rank_candidates(candidates, I_u):
    """
    rank candidate items according to their similarities with i_u
    :param candidates : list of candidate items
    :param I_u : list of items purchased by the user
    :return ranked_candidates : dataframe of candidate items, ranked in descending order of similarities with I_u
    """

    # list of candidate items mapped to their corresponding similarities to I_u
    sims = [similarity_with_Iu(c, I_u) for c in candidates]
    candidates = iencoder.inverse_transform(candidates)
    mapping = list(zip(candidates, sims))

    ranked_candidates = sorted(mapping, key=lambda couple:couple[1], reverse=True)
    return ranked_candidates

In [None]:
def topn_recommendation(userid, N=30):
    """
    Produce top-N recommendation for a given user
    :param userid : user for which we produce top-N recommendation
    :param n : length of the top-N recommendation list
    :return topn
    """
    # find candidate items
    I_u, candidates = candidate_items(userid)

    # rank candidate items according to their similarities with I_u
    ranked_candidates = rank_candidates(candidates, I_u)

    # get the first N row of ranked_candidates to build the top N recommendation list
    topn = pd.DataFrame(ranked_candidates[:N], columns=['recipe_id','similarity_with_Iu'])
    topn = topn.merge(recipes, on = 'recipe_id')
    topn.sort_values(by=['similarity_with_Iu'], inplace=True, ascending=False)
    topn_10 = topn.iloc[:10]
    return topn_10

In [None]:
topn_recommendation(test_user)


Unnamed: 0,recipe_id,similarity_with_Iu,name
0,208758,2.0,paula deen s cornbread
11,209214,2.0,extreme low fat chocolate cake
20,209715,2.0,mexican cosmo
19,209715,2.0,mexican cosmo
18,209670,2.0,crispy salsa chicken
17,209626,2.0,salmon with green beans and honey mustard sauce
16,209492,2.0,honey lime oat muffins
15,209457,2.0,7 layer salad for one
14,209436,2.0,chicken with pineapple sauce ww 5 points
13,209427,2.0,herb cheese quick bread


In [None]:
def predict(userid, itemid):
    """
    Make rating prediction for user userid on item itemid
    :param userid : id of the active user
    :param itemid : id of the item for which we are making prediction
    :return r_hat : predicted rating
    """

    # Get items similar to item itemid with their corresponding similarities
    item_neighbors = neighbors[itemid]
    item_similarities = similarities[itemid]

    # get ratings of user with id userid
    uratings = np_ratings[np_ratings[:, 0].astype('int') == userid]

    # similar items rated by item the user of i
    siru = uratings[np.isin(uratings[:, 1], item_neighbors)]
    scores = siru[:, 2]
    indexes = [np.where(item_neighbors == iid)[0][0] for iid in siru[:,1].astype('int')]
    sims = item_similarities[indexes]

    dot = np.dot(scores, sims)
    som = np.sum(np.abs(sims))

    if dot == 0 or som == 0:
        return mean[userid]

    return dot / som

In [None]:
examples = ratings[['user_id', 'recipe_id']].values
labels = ratings['rating'].values

In [None]:
from sklearn.model_selection import train_test_split
train_examples, test_examples, train_labels, test_labels = train_test_split(
        examples,
        labels,
        test_size=0.2,
        random_state=42,
        shuffle=True)
# transform train and test examples to their corresponding one-hot representations
train_users = train_examples[:, 0]
test_users = test_examples[:, 0]

train_items = train_examples[:, 1]
test_items = test_examples[:, 1]

    # Final training and test set
x_train = np.array(list(zip(train_users, train_items)))
x_test = np.array(list(zip(test_users, test_items)))

y_train = train_labels
y_test = test_labels

In [None]:
def evaluate(x_test, y_test):
        print('Evaluate the model on {} test data ...'.format(x_test.shape[0]))
        preds = list(predict(u, i) for (u, i) in x_test)
        mae = np.sum(np.absolute(y_test - np.array(preds))) / x_test.shape[0]
        print()
        print('MAE :', mae)
        return mae

In [None]:
evaluate(x_test, y_test)

Evaluate the model on 2796 test data ...

MAE : 0.22374027835862562


0.22374027835862562