# DATA643: Recommender System
## Final Project
### <i>Harpreet Shoker, Rose Jones, Summer 2018 </i>

## Notebook3_Implicit Matrix Factorization_ALS

In this notebook, we perform matrix factorization using Alternating Least Squares on implicit feedback data.

#### python 3.4

---

## Get Data

In [1]:
import os
from pathlib import Path

# Location of the local `data` directory (under the current working directory)
# and of the Instacart archive expected inside it
datasets_path = os.path.join(os.getcwd(), "data")
dt_path = os.path.join(datasets_path, "instacart_2017_05_01.tar.gz")

In [2]:
from subprocess import check_output
# List the extracted dataset files with the standard library instead of shelling out to `ls`,
# so the cell also runs where no `ls` binary is available. Dot-files are skipped, as `ls` does.
dataset_dir = "./data/instacart_2017_05_01"
print("\n".join(sorted(name for name in os.listdir(dataset_dir) if not name.startswith("."))))

aisles.csv
departments.csv
order_products__prior.csv
order_products__train.csv
orders.csv
products.csv



In [3]:
import pandas as pd

# Product catalogue
products = pd.read_csv('./data/instacart_2017_05_01/products.csv')

# Order-level tables: order -> user, plus the products of the prior and of the train orders
orders = pd.read_csv('./data/instacart_2017_05_01/orders.csv')
order_products_prior = pd.read_csv('./data/instacart_2017_05_01/order_products__prior.csv')
order_products_train = pd.read_csv('./data/instacart_2017_05_01/order_products__train.csv')

# Attach the product columns to every prior order line (left join keeps all order lines)
merged_order_products_prior = order_products_prior.merge(products, how="left", on="product_id")

## Test data

In [4]:
def test_data(path, orders, order_products_train):
    """
    Make test data and save it in the given path as .csv
    """
    # read `orders` and filter eval_set == train
    orders_train = orders.loc[(orders.eval_set == "train")].reset_index()
    orders_userid = orders_train[["order_id", "user_id"]]
    
    # `orders_userid` and `order_products_train` lengths should match
    assert len(orders_userid["order_id"].unique()) == len(order_products_train["order_id"].unique())

    # Convert `order_products`_train as same format
    orders_productid = order_products_train[["order_id", "product_id"]]
    orders_productid = orders_productid.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})

    # `orders_products_train` and `orders_productid` size should match
    assert orders_productid.size == orders_userid.size

    # merge `orders_userid` and `orders_productid` on order_id
    user_products_test = pd.merge(orders_userid, orders_productid, on="order_id")
    user_products_test = user_products_test[["user_id", "products"]]

    # save as .csv
    user_products_test.to_csv(path, index_label=False)

In [5]:
import time

In [6]:
%%time
# Generate the test data when the cached CSV is missing.
# The flag forces regeneration when True (it was previously named `if_test_data_exists`,
# which suggested the opposite); leave it False to reuse an existing file.
force_rebuild_test_data = False
test_data_path = "./data/user_products__test.csv"

if force_rebuild_test_data or not Path(test_data_path).is_file():
    test_data(test_data_path, orders, order_products_train)

user_products_test_df = pd.read_csv(test_data_path)

CPU times: user 184 ms, sys: 35.3 ms, total: 219 ms
Wall time: 220 ms


In [7]:
# Size of the test set, followed by a preview of its first five rows
print(user_products_test_df.shape)
user_products_test_df.head(n=5)

(131209, 2)


Unnamed: 0,user_id,products
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


## Utility Matrix

In [8]:
def user_item_prior_df(path, orders, order_products_prior):
    """
    Count, for every (user, product) pair, how many prior order lines contain it
    and write the result (user_id, product_id, quantity) to `path` as .csv
    """
    # order_id -> user_id for the orders flagged as `prior`
    is_prior_order = orders.eval_set == "prior"
    prior_order_users = orders.loc[is_prior_order][["order_id", "user_id"]]

    # Attach the user to every prior order line
    order_lines = order_products_prior[["order_id", "product_id"]]
    user_products = prior_order_users.merge(order_lines, on="order_id")[["user_id", "product_id"]]

    # Number of order lines per (user, product) pair
    pair_counts = user_products.groupby(["user_id", "product_id"]).size()
    user_item_prior = pair_counts.reset_index(name="quantity")

    # Persist as .csv
    user_item_prior.to_csv(path, index_label=False)

In [9]:
%%time
# Generate the users' prior purchases data when the cached CSV is missing.
# The flag forces regeneration when True (it was previously named `if_user_prod_df_exists`,
# which suggested the opposite). It is True here, so the CSV is rebuilt on every run;
# set it to False to reuse an existing file.
force_rebuild_user_prod_df = True
matrix_df_path = "./data/user_products__prior.csv"

if force_rebuild_user_prod_df or not Path(matrix_df_path).is_file():
    user_item_prior_df(matrix_df_path, orders, order_products_prior)

user_item_prior = pd.read_csv(matrix_df_path)
# Categorical ids give every user / product a 0-based integer code (`.cat.codes`)
user_item_prior["user_id"] = user_item_prior["user_id"].astype("category")
user_item_prior["product_id"] = user_item_prior["product_id"].astype("category")

CPU times: user 1min 18s, sys: 10.1 s, total: 1min 28s
Wall time: 1min 21s


In [10]:
# Number of (user, product) pairs, followed by a preview of the first five rows
print(user_item_prior.shape)
user_item_prior.head(n=5)

(13307953, 3)


Unnamed: 0,user_id,product_id,quantity
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3


## User-Item Matrix

In [11]:
import scipy.sparse as sparse
import numpy as np

def build_user_item_matrix(path, user_item_prior):
    """
    Make the sparse user-item matrix that holds the order history of users and
    save it to `path` in scipy's .npz format.
    rows = products
    columns = users

    `user_item_prior` must have categorical `product_id` and `user_id` columns
    and a `quantity` column; the category codes are used as row / column indices.
    """
    product_col = user_item_prior["product_id"]
    user_col = user_item_prior["user_id"]

    # Size the matrix from the category lists rather than letting scipy infer it from
    # the largest code present: this keeps the matrix dimensions aligned with any
    # index <-> id mapping built from `.cat.categories`, even if a category is unused.
    shape = (len(product_col.cat.categories), len(user_col.cat.categories))

    user_item_matrix = sparse.coo_matrix(
        (user_item_prior["quantity"].values,
         (product_col.cat.codes.values, user_col.cat.codes.values)),
        shape=shape)
    sparse.save_npz(path, user_item_matrix)

In [12]:
# Build the sparse product x user matrix from the prior purchases when the cached .npz is missing.
# The flag forces a rebuild when True (it was previously named `if_user_item_matrix_exists`,
# which suggested the opposite); leave it False to reuse an existing file.
force_rebuild_user_item_matrix = False
matrix_path = "./data/user_item_matrix.npz"

if force_rebuild_user_item_matrix or not Path(matrix_path).is_file():
    build_user_item_matrix(matrix_path, user_item_prior)

# Load the matrix in CSR format with float32 values
user_item_matrix = sparse.load_npz(matrix_path).tocsr().astype(np.float32)

In [13]:
# A bare expression is only displayed when it is the last line of a cell,
# so the shape has to be printed explicitly to show up in the output
print(user_item_matrix.shape)
user_item_matrix

<49677x206209 sparse matrix of type '<class 'numpy.float32'>'
	with 13307953 stored elements in Compressed Sparse Row format>

In [14]:
# Sparsity = share of (product, user) cells that hold no stored value
n_cells = user_item_matrix.shape[0] * user_item_matrix.shape[1]
sparsity = 1 - (user_item_matrix.nnz / n_cells)
# Format the percentage directly: multiplying an already-rounded float by 100
# can reintroduce floating-point noise in the printed value
print("The sparsity of user_item_matrix is {:.4f}%".format(sparsity * 100))

The sparsity of user_item_matrix is 99.8701%


## Alternating Least Squares - Implicit Matrix Factorization

In [15]:
import pickle
import implicit
from implicit.als import AlternatingLeastSquares


def confidence_matrix(user_item_matrix, alpha):
    """
    Scales every entry of the given user-item matrix by `alpha` and returns the
    result cast to "float", to be used as the confidence matrix.
    """
    scaled = user_item_matrix * alpha
    return scaled.astype("float")


def build_implicit_matrix_factorization(user_item_matrix, **kwargs):
    """
    Given user-item-matrix and model parameters, fits an `AlternatingLeastSquares`
    model on the confidence matrix and pickles the fitted model to disk.

    Required keyword arguments:
        alpha -- scaling factor passed to `confidence_matrix`
        path  -- file path the fitted model is pickled to

    Raises KeyError when `alpha` or `path` is missing.
    """
    # Read both parameters up front so a missing key fails before the expensive fit
    alpha = kwargs["alpha"]
    model_path = kwargs["path"]

    # Make sure the target directory exists; otherwise the model would be fitted
    # and then lost to a FileNotFoundError when opening the output file
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Build model
    model = AlternatingLeastSquares()
    model.approximate_similar_items = False

    model.fit(confidence_matrix(user_item_matrix, alpha))

    # Save model (write-only binary mode is enough; nothing is read back here)
    with open(model_path, "wb") as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

In [16]:
# Model hyper-parameters; the file the model is saved to is named after `alpha`
model_params = {"alpha": 23}
model_params.update(
    path="./models/implicit_matrix_factorization/{}.imf".format(model_params["alpha"])
)

In [17]:
# Fit and save the model when no saved model is found at the configured path.
# The flag forces a refit when True (it was previously named `if_model_exists`,
# which suggested the opposite); leave it False to reuse a saved model.
force_rebuild_model = False
if force_rebuild_model or not Path(model_params["path"]).exists():
    build_implicit_matrix_factorization(user_item_matrix, **model_params)

In [18]:
# Load the pickled model back from disk
# (only unpickle files you created yourself: unpickling can execute arbitrary code)
with open(model_params["path"], "rb") as model_file:
    imf_model = pickle.load(model_file)

## Example recommendation

In [19]:
# The user-item matrix is indexed by 0-based category codes, so dicts are needed
# to convert between `ids` and `indices`,
# e.g. `product_id` 1 is represented by row `0` of the user-item matrix.

# Maps user_id: user index
user_categories = user_item_prior["user_id"].cat.categories
u_dict = dict(zip(user_categories, range(len(user_categories))))

# Maps product_index: product id
product_categories = user_item_prior["product_id"].cat.categories
p_dict = {index: product_id for index, product_id in enumerate(product_categories)}

In [20]:
# Ask the model for the top 10 recommendations of one example user (user 23).
# The matrix is transposed so that users are on the rows.
user_id = 23
recommendations = imf_model.recommend(
    u_dict[user_id],
    user_item_matrix.T.tocsr(),
    N=10,
)

#### Actual purchase list of User

In [21]:
# Actual purchase list: the held-out order of the chosen user in the test set
row = user_products_test_df.loc[user_products_test_df.user_id == user_id]

# `products` holds the list as text (e.g. "[196, 25133]"): drop the brackets, split on commas
actual = list(row["products"])[0][1:-1]
actual = list(np.array([token.strip() for token in actual.strip().split(",")]).astype(np.int64))

# Translate every product id into its product name, keeping the purchase order
actual_products = [
    name
    for product_id in actual
    for name in (products.loc[products.product_id == product_id].product_name).tolist()
]

print("\nUSER {} PURCHASE LIST: \n\n{}".format(user_id, actual_products))


USER 23 PURCHASE LIST: 

['3 Color Deli Coleslaw', 'Pineapple on the Bottom Greek Yogurt', 'Original French Vanilla Yogurt', 'Febreze Lavender Vanilla & Comfort Scent Sweeper Dry Pad Refills', 'Recipe Secrets Onion Soup & Dip Mix', 'Extra Noodle Soup Mix', 'Chicken Bouillon Cubes', 'Natural Goodness 33% Less Sodium Chicken Broth', 'Low Fat Key Lime Blended Greek Yogurt', 'Original Mountain Blueberry Low Fat Yogurt', '3 Gallon Food Scrap Bag', 'Spinach']


#### Recommended purchase list

In [22]:
# Recommended list: the first element of each recommendation is the product
# category code, which `p_dict` maps back to the product_id
r = [p_dict[recommendation[0]] for recommendation in recommendations]

# Translate every product id into its product name, keeping the ranking order
recommended_products = [
    name
    for product_id in r
    for name in (products.loc[products.product_id == product_id].product_name).tolist()
]
print("\nRecommendations for USER {}\n{}".format(user_id, recommended_products))


Recommendations for USER 23
['Peach on the Bottom Nonfat Greek Yogurt', 'Strawberry on the Bottom Nonfat Greek Yogurt', 'Blueberry on the Bottom Nonfat Greek Yogurt', 'Raspberry on the Bottom Nonfat Greek Yogurt', 'Non Fat Black Cherry on the Bottom Greek Yogurt', '100% Whole Wheat Bread', 'Low-Fat Strawberry Banana on the Bottom Greek Yogurt', 'Hass Avocado', 'Original No Pulp 100% Florida Orange Juice', 'Coconut Blended Greek Yogurt']


#### Conclusion

<h5> The purchase list of `User 23` and the recommended list show clear similarities. For example, the recommended nonfat yogurts are appropriate alternatives to the yogurts the user purchased. This recommender system is discovery based; thus, it recommends products that the user has never purchased. As a result, for evaluation, we remove previously purchased products from the actual purchase list of the user. </h5>

## Evaluation using `Recall`

In [23]:
def get_k_popular(k, merged_order_products_prior):
    """
    Returns the ids of the `k` products that occur most often in the
    `product_id` column of the given dataset, most frequent first
    """
    purchase_counts = merged_order_products_prior["product_id"].value_counts()
    top_k = purchase_counts.head(k)
    return list(top_k.index)

In [24]:
# Number of recommendations
number_of_recommendations = 10

# Transposed (user x product) view of the product_user utility matrix, in CSR format
user_product_matrix = user_item_matrix.T.tocsr()

# The `number_of_recommendations` most popular products, used as the baseline recommendation
popular_products = get_k_popular(
    k=number_of_recommendations,
    merged_order_products_prior=merged_order_products_prior,
)

In [25]:
def recall_score(actual, pred):
    """
    Returns the share of distinct `actual` values that also appear in `pred`
    (0 when `actual` is empty).
    """
    if not len(actual):
        return 0
    relevant = set(actual)
    hits = relevant & set(pred)
    return len(hits) / len(relevant)


def new_products(row):
    """
    Given a row in the test dataset, returns the set of products in that row
    which are absent from the user's row of `user_product_matrix`.
    """
    # `products` holds the list as text: drop the surrounding brackets, split on commas
    listed = row["products"][1:-1]
    purchased_now = {int(token.strip()) for token in listed.strip().split(",")}

    # Products stored in the user's matrix row, mapped from index back to product id
    user_index = u_dict[row["user_id"]]
    purchase_history = {p_dict[index] for index in user_product_matrix[user_index].indices}

    # Keep only the products not seen in the history
    return purchased_now - purchase_history


def popular_recommend(row):
    """
    Given a row in the test dataset, returns the recall obtained on the user's
    newly purchased products when `popular_products` is recommended (baseline).
    """
    return recall_score(new_products(row), popular_products)


def imf_recommend(row):
    """
    Given a row in the test dataset, returns the recall obtained on the user's
    newly purchased products when `imf_model` recommends
    `number_of_recommendations` products.
    """
    newly_purchased = new_products(row)
    user_index = u_dict[row["user_id"]]
    top_n = imf_model.recommend(user_index, user_product_matrix, N=number_of_recommendations)
    # The first element of each recommendation is the product index; map it to the product id
    recommended_ids = [p_dict[item[0]] for item in top_n]
    return recall_score(newly_purchased, recommended_ids)


def build_eval_df(user_products_test_df, filepath=None, subset=None):
    """
    Builds the recall values dataframe of the baseline and the implicit matrix
    factorization model for the users in the test dataset.

    user_products_test_df -- test dataset; it is copied, not modified
    filepath              -- where to save the result as .csv; nothing is written when None
    subset                -- optional fraction of the rows to evaluate; the rows are
                             sampled with a fixed random_state, so the sample is repeatable

    Returns the dataframe with the added `popular_score` and `imf_score` columns.
    """

    df_eval = user_products_test_df.copy()
    if subset:
        df_eval = df_eval.sample(n=int(len(df_eval) * subset), random_state=7)
    df_eval["popular_score"] = df_eval.apply(popular_recommend, axis=1)
    df_eval["imf_score"] = df_eval.apply(imf_recommend, axis=1)

    if filepath is not None:
        # Create the output directory first, so the scores are not lost
        # to a missing-directory error after the long evaluation
        out_dir = os.path.dirname(str(filepath))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df_eval.to_csv(filepath)

    # Returning the frame keeps the result available when no filepath is given
    return df_eval

In [26]:
# Recall value dataframe of the baseline and implicit matrix factorization model.
# The flag forces a re-evaluation when True (it was previously named `if_evaluation_df_exists`,
# which suggested the opposite). It is True here, so the evaluation is recomputed on every run;
# set it to False to reuse an existing results file.

force_rebuild_evaluation_df = True
subset = 0.2  # Evaluate on `subset x 100`% of the test dataset
eval_path = "./data/eval/eval_discovery_{}_{}.csv".format(subset if subset is not None else "full", number_of_recommendations)

if force_rebuild_evaluation_df or not Path(eval_path).exists():
    build_eval_df(user_products_test_df, filepath=eval_path, subset=subset)
df_eval = pd.read_csv(eval_path)

## Outcome

In [27]:
# Mean recall over all evaluated users, for the model and for the popularity baseline
model_mean_recall = np.mean(df_eval["imf_score"])
baseline_mean_recall = np.mean(df_eval["popular_score"])

print("Implicit Matrix Factorization Model: {:.2f}%".format(100 * model_mean_recall))
print("Baseline: {:.2f}%".format(100 * baseline_mean_recall))

Implicit Matrix Factorization Model: 3.83%
Baseline: 2.62%


##### The implicit matrix factorization model performs better than the baseline model for recommendations.

---