In [1]:
from tqdm import tqdm
from ast import literal_eval
from collections import defaultdict
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

import os
import heapq
import zipfile
import pandas as pd
import numpy as np
import scipy.sparse as sparse

In [2]:
# Some logistics helping functions
class colors:
    """Color used for printing."""
    HEADER = '\033[95m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

def print_success(message):
    """Specific print func for this notebook."""
    print(f"{colors.HEADER}[Recommender Message]{colors.ENDC} - {colors.OKGREEN}{message}{colors.ENDC}")

def print_failure(message):
    """Specific error func for this notebook."""
    print(f"{colors.HEADER}[Recommender Failure]{colors.ENDC} - {colors.FAIL}{message}{colors.ENDC}")

def print_warning(message):
    """Specific warning func for this notebook."""
    print(f"{colors.HEADER}[Recommender Warning]{colors.ENDC} - {colors.WARNING}{message}{colors.ENDC}")

# Preparation of training and testing data

In [3]:
def extract_csv_data(zip_path, data_path):
    """Extract and retrieve csv data"""
    # Get files in the zip_path
    zip_files = [os.path.join(zip_path, f) for f in os.listdir(zip_path)]
    print_success('Files in ' + zip_path + ':\n' + str(zip_files))
    
    # Pass if data_path already exists
    if os.path.exists(data_path):
        print_warning('Extracted data (%s) are already existed.' % data_path)
        return
    
    # Store the extracted csv files in data_path
    for zip_file in zip_files:
        if zipfile.is_zipfile(zip_file):
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(data_path)
    print_success('All zip files are extracted.')


def get_training_data(data_path, store_path):
    """Define the prior(eval_set="prior") orders as the training dataset."""
    prodsPerUser_filename = os.path.join(store_path, 'productsPerUser_train.csv')
    userProdFreq_filename = os.path.join(store_path, 'user_product_frequency_train.csv')
    prodFreq_filename = os.path.join(store_path, 'product_frequency_train.csv')
    
    # Pass if files are already existed
    if os.path.exists(prodsPerUser_filename) and os.path.exists(userProdFreq_filename) and os.path.exists(prodFreq_filename):
        print_warning('Training data are already existed.')
        return pd.read_csv(prodFreq_filename), \
            pd.read_csv(userProdFreq_filename), \
            pd.read_csv(prodsPerUser_filename)
    
    try:
        # Load the csv files as dataframes
        df_orders = pd.read_csv(os.path.join(data_path, 'orders.csv'))
        df_order_products_train = pd.read_csv(os.path.join(data_path, 'order_products__prior.csv'))
        
        # Trim the unnecessary columns
        df_order_products_train = df_order_products_train[["order_id", "product_id"]]
        
        # Get the frequency of occurrence for each product (ready for tf-idf)
        df_product_frequency = df_order_products_train['product_id'].value_counts()
        df_product_frequency = df_product_frequency.rename_axis('product_id').reset_index(name='frequency')
        print_success('Calculation of product frequency is completed.')
        
        # Get the direct relation between products and users
        df_usersAndProducts_train = pd.merge(df_orders, df_order_products_train, on='order_id', how='inner')
        df_usersAndProducts_train = df_usersAndProducts_train[['user_id', 'product_id']]
        df_productsPerUser_train = df_usersAndProducts_train.groupby('user_id').agg(set).reset_index()
        print_success('Calculation of productsPerUser is completed.')
        
        # Get the frequency of occurence for each user-product pair
        df_user_product_frequency = df_usersAndProducts_train.groupby(['user_id', 'product_id'])\
            .size().reset_index().rename(columns={0: 'frequency'})
        print_success('Calculation of user-product-pair frequency is completed.')
        
        # Store the precessed data to enhance efficiency
        if not os.path.exists(store_path):
            os.mkdir(store_path)
        df_productsPerUser_train.to_csv(prodsPerUser_filename, index_label=False)
        df_user_product_frequency.to_csv(userProdFreq_filename, index_label=False)
        df_product_frequency.to_csv(prodFreq_filename, index_label=False)
        
        print_success('Training data are retrieved and saved.')
        return df_product_frequency, df_user_product_frequency, df_productsPerUser_train
    except Exception as e: 
        print_failure(str(e))

        
def get_testing_data(data_path, store_path):
    """Define the current(eval_set="train") orders as the testing dataset."""
    test_filename = os.path.join(store_path, 'productsPerUser_test.csv')
    
    # Pass if file is already existed
    if os.path.exists(test_filename):
        print_warning('Testing data are already existed.')
        return pd.read_csv(test_filename)
    
    try:
        # Load the csv files as dataframes
        df_orders = pd.read_csv(os.path.join(data_path, 'orders.csv'))
        df_order_products_test = pd.read_csv(os.path.join(data_path, 'order_products__train.csv'))
        
        # Trim the unnecessary columns
        df_order_products_test = df_order_products_test[["order_id", "product_id"]]
        
        # Get the direct relation between products and users
        df_usersAndProducts_test = pd.merge(df_orders, df_order_products_test, on='order_id', how='inner')
        df_usersAndProducts_test = df_usersAndProducts_test[['user_id', 'product_id']]
        df_productsPerUser_test = df_usersAndProducts_test.groupby('user_id').agg(set).reset_index()
        
        # Store the precessed data to enhance efficiency
        if not os.path.exists(store_path):
            os.mkdir(store_path)
        df_productsPerUser_test.to_csv(test_filename, index_label=False)
        
        print_success('Testing data are retrieved and saved.')
        return df_productsPerUser_test
    except Exception as e: 
        print_failure(str(e))

        
def get_category_data(data_path):
    """Get the other category csv datasets."""
    try:
        df_aisles = pd.read_csv(os.path.join(data_path, 'aisles.csv'))
        df_departments = pd.read_csv(os.path.join(data_path, 'departments.csv'))
        df_products = pd.read_csv(os.path.join(data_path, 'products.csv'))
        print_success('Category data are retrieved.')
        return df_aisles, df_departments, df_products
    except Exception as e:
        print_failure(str(e))

In [4]:
# Constants
zip_path = './instacart-market-basket-analysis'
data_path = './extracted_dataset'
store_path = './train_test_data'

# Data preparation
extract_csv_data(zip_path, data_path)
df_product_frequency, df_user_product_frequency, df_productsPerUser_train = get_training_data(data_path, store_path)
df_productsPerUser_test = get_testing_data(data_path, store_path)
df_aisles, df_departments, df_products = get_category_data(data_path)

[95m[Recommender Message][0m - [92mFiles in ./instacart-market-basket-analysis:
['./instacart-market-basket-analysis/order_products__prior.csv.zip', './instacart-market-basket-analysis/order_products__train.csv.zip', './instacart-market-basket-analysis/aisles.csv.zip', './instacart-market-basket-analysis/orders.csv.zip', './instacart-market-basket-analysis/departments.csv.zip', './instacart-market-basket-analysis/products.csv.zip', './instacart-market-basket-analysis/sample_submission.csv.zip'][0m
[95m[Recommender Message][0m - [92mCategory data are retrieved.[0m


**Data overview**

In [5]:
df_product_frequency.head(2)

Unnamed: 0,product_id,frequency
0,24852,472565
1,13176,379450


In [6]:
df_user_product_frequency.head(2)

Unnamed: 0,user_id,product_id,frequency
0,1,196,10
1,1,10258,9


In [7]:
df_productsPerUser_train.head(2)

Unnamed: 0,user_id,product_id
0,1,"{17122, 196, 26405, 14084, 46149, 26088, 13032..."
1,2,"{45066, 2573, 18961, 1559, 32792, 23, 22559, 1..."


In [8]:
df_productsPerUser_test.head(2)

Unnamed: 0,user_id,product_id
0,1,"{196, 26405, 27845, 46149, 13032, 39657, 26088..."
1,2,"{24838, 11913, 45066, 31883, 48523, 38547, 248..."


# User-product frequency sparse matrix

In [9]:
def build_user_product_matrix(df_user_product_frequency, matrix_file_path, matrix_name):
    """Build and store coo/csr sparse matrix of user-product matrix."""
    matrix_path = os.path.join(matrix_file_path, matrix_name)
    if os.path.exists(matrix_path):
        print_warning('User-product matrix is already existed.')
        return sparse.load_npz(matrix_path).tocsr()
    
    df_user_product_frequency['user_id'] = df_user_product_frequency['user_id'].astype('category')
    df_user_product_frequency['product_id'] = df_user_product_frequency['product_id'].astype('category')
    
    # Define sparse user-product matrix in coo format
    data = df_user_product_frequency['frequency']
    row = df_user_product_frequency['user_id'].cat.codes.copy()
    col = df_user_product_frequency['product_id'].cat.codes.copy()
    user_product_matrix = sparse.coo_matrix((data, (row, col)))
    
    # Store and return the sparse matrix
    if not os.path.exists(matrix_file_path):
        os.mkdir(matrix_file_path) 
    sparse.save_npz(matrix_path, user_product_matrix)
    print_success('User-product matrix is stored at %s' % matrix_path)
    return user_product_matrix.tocsr()

In [10]:
# Constants
train_matrix_path = './matrixes'
train_matrix_name = 'user_product_train.npz'

# Generate sparse matrix for training
user_product_matrix_train = build_user_product_matrix(df_user_product_frequency, train_matrix_path, train_matrix_name)
user_product_matrix_train



<206209x49677 sparse matrix of type '<class 'numpy.int64'>'
	with 13307953 stored elements in Compressed Sparse Row format>

user number:  206209 \
product number:  49677

In [11]:
# Example of visiting element in sparse matrix
user_product_matrix_train[0, 195] # user_id = 1, product_id = 196

10

# User-product TF-IDF sparse matrix

In [12]:
def build_tfidf_matrix(tf):
    """Build tf-idf sparse matrix for product. 'tf' refers to term frequency."""
    tf_idf = coo_matrix(tf)
    
    # Get total number of documents (here is user number)
    N = tf.shape[0]
    
    # Calculate IDF (inverse document frequency)
    idf = np.log(N / (1 + np.bincount(tf_idf.col)))

    # TODO: you can try using the original defination of tf-idf
    # Get tf-idf product
    # tf_idf.data = tf_idf.data * idf[tf_idf.col]
    
    # Since terms don’t show up in many documents, we apply a square root penalty over tf to dampen it.
    tf_idf.data = np.sqrt(tf_idf.data) * idf[tf_idf.col] 
    
    return tf_idf

In [13]:
# Generate tf-idf matrix based on user-product-pair matrix
user_product_tfidf_matrix_train = build_tfidf_matrix(user_product_matrix_train).tocsr()
user_product_tfidf_matrix_train

<206209x49677 sparse matrix of type '<class 'numpy.float64'>'
	with 13307953 stored elements in Compressed Sparse Row format>

In [14]:
# Example of tf-idf value
user_product_tfidf_matrix_train[0, 195] # user_id = 1, product_id = 196

10.275263695642945

# User_based collaborative filtering

In [15]:
# User-based recommendation
def get_topK_similar_users(user_id, feature_matrix, k):
    """Find the most k similar users based on similarity."""
    # Get list of cosine similarities
    similarities = cosine_similarity(feature_matrix, feature_matrix[user_id - 1], False)
    
    # Select top K similar users
    top_K_similar_users = heapq.nlargest(k + 1, range(similarities.shape[0]), similarities.toarray().take)[1:]
    top_K_similar_users = [x + 1 for x in top_K_similar_users]
    
    # Return the list excluding the target user
    return top_K_similar_users

def generate_recommendation(user_id, feature_matrix, df_productsPerUser, df_product_frequency, k, n):
    """Find the most n recommended products based on the shopping history of the similar users."""
    # Get top k similar users
    topK_similar_users = get_topK_similar_users(user_id, feature_matrix, k)
    
    # Product popularity is defined as following 2 parts:
    # 1. the number of similar users who buy this product
    # 2. the buying frequency of this product in all users
    # TODO: the popularity definition can be adjusted, like 
    recommended_prods = defaultdict(int)
    user_prods = df_productsPerUser['product_id'][df_productsPerUser['user_id'] == user_id].values[0]
    if type(user_prods) == str:
        user_prods = literal_eval(user_prods)
    for user in topK_similar_users:
        prods = df_productsPerUser['product_id'][df_productsPerUser['user_id'] == user].values
        prods = set() if len(prods) == 0 else prods[0]
        if type(prods) == str:
            prods = literal_eval(prods)
        for prod in prods:
            recommended_prods[prod] += 1
    
    # Get popularity for each prod
    recommended_prods = [(p, (x, int(df_product_frequency[df_product_frequency['product_id'] == x].frequency))) \
                         for (p, x) in recommended_prods.items()]
    
    # Sort the products based on the popularity in the set of similar users
    recommended_prods = sorted(recommended_prods, key = lambda kv : (kv[1], kv[0]), reverse=True)
    return recommended_prods[:n]
    
def report_userBased(recommended_prods, df_products, df_departments, df_aisles):
    '''Prints out the details of the recommended products in a dataframe.'''
    data = {'product_id': [], 'popularity': []}
    for product in recommended_prods:
        data['product_id'].append(product[0])
        data['popularity'].append(product[1])
    df = pd.DataFrame(data, columns=list(data.keys()))
    df = pd.merge(df, df_products, on='product_id', how='inner') # add product details
    df = pd.merge(df, df_departments, on='department_id', how='inner') # add department details
    df = pd.merge(df, df_aisles, on='aisle_id', how='inner') # add aisle details
    return df.sort_values(by='popularity', ascending=False)

# Example of recommendation for user_id == 1

In [16]:
# Constants
similar_user_num = 20 # TODO: adjust the value of k
recommend_prod_num = 10 # TODO: adjust the value of n
user_id = 1

In [17]:
# Use tfidf matrix for similarity calculation
recommended_prods = generate_recommendation(user_id, \
                                            user_product_matrix_train, \
                                            df_productsPerUser_train, \
                                            df_product_frequency, \
                                            similar_user_num, \
                                            recommend_prod_num)
report_userBased(recommended_prods, df_products, df_departments, df_aisles)

Unnamed: 0,product_id,popularity,product_name,aisle_id,department_id,department,aisle
0,196,"(19, 4)",Soda,77,7,beverages,soft drinks
2,12427,"(17, 18)",Original Beef Jerky,23,19,snacks,popcorn jerky
8,13176,"(12, 246)",Bag of Organic Bananas,24,4,produce,fresh fruits
1,46149,"(10, 2572)",Zero Calorie Cola,77,7,beverages,soft drinks
9,6184,"(9, 156)",Clementines,32,4,produce,packaged produce
4,41400,"(7, 30)",Crunchy Oats 'n Honey Granola Bars,3,19,snacks,energy granola bars
5,37710,"(7, 30)",Trail Mix,125,19,snacks,trail mix snack mix
6,10258,"(7, 30)",Pistachios,117,19,snacks,nuts seeds dried fruit
7,31651,"(6, 8)",Extra Fancy Unsalted Mixed Nuts,117,19,snacks,nuts seeds dried fruit
3,46061,"(5, 15)",Popcorn,23,19,snacks,popcorn jerky


In [18]:
# Use tfidf matrix for similarity calculation
recommended_prods = generate_recommendation(user_id, \
                                            user_product_tfidf_matrix_train, \
                                            df_productsPerUser_train, \
                                            df_product_frequency, \
                                            similar_user_num, \
                                            recommend_prod_num)
report_userBased(recommended_prods, df_products, df_departments, df_aisles)

Unnamed: 0,product_id,popularity,product_name,aisle_id,department_id,department,aisle
0,12427,"(15, 4)",Original Beef Jerky,23,19,snacks,popcorn jerky
4,196,"(14, 17)",Soda,77,7,beverages,soft drinks
1,10258,"(12, 246)",Pistachios,117,19,snacks,nuts seeds dried fruit
6,13176,"(10, 2572)",Bag of Organic Bananas,24,4,produce,fresh fruits
9,6184,"(7, 30)",Clementines,32,4,produce,packaged produce
2,41400,"(6, 8)",Crunchy Oats 'n Honey Granola Bars,3,19,snacks,energy granola bars
5,46149,"(6, 8)",Zero Calorie Cola,77,7,beverages,soft drinks
3,37710,"(5, 15)",Trail Mix,125,19,snacks,trail mix snack mix
7,16797,"(5, 15)",Strawberries,24,4,produce,fresh fruits
8,47402,"(4, 329)",Fuji Apples,24,4,produce,fresh fruits


# Testing

In [19]:
def get_recall(rec, tru):
    """Recommendation recall: |{R & P}|/|P| (R - recommended products, P - relevant products)"""
    return len(rec & tru)/len(tru) if len(tru) != 0 else 0
    
def get_precision(rec, tru):
    """Recommendation precision: |{R & P}|/|R| (R - recommended products, P - relevant products)"""
    return len(rec & tru)/len(rec) if len(rec) != 0 else 0

def test_recommender(feature_matrix, df_productsPerUser_test, df_product_frequency, k, n):
    """Test recommender function. (recall and precision)"""
    right_cases, total_cases = 0, 0
    users = df_productsPerUser_test['user_id'].to_list()[:100] # TODO: please remove [:100] for final test
    
    # Variables used for recording
    right_cases, total_cases = 0, 0
    recall_sum, precision_sum = 0, 0
    
    for user in tqdm(users):
        # the user-based recommendation list
        recommended_prods = generate_recommendation(user, feature_matrix, df_productsPerUser_test, df_product_frequency, k, n)
        recommended_prods = set([x for (x, _) in recommended_prods])
        # actual product list
        actual_prods = df_productsPerUser_test[df_productsPerUser_test['user_id'] == user].product_id.tolist()[0]
        if type(actual_prods) == str:
            actual_prods = literal_eval(actual_prods)
        # Check how many right products we recommend
        recall_sum += get_recall(recommended_prods, actual_prods)
        precision_sum += get_precision(recommended_prods, actual_prods)
        right_cases += len(recommended_prods & actual_prods)
        total_cases += len(actual_prods)
    
    # Get average and total value
    print_success('average: (recall, precision) = (%f, %f)' % (recall_sum/len(users), precision_sum/len(users)))
    print_success('total: (recall, precision) = (%f, %f)' % (right_cases/total_cases, right_cases/(n*len(users))))

In [20]:
# Testing of the users with id from 1 to 100 (directly using frequeny matrix as feature matrix)
test_recommender(user_product_matrix_train, 
                 df_productsPerUser_test, 
                 df_product_frequency, 
                 similar_user_num, 
                 recommend_prod_num)

100%|██████████| 100/100 [00:48<00:00,  2.06it/s]

[95m[Recommender Message][0m - [92maverage: (recall, precision) = (0.184941, 0.138000)[0m
[95m[Recommender Message][0m - [92mtotal: (recall, precision) = (0.131429, 0.138000)[0m





In [None]:
# Testing of the users with id from 1 to 100 (using tf-idf matrix as feature matrix)
test_recommender(user_product_tfidf_matrix_train, 
                 df_productsPerUser_test, 
                 df_product_frequency, 
                 similar_user_num, 
                 recommend_prod_num)

 29%|██▉       | 29/100 [00:12<00:32,  2.16it/s]