# H&M - Implicit ALS model

[Implicit](https://github.com/benfred/implicit/) is a library for recommender models.

In this notebook we are going to use ALS (Alternating Least Squares).

# Please upvote if you find this useful!


# Note :
1) We will be using the latest version of the implicit library (released just a few days ago).

2) For cold-start / unseen customers we will use [Heng Zheng](https://www.kaggle.com/hengzheng)'s [time is our best friend v2](https://www.kaggle.com/hengzheng/time-is-our-best-friend-v2/).

In [None]:
# Installing latest implicit library for ALS

!pip install --upgrade implicit

In [None]:
# Importing required libraries 

import os
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import implicit
from implicit.evaluation import mean_average_precision_at_k



In [None]:
%%time

# Load the competition tables.
# `article_id` is read as a string in both tables that carry it, and
# `t_dat` is parsed into datetimes so it can be compared and subtracted later.

transactions = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'article_id': str},
    parse_dates=['t_dat'],
)
sample_submission = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
)
customers = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv',
)
articles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str},
)


In [None]:
articles

In [None]:
customers

In [None]:
transactions

In [None]:
# Trying with less data: keep only transactions dated after 2020-09-14.
# `.copy()` makes the result an independent frame, so the column assignments
# done on `transactions` further down do not act on a slice of the full table
# (which would raise pandas' SettingWithCopyWarning).

transactions = transactions[transactions['t_dat'] > '2020-09-14'].copy()
transactions.shape

In [None]:
# For validation : 3 weeks of training and 1 week for validation
# For submission : 4 weeks of training

transactions['t_dat'].max()

In [None]:
# Give every customer and article a positional integer id (its index in the
# list of unique ids), then rewrite the transaction columns with those integers.

all_customers = customers['customer_id'].unique().tolist()
all_articles = articles['article_id'].unique().tolist()

# position -> original id
customer_ids = dict(enumerate(all_customers))
article_ids = dict(enumerate(all_articles))

# original id -> position, applied to each transaction column
transactions['customer_id'] = transactions['customer_id'].map(
    {customer: position for position, customer in enumerate(all_customers)}
)
transactions['article_id'] = transactions['article_id'].map(
    {article: position for position, article in enumerate(all_articles)}
)

# The raw tables are not referenced again below; drop the names to free memory.
del customers, articles

In [None]:
# Build the sparse interaction matrix in COO format, shaped
# (number of customers) x (number of articles), with a 1.0 per transaction row.

row, col = transactions['customer_id'].values, transactions['article_id'].values
data = np.ones(len(row))
coo_train = coo_matrix(
    (data, (row, col)),
    shape=(len(all_customers), len(all_articles)),
)
coo_train

In [None]:
%%time

# Small ALS model (10 factors, 2 iterations) fitted on the COO matrix above.
model = implicit.als.AlternatingLeastSquares(
    factors=10,
    iterations=2,
    use_gpu=True,
    calculate_training_loss=True,
    random_state=7,
)
model.fit(coo_train)

In [None]:
def to_customer_article_coo(transactions):
    """Turn a transactions dataframe into a COO sparse customers x articles matrix.

    Rows are indexed by ``transactions['customer_id']`` and columns by
    ``transactions['article_id']``; both columns must already hold the
    integer positions used to index the matrix. Every transaction row
    contributes a value of 1.0 at its (customer, article) position.

    The shape is taken from the module-level ``all_customers`` and
    ``all_articles`` lists, so it is the same whatever subset of
    transactions is passed in.

    Returns:
        scipy.sparse.coo_matrix of shape
        ``(len(all_customers), len(all_articles))``.
    """
    row = transactions['customer_id'].values
    col = transactions['article_id'].values
    data = np.ones(transactions.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(all_customers), len(all_articles)))
    return coo


def split_data(transactions, validation_days=7):
    """Split a transactions dataframe chronologically into training and validation.

    The cut-off is the latest ``t_dat`` in ``transactions`` minus
    ``validation_days`` days. Rows dated strictly before the cut-off go to
    the training frame; rows dated on or after it go to the validation frame.

    Args:
        transactions: dataframe with a datetime ``t_dat`` column.
        validation_days: length of the validation window, in days.

    Returns:
        Tuple ``(df_train, df_val)``.
    """
    # The unit must be explicit: pd.Timedelta(7) would mean 7 nanoseconds,
    # which would leave only the very last timestamp in the validation set.
    validation_cut = transactions['t_dat'].max() - pd.Timedelta(days=validation_days)

    df_train = transactions[transactions['t_dat'] < validation_cut]
    df_val = transactions[transactions['t_dat'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(transactions, validation_days=7):
    """Build the sparse matrices needed for a train/validation run.

    The transactions are split with ``split_data`` and each part is turned
    into a (customers x articles) matrix with ``to_customer_article_coo``.

    Returns a dictionary with the following keys:
        coo_train: training data in COO sparse format
        csr_train: the same training data in CSR sparse format
        csr_val:   validation data in CSR sparse format
    """
    train_df, val_df = split_data(transactions, validation_days=validation_days)

    train_coo = to_customer_article_coo(train_df)
    val_csr = to_customer_article_coo(val_df).tocsr()

    matrices = {}
    matrices['coo_train'] = train_coo
    matrices['csr_train'] = train_coo.tocsr()
    matrices['csr_val'] = val_csr
    return matrices


def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """Fit an ALS model on the training matrix and score it with MAP@12.

    Args:
        matrices: dictionary with the keys 'coo_train', 'csr_train' and 'csr_val'.
        factors: embedding dimension of the ALS model.
        iterations: number of ALS iterations.
        regularization: regularization strength passed to the model.
        show_progress: forwarded to both the fit and the evaluation call.

    Returns:
        The MAP@12 value, which is also printed together with the parameters.

    The model is always created with ``use_gpu=True`` and ``random_state=7``.
    """
    coo_train, csr_train, csr_val = (
        matrices[key] for key in ('coo_train', 'csr_train', 'csr_val')
    )

    als_settings = {
        'factors': factors,
        'iterations': iterations,
        'regularization': regularization,
        'random_state': 7,
        'use_gpu': True,
    }
    model = implicit.als.AlternatingLeastSquares(**als_settings)
    model.fit(coo_train, show_progress=show_progress)

    # The MAPK by implicit doesn't allow to calculate allowing repeated articles, which is the case.
    # TODO: change MAP@12 to a library that allows repeated articles in prediction
    map12 = mean_average_precision_at_k(
        model,
        csr_train,
        csr_val,
        K=12,
        show_progress=show_progress,
        num_threads=4,
    )
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12

In [None]:
matrices = get_val_matrices(transactions)

In [None]:
%%time
# Grid search over ALS hyperparameters, keeping the combination with the
# highest validation MAP@12.
best_map12 = 0
# Initialise explicitly so `best_params` is always defined after this cell,
# and a value left over from an earlier run of the cell cannot be reused.
# If no combination scores above 0 it stays empty, and unpacking it with
# `**best_params` passes no overrides.
best_params = {}
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")


# The validation matrices are not used after the search; release them.
del matrices

In [None]:
best_params

In [None]:
# Training over the full dataset: rebuild the customers x articles matrix
# from all rows currently in `transactions` (no validation split here).

coo_train = to_customer_article_coo(transactions)
# CSR form of the same matrix.
csr_train = coo_train.tocsr()

In [None]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    """Fit and return an ALS model on ``coo_train``.

    Args:
        coo_train: sparse interaction matrix passed to ``model.fit``.
        factors: embedding dimension of the ALS model.
        iterations: number of ALS iterations.
        regularization: regularization strength passed to the model.
        show_progress: forwarded to ``model.fit``.

    Returns:
        The fitted ``AlternatingLeastSquares`` model.

    The model is always created with ``use_gpu=True`` and ``random_state=7``.
    """
    als_settings = {
        'factors': factors,
        'iterations': iterations,
        'regularization': regularization,
        'random_state': 7,
        'use_gpu': True,
    }
    fitted = implicit.als.AlternatingLeastSquares(**als_settings)
    fitted.fit(coo_train, show_progress=show_progress)
    return fitted

best_params

In [None]:
model = train(coo_train, **best_params)

In [None]:
# Submission

heng_df = pd.read_csv('../input/heng-zhengs-time-is-our-best-friend-v2-submission/not_so_fancy_but_fast_benchmark.csv')


def submit(model, csr_train, custs, heng_df, submission_name="submissions.csv"):
    """Write a submission file combining ALS recommendations with a fallback.

    For each customer index in ``custs`` the model's top 12 recommendations are
    requested in batches of 2000 (already-seen items are not filtered out).
    Indices are translated back to original ids through the module-level
    ``customer_ids`` and ``article_ids`` dictionaries.

    The ALS predictions are then left-joined onto ``heng_df``: a customer with
    an ALS prediction gets it, every other customer keeps the prediction from
    ``heng_df``.

    Args:
        model: fitted recommender exposing ``recommend``.
        csr_train: interaction matrix indexable by a list of customer indices.
        custs: list of customer indices to predict for.
        heng_df: fallback predictions; assumed to have 'customer_id' and
            'prediction' columns (the suffix logic below relies on that).
        submission_name: path of the CSV file to write.

    Returns:
        Dataframe with the columns 'customer_id' and 'prediction'.
    """
    batch_size = 2000
    preds = []

    for offset in range(0, len(custs), batch_size):
        chunk = custs[offset : offset + batch_size]
        ids, scores = model.recommend(chunk, csr_train[chunk], N=12, filter_already_liked_items=False)

        for position, customer_index in enumerate(chunk):
            original_customer = customer_ids[customer_index]
            recommended = [article_ids[article_index] for article_index in ids[position]]
            preds.append((original_customer, ' '.join(recommended)))

    als_df = pd.DataFrame(preds, columns=['customer_id', 'prediction'])

    # Fill with better base model than ALS cold-start rec
    merged = pd.merge(heng_df, als_df, how='left', on='customer_id', suffixes=('_fill', '_als'))
    has_als = merged['prediction_als'].notnull()
    merged.loc[has_als, 'prediction'] = merged['prediction_als']
    merged.loc[~has_als, 'prediction'] = merged['prediction_fill']

    df_preds = merged[['customer_id', 'prediction']]
    df_preds.to_csv(submission_name, index=False)

    # `display` is not defined in this file; presumably the notebook built-in.
    display(df_preds.head())
    print(df_preds.shape)

    return df_preds

In [None]:
transactions_customers = transactions['customer_id'].unique().tolist()

len(transactions_customers)

In [None]:
%%time

df_preds = submit(model, csr_train, transactions_customers, heng_df)