# Factorization Machine for H&M RecSys Challenge
### by Arseniy, Vladislav and Noah

## Preparation
- load packages and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the data
# Three CSV tables are read from the Kaggle input directory: the purchase
# transactions, the customer metadata and the article metadata.
transactions_df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
customers_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
articles_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')

## Filter data

In [None]:
def get_most_bought_articles(data, num_articles=5):
    """Return the ids of articles purchased at least ``num_articles`` times.

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions with at least the columns ``article_id`` and ``t_dat``.
        Rows whose ``t_dat`` is missing are not counted.
    num_articles : int
        Minimum number of purchases an article needs in order to be kept.

    Returns
    -------
    numpy.ndarray
        Article ids ordered from most to least purchased.
    """
    # One entry per article: how many transactions mention it
    purchase_counts = data.groupby('article_id')['t_dat'].count()
    purchase_counts = purchase_counts.sort_values(ascending=False)

    frequent = purchase_counts[purchase_counts >= num_articles]
    return frequent.index.values

In [None]:
# Build the positive half of the training set.
# Transactions are restricted to the window [2020-07-01, 2020-09-22) and to
# articles that were bought at least 10 times inside that window. The index
# dictionaries, however, cover every article and customer of the full tables.
start_date = pd.to_datetime('2020-07-01')
end_date = pd.to_datetime('2020-09-22')

# Parse the purchase dates and keep only the rows inside the window
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])
transactions_df = transactions_df.loc[
    (transactions_df['t_dat'] >= start_date) & (transactions_df['t_dat'] < end_date)
]

# Drop every transaction of an article with fewer than 10 purchases
most_bought_articles = get_most_bought_articles(transactions_df, num_articles=10)
transactions_df = transactions_df.loc[transactions_df['article_id'].isin(most_bought_articles)]

article_ids = articles_df['article_id'].values
customer_ids = customers_df['customer_id'].values

num_articles = len(article_ids)
num_customers = len(customer_ids)

# Map each raw id to its column position in the one-hot encodings
articles_id_to_idx = {article_id: position for position, article_id in enumerate(article_ids)}
customers_id_to_idx = {customer_id: position for position, customer_id in enumerate(customer_ids)}

# Every remaining (customer, article) purchase is a positive example
train_df = transactions_df[['customer_id', 'article_id']].copy()

num_transactions = len(train_df)

train_df['bought'] = np.ones(num_transactions)

train_df

In [None]:
# Generate negative examples.
# Each positive pair keeps its customer but is given an article drawn by
# shuffling the article column, and is labelled bought = 0. The shuffled pairs
# are not checked against the real purchases.
np.random.seed(47)

negative_data = pd.DataFrame({
    'article_id': np.random.permutation(train_df['article_id'].values),
    'customer_id': train_df['customer_id'].values,
    'bought': np.zeros(num_transactions),
})

# Stack positives on top of negatives, then shuffle the rows and renumber them
train_df = (
    pd.concat([train_df, negative_data])
    .sample(frac=1)
    .reset_index(drop=True)
)

train_df

## Pre-processing and encoding features in sparse matrix for training

In [None]:
from scipy.sparse import csr_matrix, hstack
import numpy as np
import pandas as pd

def preprocess_customers_df(customers_df, transactions_df):
    """Derive numeric customer features and attach them to each transaction.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Customer table with the columns ``customer_id``, ``FN``, ``Active``,
        ``club_member_status``, ``fashion_news_frequency``, ``age`` and
        ``postal_code``. It is not modified.
    transactions_df : pandas.DataFrame
        Table with at least the columns ``customer_id`` and ``article_id``.

    Returns
    -------
    pandas.DataFrame
        Inner join of the processed customers with the transactions: one row
        per (customer, transaction) pair, in customer-table order. Customers
        without any transaction do not appear. The columns are
        ``customer_id``, ``Active``, ``club_member_status`` (0/1), ``age``,
        ``common_postal_code`` (0/1) and ``article_id``.
    """
    # Work on a copy so the caller's frame stays untouched and repeated calls
    # give the same result. Editing in place would leave club_member_status
    # already converted, and comparing it with 'ACTIVE' again would set it to
    # 0 for every customer.
    customers_df = customers_df.copy()

    # A missing 'Active' flag is treated as "not active"
    customers_df['Active'] = customers_df['Active'].fillna(0)

    # 1 if the club membership is ACTIVE, 0 otherwise
    customers_df['club_member_status'] = (
        customers_df['club_member_status'] == 'ACTIVE'
    ).astype(int)

    # 9% of all values have one specific postal code, we will encode this as a feature
    customers_df['common_postal_code'] = (
        customers_df['postal_code'] == '2c29ae653a9282cce4151bd87643c907644e09541abc28ae87dea0d1f6603b1c'
    ).astype(int)

    # Replace missing ages with the mean age
    customers_df['age'] = customers_df['age'].fillna(customers_df['age'].mean())

    # Drop the columns that are not used as features
    customers_df = customers_df.drop(['FN', 'fashion_news_frequency', 'postal_code'], axis=1)

    # NOTE: an "average price per item bought by the customer" feature was
    # prototyped here but is currently disabled.
    return customers_df.merge(transactions_df[['customer_id', 'article_id']], on='customer_id')

def create_sparse_matrix(transactions_df, articles_df, customers_df, articles_to_idx, customers_to_idx):
    """Encode every row of ``transactions_df`` as one sparse feature row.

    Row ``i`` of the result describes row ``i`` of ``transactions_df`` (in
    positional order), so a label vector taken from the same frame lines up
    with the matrix.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Samples to encode; needs the columns ``customer_id`` and ``article_id``.
    articles_df : pandas.DataFrame
        Not used; kept so that existing calls keep working.
    customers_df : pandas.DataFrame
        Raw customer table, passed on to ``preprocess_customers_df``.
    articles_to_idx, customers_to_idx : dict
        Map an article / customer id to its one-hot column position.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(len(transactions_df), len(customers_to_idx) +
        len(articles_to_idx) + 4)``. Column layout: customer one-hot, article
        one-hot, age, Active, club_member_status, common_postal_code.

    Raises
    ------
    ValueError
        If a transaction refers to a customer that has no customer features.
    KeyError
        If a customer or article id is missing from the index dictionaries.
    """
    feature_columns = ['age', 'Active', 'club_member_status', 'common_postal_code']

    # preprocess_customers_df returns one row per (customer, transaction) in
    # customer-table order; reduce it to one feature row per customer.
    merged = preprocess_customers_df(customers_df, transactions_df)
    customer_features = merged.drop(columns='article_id').drop_duplicates(subset='customer_id')

    samples = transactions_df[['customer_id', 'article_id']].reset_index(drop=True)
    unknown = ~samples['customer_id'].isin(customer_features['customer_id'])
    if unknown.any():
        raise ValueError(
            '{} transaction(s) refer to customers without customer features'.format(int(unknown.sum()))
        )

    # A left merge keeps the sample order, which is what keeps features and
    # labels aligned.
    samples = samples.merge(customer_features, on='customer_id', how='left', validate='many_to_one')

    n_samples = len(samples)
    row_idx = np.arange(n_samples)
    ones = np.ones(n_samples)

    # Create one hot encoded customers matrix
    customer_cols = np.array([customers_to_idx[customer] for customer in samples['customer_id']], dtype=np.int64)
    csr_customers = csr_matrix((ones, (row_idx, customer_cols)), shape=(n_samples, len(customers_to_idx)))

    # Create one hot encoded bought articles matrix
    article_cols = np.array([articles_to_idx[article] for article in samples['article_id']], dtype=np.int64)
    csr_articles = csr_matrix((ones, (row_idx, article_cols)), shape=(n_samples, len(articles_to_idx)))

    # Dense customer features: age, Active, club member status, common postal code
    dense_features = csr_matrix(samples[feature_columns].to_numpy(dtype=float))

    # Concatenate
    return hstack((csr_customers, csr_articles, dense_features), format='csr')


In [None]:
from scipy import sparse

# Encode the training pairs of train_df, together with the customer features,
# as one sparse feature matrix
sparse_matrix = create_sparse_matrix(train_df, articles_df, customers_df, articles_id_to_idx, customers_id_to_idx)

## Implement the factorization machine

In [None]:
from tqdm import tqdm
from numba import njit
import numba as nb


@njit
def log_loss(bought, pred):
    """Logistic loss ``log(1 + exp(-pred * bought))`` of one prediction.

    The formula treats ``bought`` as a signed label: for ``bought = 0`` the
    value is ``log(2)`` whatever ``pred`` is.

    ``np.logaddexp(0, x)`` computes ``log(exp(0) + exp(x))``, which is the same
    quantity, without overflowing to ``inf`` when ``x`` is large.
    """
    return np.logaddexp(0.0, -pred * bought)


@njit
def predict_single(data, indices, indptr, i, latent_factors, w0, w, V):
    """Raw factorization-machine score of row ``i`` of a CSR matrix.

    ``data``, ``indices`` and ``indptr`` are the three CSR arrays; ``w0`` is
    the bias, ``w`` the per-feature weights and ``V`` the latent factors,
    indexed as ``V[factor, feature]``.

    Returns the score and, per latent factor, the sum of ``V * x`` over the
    row's stored entries (reused by the caller for the gradient).
    """
    row_start = indptr[i]
    row_end = indptr[i + 1]

    sum_factors = np.zeros(latent_factors)

    # Bias plus linear term w * x over the stored entries of the row
    prediction = w0
    for pos in range(row_start, row_end):
        prediction += w[indices[pos]] * data[pos]

    # Pairwise interactions: 0.5 * ((sum v*x)^2 - sum (v*x)^2) per factor
    for factor in range(latent_factors):
        total = 0.0
        total_of_squares = 0.0
        for pos in range(row_start, row_end):
            contribution = V[factor, indices[pos]] * data[pos]
            total += contribution
            total_of_squares += contribution * contribution

        sum_factors[factor] = total
        prediction += 0.5 * (total * total - total_of_squares)

    return prediction, sum_factors


@njit
def SGD(data, indices, indptr, latent_factors, w0, w, V, learning_rate, bought, reg_w, reg_v):
    """Run one epoch of stochastic gradient descent over all samples.

    Parameters
    ----------
    data, indices, indptr : arrays
        The three arrays of the CSR feature matrix.
    latent_factors : int
        Number of rows of ``V``.
    w0 : float
        Bias term.
    w : 1d array
        Per-feature weights, updated in place.
    V : 2d array
        Latent factors indexed as ``V[factor, feature]``, updated in place.
    learning_rate : float
        Step size.
    bought : 1d array
        One label per sample. The loss and its gradient treat it as a signed
        label; a label of 0 produces a zero gradient.
    reg_w, reg_v : float
        L2 penalty strength for ``w`` and ``V``.

    Returns
    -------
    (loss, w0, w, V)
        ``loss`` is the mean logistic loss over the samples, computed with the
        parameters as they were when each sample was visited.
    """
    n_samples = len(bought)
    loss = 0.0

    for i in range(n_samples):
        prediction, sum_factors = predict_single(data, indices, indptr, i, latent_factors, w0, w, V)
        loss += log_loss(bought[i], prediction)
        # Derivative of log(1 + exp(-y * p)) with respect to p
        loss_gradient = -bought[i] / (np.exp(bought[i] * prediction) + 1.0)

        row_start = indptr[i]
        row_end = indptr[i + 1]

        # Update bias term
        w0 -= learning_rate * loss_gradient

        # Update features bias
        for index in range(row_start, row_end):
            feature = indices[index]
            w[feature] -= learning_rate * (loss_gradient * data[index] + 2 * reg_w * w[feature])

        # Update latent factors
        for factor in range(latent_factors):
            for index in range(row_start, row_end):
                feature = indices[index]
                # sum_factors still holds the sums from before this sample's updates
                term = sum_factors[factor] - V[factor, feature] * data[index]
                v_gradient = loss_gradient * data[index] * term
                V[factor, feature] -= learning_rate * (v_gradient + 2 * reg_v * V[factor, feature])

    # Average over the samples. data.shape[0] would be the number of stored
    # non-zero entries, not the number of samples.
    if n_samples > 0:
        loss /= n_samples

    return loss, w0, w, V


class FactorizationMachine:
    """Second-order factorization machine for binary targets, trained with SGD.

    Parameters
    ----------
    transactions_matrix : scipy sparse csr_matrix, shape [n_samples, n_features]
        Training features.
    bought : 1d array, shape [n_samples]
        Training labels, either 0/1 or -1/+1.
    latent_factors : int
        Number of latent factors per feature.
    """

    def __init__(self, transactions_matrix, bought, latent_factors):
        self.transactions_matrix = transactions_matrix
        self.bought = bought
        self.latent_factors = latent_factors

        self.n_samples, self.n_variables = self.transactions_matrix.shape

    @staticmethod
    def _signed_labels(bought):
        """Map labels to -1/+1: everything greater than 0 becomes +1, the rest -1."""
        labels = np.asarray(bought, dtype=np.float64)
        return np.where(labels > 0, 1.0, -1.0)

    def fit(self, n_epochs=10, learning_rate=0.001, lmbda=0.1, reg_w=0.01, reg_latent=0.01, verbose=True):
        """
        Learn the bias ``w0``, the feature weights ``w`` and the latent
        factors ``V`` from the training data given to the constructor.

        Parameters
        ----------
        n_epochs : int
            Number of passes over the training data.
        learning_rate : float
            SGD step size.
        lmbda : float
            Stored on the instance but not used by the training loop.
        reg_w, reg_latent : float
            L2 penalty strength for ``w`` and ``V``.
        verbose : bool
            If True, print the epoch number and the mean loss of every epoch.

        The mean loss of every epoch is appended to ``self.history_``.
        """
        self.verbose = verbose
        self.learning_rate = learning_rate
        self.lmbda = lmbda

        np.random.seed(47)
        # Initialize training variables. The bias starts as a float so the
        # compiled SGD routine sees the same argument types in every epoch.
        self.w0 = 0.0

        self.reg_w = reg_w
        self.reg_latent = reg_latent

        self.w = np.random.normal(0, 1, self.n_variables)
        self.V = np.random.normal(0, scale=1 / np.sqrt(self.latent_factors),
                                  size=(self.latent_factors, self.n_variables))

        data = self.transactions_matrix.data
        indices = self.transactions_matrix.indices
        indptr = self.transactions_matrix.indptr
        # The logistic loss log(1 + exp(-y * p)) needs y in {-1, +1}. With 0/1
        # labels every negative example would have a zero gradient.
        bought = self._signed_labels(self.bought)
        self.history_ = []
        for epoch in range(n_epochs):
            if self.verbose:
                print('Epoch: {}'.format(epoch))

            loss, self.w0, self.w, self.V = SGD(
                data=data, indices=indices, indptr=indptr, latent_factors=self.latent_factors,
                w0=self.w0, w=self.w, V=self.V, learning_rate=self.learning_rate,
                bought=bought, reg_w=self.reg_w, reg_v=self.reg_latent
            )

            if self.verbose:
                print(f'Loss: {loss}')
            self.history_.append(loss)

    def predict_proba(self, data):
        """
        Probability estimates for both classes.

        Parameters
        ----------
        data : scipy sparse csr_matrix, shape [n_samples, n_features]
            Data in sparse matrix format.

        Returns
        -------
        proba : 2d ndarray, shape [n_samples, 2]
            Column 0 is the probability of "not bought", column 1 the
            probability of "bought" (the sigmoid of the raw score).
        """
        pred = self.predict_all_sample(data)
        pred_proba = 1.0 / (1.0 + np.exp(-pred))
        proba = np.vstack((1 - pred_proba, pred_proba)).T
        return proba

    def predict_all_sample(self, data):
        """Return the raw (pre-sigmoid) score of every row of ``data``."""
        linear_output = data.dot(self.w)
        v = self.V.T
        # Per factor: (sum v*x)^2 - sum (v*x)^2, computed for all rows at once
        term = data.dot(v) ** 2 - data.power(2).dot(v ** 2)
        factor_output = 0.5 * np.sum(term, axis=1)
        return self.w0 + linear_output + factor_output

    def predict(self, data):
        """
        Predict class labels for the samples in ``data``.

        Parameters
        ----------
        data : scipy sparse csr_matrix, shape [n_samples, n_features]
            Data in sparse matrix format.

        Returns
        -------
        1d ndarray of int
            Predicted class label (0 or 1) per sample.
        """
        pred_proba = self.predict_proba(data)[:, 1]
        # The np.int alias is no longer available in current NumPy; the
        # builtin int is equivalent here
        return pred_proba.round().astype(int)

    def __plot_learning_curves__(self):
        """Plot the mean training loss of every epoch stored in ``history_``."""
        # change default style figure and font size
        plt.rcParams['figure.figsize'] = 8, 6
        plt.rcParams['font.size'] = 12

        # one quick way to check that we've implemented
        # the gradient descent is to ensure that the loss
        # curve is steadily decreasing
        plt.plot(self.history_)
        plt.title('Loss Curve Per Iteration')
        plt.xlabel('Iterations')
        plt.ylabel('Loss')
        plt.show()

## Training

In [None]:
# Train a factorization machine with 20 latent factors on the encoded samples
fm = FactorizationMachine(
    transactions_matrix=sparse_matrix,
    bought=train_df['bought'].values,
    latent_factors=20,
)
fm.fit(learning_rate=0.001)

In [None]:
# Plot the loss recorded after every training epoch
fm.__plot_learning_curves__()

## Prepare for predictions

In [None]:
import pandas as pd
import os 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Position -> article id, over the sorted distinct article ids of the training data
idx_to_train_article_id = dict(enumerate(np.unique(train_df['article_id'])))

# Generate the n best suggestions (12 by default) for a user-products matrix
def generate_suggestions(model, matrix, n = 12):
    """Return the ids of the ``n`` rows of ``matrix`` with the highest "bought" probability.

    Row ``i`` of ``matrix`` is taken to be the article
    ``idx_to_train_article_id[i]``. Each returned id is the article id as a
    string with a ``'0'`` put in front.
    """
    scores = pd.DataFrame(model.predict_proba(matrix), columns=['No', 'Yes'])
    best_rows = scores.sort_values('Yes', ascending=False).iloc[:n].index
    return ['0' + str(idx_to_train_article_id[row]) for row in best_rows]


# Generate submission
def submission(model, res_article, res_customer, customer_matrix):
    """Build ``{customer_id: 'id1 id2 ...'}`` with 12 suggestions per customer.

    Parameters
    ----------
    model : object with a ``predict_proba`` method
        Trained model, passed on to ``generate_suggestions``.
    res_article : any
        Not used; kept so that existing calls keep working.
    res_customer : sequence
        Customer ids to predict for. ``get_user_matrix`` needs every one of
        them to appear in the global ``transactions_df``.
    customer_matrix : any
        Not used; the matrix is rebuilt for each customer.

    Returns
    -------
    dict
        Maps each customer id to its space-separated article ids.
    """
    results = {}
    for customer_id in tqdm(res_customer):
        # get_user_matrix takes the customer id plus both tables, and
        # generate_suggestions takes (model, matrix, n)
        user_matrix = get_user_matrix(customer_id, customers_df, transactions_df)
        recommendations = generate_suggestions(model, user_matrix, n=12)
        # generate_suggestions already puts the leading '0' in front of each id
        results[customer_id] = ' '.join(recommendations)
    return results

def display_articles(articles):
    """Show the images of up to 12 articles in a grid of 3 rows and 4 columns.

    ``articles`` holds article ids as strings. The image of an article is
    looked up under ``<images>/<first three characters>/<article>.jpg``; an
    article without an image file leaves its grid cell empty.
    """
    image_root = '../input/h-and-m-personalized-fashion-recommendations/images/'
    n_rows, n_cols = 3, 4

    figure = plt.figure(figsize=(8, 8))
    for slot, article in enumerate(articles[0:12], start=1):
        image_path = image_root + article[0:3] + '/' + article + '.jpg'
        if os.path.isfile(image_path):
            figure.add_subplot(n_rows, n_cols, slot)
            plt.imshow(mpimg.imread(image_path))
    plt.show()

def get_user_matrix(customer_id, customers_df, transactions_df):
    """Build the feature matrix pairing one customer with every training article.

    Row ``i`` pairs the customer with the ``i``-th distinct article id of the
    global ``train_df`` (sorted, as returned by ``np.unique``). The column
    layout is: customer one-hot, article one-hot, age, Active,
    club_member_status, common_postal_code.

    Parameters
    ----------
    customer_id : str
        Customer to encode.
    customers_df : pandas.DataFrame
        Raw customer table.
    transactions_df : pandas.DataFrame
        Transactions; the customer must have at least one row in it.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per distinct training article.

    Raises
    ------
    IndexError
        If the customer is missing from ``customers_df`` or has no row in
        ``transactions_df``.
    """
    candidate_articles = np.unique(train_df.article_id)
    n_rows = len(candidate_articles)

    # Only this customer's features are needed, so cut both tables down to the
    # customer before preprocessing instead of merging the full tables on
    # every call. The age is filled from the mean of the whole table first,
    # because the mean of the one-row slice would be the customer's own
    # (possibly missing) age.
    mean_age = customers_df['age'].mean()
    own_profile = customers_df.loc[customers_df['customer_id'] == customer_id].copy()
    own_profile['age'] = own_profile['age'].fillna(mean_age)
    own_transactions = transactions_df.loc[transactions_df['customer_id'] == customer_id]
    customer_data = preprocess_customers_df(own_profile, own_transactions).iloc[0]

    rows = np.arange(n_rows)
    ones = np.ones(n_rows)

    # Create CSR matrix for customers: the same customer column in every row
    customer_cols = np.full(n_rows, customers_id_to_idx[customer_id])
    csr_customers = sparse.csr_matrix((ones, (rows, customer_cols)), shape=(n_rows, len(customers_id_to_idx)))

    # Create CSR matrix for articles: one candidate article per row
    article_cols = [articles_id_to_idx[article] for article in candidate_articles]
    csr_articles = sparse.csr_matrix((ones, (rows, article_cols)), shape=(n_rows, len(articles_id_to_idx)))

    # Concatenate matrices in CSR format
    sparse_matrix = sparse.hstack((csr_customers, csr_articles), format='csr')

    # Append the customer features as constant columns
    for column in ('age', 'Active', 'club_member_status', 'common_postal_code'):
        feature_column = np.full((n_rows, 1), customer_data[column])
        sparse_matrix = sparse.hstack((sparse_matrix, feature_column), format='csr')

    return sparse_matrix


In [None]:
# Load the sample submission; its customer_id column lists the customers to predict for
sample = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

## Let's get some sample recommendations

In [None]:
# Preprocess the customer table once; the result is used below to look up a
# customer's info and to skip customers that have no transactions
preprocessed_customers_df = preprocess_customers_df(customers_df, transactions_df)

def get_random_recommendations(n=5):
    """Print info, past purchases and recommendations for ``n`` random customers.

    Customers are drawn at random from ``customers_df``. A drawn customer
    without a row in ``preprocessed_customers_df`` is skipped and does not
    count towards ``n``.
    """
    shown = 0
    while shown < n:
        random_position = np.random.randint(0, len(customers_df))
        customer_id = customers_df.iloc[random_position]['customer_id']

        is_this_customer = preprocessed_customers_df.customer_id == customer_id
        customer_info = preprocessed_customers_df.loc[is_this_customer]
        if not len(customer_info):
            # Nothing known about this customer: draw another one
            continue

        print("Customer info:")
        print(customer_info.iloc[0])

        user_matrix = get_user_matrix(customer_id, customers_df, transactions_df)
        recommendations = generate_suggestions(fm, user_matrix)

        print("Previously bought")
        purchased = train_df.loc[train_df.customer_id == customer_id]['article_id'].values
        display_articles(['0' + str(article) for article in purchased])

        print("Recommendations")
        display_articles(recommendations)

        shown += 1

In [None]:
# Show purchases and recommendations for the default number of random customers (n=5)
get_random_recommendations()

#### Doesn't look too bad, right? Let's continue!

## Predictions for all users

### Time decaying baseline model for default recommendations
As in our previous notebook, we will use the time-decaying baseline model for customers about whom we don't have any information.

In [None]:
# Re-read the full transaction history, this time keeping article_id as a string
data = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={'article_id':str})

In [None]:
import datetime

# Parse the purchase dates, then cut the most recent history into four
# consecutive windows (train1 is the most recent, train4 the oldest).
data["t_dat"] = pd.to_datetime(data["t_dat"])
train1 = data.loc[
    data["t_dat"].ge(datetime.datetime(2020,9,16)) & data["t_dat"].lt(datetime.datetime(2020,9,23))
]
train2 = data.loc[
    data["t_dat"].ge(datetime.datetime(2020,9,8)) & data["t_dat"].lt(datetime.datetime(2020,9,16))
]
train3 = data.loc[
    data["t_dat"].ge(datetime.datetime(2020,8,31)) & data["t_dat"].lt(datetime.datetime(2020,9,8))
]
train4 = data.loc[
    data["t_dat"].ge(datetime.datetime(2020,8,23)) & data["t_dat"].lt(datetime.datetime(2020,8,31))
]

# Per window: customer_id -> list of all purchased article ids (with repetitions)
positive_items_per_user1 = train1.groupby('customer_id')['article_id'].apply(list)
positive_items_per_user2 = train2.groupby('customer_id')['article_id'].apply(list)
positive_items_per_user3 = train3.groupby('customer_id')['article_id'].apply(list)
positive_items_per_user4 = train4.groupby('customer_id')['article_id'].apply(list)

# Popularity over the two most recent windows: each purchase weighs
# 1 / (number of days between the purchase and 2020-09-23)
train = pd.concat([train1, train2], axis=0)
train['pop_factor'] = train['t_dat'].map(
    lambda purchase_date: 1/(datetime.datetime(2020,9,23) - purchase_date).days
)
popular_items_group = train.groupby('article_id')['pop_factor'].sum()

# Article ids ordered from highest to lowest summed weight
_, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.index), reverse=True))

In [None]:
from collections import Counter

# The zip(*...) unpacking above yields a tuple; turn it into a list
popular_items = list(popular_items)

def get_popularity_based_prediction(user):
    user_output = []
    if user in positive_items_per_user1.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user1[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user2.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user2[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user3.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user3[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    if user in positive_items_per_user4.keys():
        most_common_items_of_user = {k:v for k, v in Counter(positive_items_per_user4[user]).most_common()}
        user_output += list(most_common_items_of_user.keys())[:12]
    
    user_output += list(popular_items[:12 - len(user_output)])
    return user_output

### Combine models and get predictions

In [None]:
# Preprocessed customer table, used by get_predictions to tell which customers have transactions
preprocessed_customers_df = preprocess_customers_df(customers_df, transactions_df)

def get_predictions(customers):
    """Produce one space-separated recommendation string per customer.

    Customers that appear in ``preprocessed_customers_df`` are scored with
    the factorization machine ``fm``; all others get the popularity-based
    fallback.

    Parameters
    ----------
    customers : sequence of str
        Customer ids to predict for.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``prediction``, in the order of ``customers``.
    """
    # Build the membership set once; comparing the whole customer_id column
    # for every customer costs a full scan per customer
    known_customers = set(preprocessed_customers_df['customer_id'])

    all_recommendations = []
    for customer_id in tqdm(customers):
        if customer_id in known_customers:
            user_matrix = get_user_matrix(customer_id, customers_df, transactions_df)
            recommendations = generate_suggestions(fm, user_matrix)
        else:
            recommendations = get_popularity_based_prediction(customer_id)
        all_recommendations.append(" ".join(recommendations))
    return pd.DataFrame({'customer_id': customers, 'prediction': all_recommendations})
        

In [None]:
# Predict only for the first 500 customers of the sample submission
predictions = get_predictions(sample['customer_id'].values[:500])

In [None]:
# Display the predictions table
predictions

In [None]:
# Write the predictions to submission.csv without the index column
predictions.to_csv('submission.csv', index=False)

## Conclusion

In this notebook, we have shown how to implement a factorization machine with some additional encoded features. At this point, we want to note that we have also encoded the last 5 bought articles for each customer, which even seemed to yield a lower loss (Version 3 of this notebook), but we had to leave it out due to time constraints. During training, we noticed that the training loss did not decrease much, so this issue could be explored further. We have visualized recommendations and previously bought items for a random sample of customers. We could conclude that, subjectively, the predictions looked good, although some items like socks were predicted every time. Unfortunately, our algorithm is not very performant at inference time, and thus we had to cap the number of predictions at 500.