## This experiment was performed using LightFM, which is a very popular recommender module that supports different data modalities such as text, image, graph, etc. Please check out the official documentation at the link mentioned below:  

The objective of this experiment is to find the hyperparameters that give the best precision@12 for the problem at hand.

Please refer to the following notebooks, where you can see the different experiments based on LightFM:  
1. Light FM with only customer article interactions:  
Link: https://www.kaggle.com/rickykonwar/h-m-lightfm-nofeatures  

2. Light FM with customer article interaction + 1 article feature (product group name)  
Link: https://www.kaggle.com/rickykonwar/h-m-lightfm-1articlefeature  

3. Light FM with customer article interaction + 1 article feature (product group name) + article description embeddings  
Link: https://www.kaggle.com/rickykonwar/h-m-lightfm-2articlefeatures  

Link to LightFM documentation:  
https://making.lyst.com/lightfm/docs/home.html  

This notebook incorporates hyperparameter tuning for the problem statement.

Hope you like this notebook, please feel free to vote for this notebook

## Importing Required Libraries

In [None]:
# Importing Libraries
import sys, os
import re
import tqdm
import time
import pickle
import random
import datetime
import itertools

import pandas as pd
import numpy as np
import scipy.sparse as sparse
%matplotlib inline
import matplotlib.pyplot as plt

# lightfm 
from lightfm import LightFM # model
from lightfm.evaluation import precision_at_k
from lightfm.cross_validation import random_train_test_split

# multiprocessing for inferencing
from multiprocessing import Pool

In [None]:
# Limit OpenBLAS to a single thread. OpenBLAS reads the upper-case
# OPENBLAS_NUM_THREADS environment variable; the previous key
# ("openblas_set_num_threads") is the name of a C API function, not an
# environment variable, so setting it had no effect.
# NOTE(review): OpenBLAS usually reads this when the library is first loaded,
# and numpy is already imported in the cell above -- confirm whether this
# assignment needs to move before the imports to take effect.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

# Locations of the competition CSV files inside the Kaggle input directory.
data_path = r'../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
customer_data_path = r'../input/h-and-m-personalized-fashion-recommendations/customers.csv'
article_data_path = r'../input/h-and-m-personalized-fashion-recommendations/articles.csv'
submission_data_path = r'../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'

In [None]:
# Data Extraction
def create_data(datapath, data_type=None):
    """Read one CSV file into a pandas DataFrame.

    Parameters
    ----------
    datapath : str or file-like
        Anything accepted by ``pandas.read_csv``.
    data_type : {None, 'transaction', 'article'}
        Selects the parsing options:

        * ``None`` -- read with pandas defaults.
        * ``'transaction'`` -- read ``article_id`` as ``str`` (so leading
          zeros are kept) and parse ``t_dat`` as dates.
        * ``'article'`` -- read ``article_id`` as ``str``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values. Previously this
        case fell through to ``return df`` with ``df`` unbound.
    """
    if data_type is None:
        return pd.read_csv(datapath)
    if data_type == 'transaction':
        return pd.read_csv(datapath, dtype={'article_id': str}, parse_dates=['t_dat'])
    if data_type == 'article':
        return pd.read_csv(datapath, dtype={'article_id': str})
    raise ValueError(
        "Unsupported data_type {!r}; expected None, 'transaction' or 'article'".format(data_type)
    )

In [None]:
#%%time

# Load all sales data (per the original author: 3 years, from 2018 to 2020).
# Also, article_id is read as a string column; otherwise the leading zeros
# would be dropped while reading that column's values.
transactions_data=create_data(data_path, data_type='transaction')
print(transactions_data.shape)

# Report the number of distinct values in each key column.
print(str(len(transactions_data['t_dat'].drop_duplicates())) + "-total No of unique transactions dates in data sheet")
print(str(len(transactions_data['customer_id'].drop_duplicates())) + "-total No of unique customers ids in data sheet")
print(str(len(transactions_data['article_id'].drop_duplicates())) + "-total No of unique article ids courses names in data sheet")
print(str(len(transactions_data['sales_channel_id'].drop_duplicates())) + "-total No of unique sales channels in data sheet")

In [None]:
# Preview the first rows of the transactions frame.
transactions_data.head()

In [None]:
# Show column dtypes and memory usage of the transactions frame.
transactions_data.info()

In [None]:
%%time

# Load all customers (read with pandas defaults, no dtype overrides).
customer_data=create_data(customer_data_path)
print(customer_data.shape)

# Report how many distinct customer ids the customer file contains.
print(str(len(customer_data['customer_id'].drop_duplicates())) + "-total No of unique customers ids in customer data sheet")

In [None]:
# Preview the first rows of the customer frame.
customer_data.head()

In [None]:
# Show column dtypes and memory usage of the customer frame.
customer_data.info()

In [None]:
%%time

# Load all articles (article_id is read as a string via data_type='article').
article_data=create_data(article_data_path, data_type='article')
print(article_data.shape)

# Report how many distinct article ids the article file contains.
print(str(len(article_data['article_id'].drop_duplicates())) + "-total No of unique article ids in article data sheet")

In [None]:
# Preview the first rows of the article frame.
article_data.head()

In [None]:
# Show column dtypes and memory usage of the article frame.
article_data.info()

## Capturing Seasonal Effect by Limiting the transaction date
Based on notebook with link: https://www.kaggle.com/tomooinubushi/folk-of-time-is-our-best-friend/notebook

In [None]:
# Keep only transactions dated strictly after 2020-08-21. The variable is
# rebound, so earlier rows are no longer available from here on.
transactions_data = transactions_data[transactions_data['t_dat'] > '2020-08-21']
transactions_data.shape

## Splitting transaction data to training and validation set

In [None]:
# Date-based split: the last 7 days of the (already filtered) data form the
# test frame, everything up to and including split_date forms the train frame.
train_start_date = transactions_data.t_dat.min()
split_date = transactions_data.t_dat.max() - datetime.timedelta(days = 7)
# The lower bound is the minimum date itself, so only the upper bound actually filters rows.
train_transaction_data = transactions_data[(transactions_data.t_dat <= split_date) & (transactions_data.t_dat >= train_start_date)].copy()
test_transaction_data = transactions_data[transactions_data.t_dat > split_date].copy()

# NOTE(review): in the visible part of this notebook these two frames are only
# printed here; the model is later trained on a random split of the aggregated data.
print(train_transaction_data.shape)
print(test_transaction_data.shape)

## Aggregating Customers and Articles irrespective of transaction dates

In [None]:
# Collapse to one row per (customer, article) pair: 'price' becomes the summed
# price and 't_dat' becomes the number of transaction rows for that pair
# (a count, no longer a date).
transactions_data = transactions_data.groupby(['customer_id','article_id']).agg({'price':'sum','t_dat':'count'}).reset_index()
# Fix the column order explicitly.
transactions_data = transactions_data[['customer_id','article_id','price','t_dat']]

## Generating user and article index mapping dictionaries

In [None]:
def get_customers_list():
    """Return the distinct customer ids of ``customer_data``, sorted ascending.

    Reads the notebook-level ``customer_data`` frame; takes no arguments.
    """
    distinct_customer_ids = customer_data['customer_id'].unique()
    return np.sort(distinct_customer_ids)

def get_articles_list():
    """Return the distinct article ids of ``article_data``.

    Reads the notebook-level ``article_data`` frame. Unlike the customer
    list, the result is not sorted: ``unique()`` keeps first-appearance order.
    """
    return article_data['article_id'].unique()

def id_mappings(customers_list, articles_list):
    """Build id <-> positional-index lookups for customers and articles.

    Each id is assigned the position at which it occurs in its input
    sequence (starting at 0). If an id occurs more than once, the forward
    mapping keeps the last position while the reverse mapping keeps every
    position.

    Parameters
    ----------
    customers_list : iterable
        Customer ids, in the order that defines their indices.
    articles_list : iterable
        Article ids, in the order that defines their indices.

    Returns
    -------
    tuple of four dicts
        ``(customer_to_index, index_to_customer,
        article_to_index, index_to_article)``.
    """
    # Reverse lookups come straight from enumerate; forward lookups invert them.
    index_to_customer_mapping = dict(enumerate(customers_list))
    customer_to_index_mapping = {
        customer: position for position, customer in index_to_customer_mapping.items()
    }

    index_to_article_mapping = dict(enumerate(articles_list))
    article_to_index_mapping = {
        article: position for position, article in index_to_article_mapping.items()
    }

    return (
        customer_to_index_mapping,
        index_to_customer_mapping,
        article_to_index_mapping,
        index_to_article_mapping,
    )

In [None]:
# Materialise the customer and article id lists that define the matrix axes.
customers = get_customers_list()
articles = get_articles_list()

In [None]:
# Display the customer id array.
customers

In [None]:
# Display the article id array.
articles

In [None]:
# Generate the id <-> index mappings; LightFM works on integer row/column
# indices, so the raw customer and article ids must be translated first.
customer_to_index_mapping, index_to_customer_mapping, \
article_to_index_mapping, index_to_article_mapping = id_mappings(customers, articles)

## Generate Customer Article Interaction Matrix

In [None]:
def get_customer_article_interaction(customer_article_amt_df, agg_col_name='price'):
    """Rename the aggregate columns and replace the chosen one by category codes.

    The input frame is not modified; a renamed copy is returned. (The previous
    implementation re-assigned three columns to themselves on the caller's
    frame, which changed nothing but raised ``SettingWithCopyWarning`` when a
    column slice was passed in.)

    Parameters
    ----------
    customer_article_amt_df : pandas.DataFrame
        Must contain ``customer_id``, ``article_id`` and ``agg_col_name``.
    agg_col_name : str, default 'price'
        ``'price'`` encodes ``total_amount_spent``; ``'t_dat'`` encodes
        ``total_no_of_transactions``. Any other existing column name only
        triggers the renaming step.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``price`` renamed to ``total_amount_spent`` and
        ``t_dat`` renamed to ``total_no_of_transactions``; the selected column
        holds integer category codes (the rank of each distinct value in
        sorted order).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Keep the original contract: a missing required column is a KeyError.
    for required_column in ("customer_id", "article_id", agg_col_name):
        if required_column not in customer_article_amt_df.columns:
            raise KeyError(required_column)

    renamed_columns = {'price': 'total_amount_spent', 't_dat': 'total_no_of_transactions'}
    interaction_df = customer_article_amt_df.rename(columns=renamed_columns)

    # Replace the selected aggregate by its category codes.
    # NOTE(review): codes start at 0, so the smallest distinct value becomes a
    # 0 weight in the interaction matrix built from this frame -- confirm
    # that is intended (a "+ 1" may be wanted).
    encoded_column = renamed_columns.get(agg_col_name)
    if encoded_column is not None:
        interaction_df[encoded_column] = interaction_df[encoded_column].astype('category').cat.codes

    return interaction_df

def get_interaction_matrix(df, df_column_as_row, df_column_as_col, 
                        df_column_as_value, row_indexing_map, col_indexing_map):
    """Assemble a sparse COO matrix from three columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per stored matrix entry.
    df_column_as_row, df_column_as_col : str
        Columns holding the ids translated to row / column positions.
    df_column_as_value : str
        Column holding the stored values.
    row_indexing_map, col_indexing_map : mapping
        id -> integer position. Their lengths define the matrix shape, so
        ids without any entry in ``df`` still get a row / column.

    Returns
    -------
    scipy.sparse.coo_matrix
        Shape ``(len(row_indexing_map), len(col_indexing_map))``.

    Raises
    ------
    KeyError
        If ``df`` contains an id that is missing from its mapping.
    """
    matrix_shape = (len(row_indexing_map), len(col_indexing_map))

    # Element-wise id -> position lookup; an unknown id raises KeyError.
    row_positions = df[df_column_as_row].map(row_indexing_map.__getitem__).values
    col_positions = df[df_column_as_col].map(col_indexing_map.__getitem__).values
    entries = df[df_column_as_value].values

    return sparse.coo_matrix((entries, (row_positions, col_positions)), shape=matrix_shape)


### Customer Article Interaction based on Amount Spent

In [None]:
# Create customer and article interaction dataframe based on total amount spent.
# Only the id columns and the summed 'price' column are passed in; the default
# agg_col_name='price' selects that column for category encoding.
customer_to_article_amt = get_customer_article_interaction(customer_article_amt_df = transactions_data[['customer_id','article_id','price']])
print(customer_to_article_amt.shape)                                                  

In [None]:
# Preview the encoded amount-spent interaction frame.
customer_to_article_amt.head()

In [None]:
# Generate the customer x article interaction matrix weighted by the encoded
# amount spent. It is built from all aggregated transactions; the train/test
# split happens later on this matrix.
customer_to_article_interaction_amt = get_interaction_matrix(customer_to_article_amt, "customer_id", "article_id", "total_amount_spent", \
                                                            customer_to_index_mapping, article_to_index_mapping)

In [None]:
# Display the sparse matrix summary (shape, stored elements, format).
customer_to_article_interaction_amt

### Customer Article Interaction based on Transaction Counts

In [None]:
# Create customer and article interaction dataframe based on total number of transactions made.
# After the earlier aggregation 't_dat' holds a per-pair row count, which is
# what gets category-encoded here.
customer_to_article_tdate = get_customer_article_interaction(customer_article_amt_df = transactions_data[['customer_id','article_id','t_dat']],
                                                            agg_col_name='t_dat')
print(customer_to_article_tdate.shape)     

In [None]:
# Preview the encoded transaction-count interaction frame.
customer_to_article_tdate.head()

In [None]:
# Generate the customer x article interaction matrix weighted by the encoded
# transaction count. It is built from all aggregated transactions; the
# train/test split happens later on this matrix.
customer_to_article_interaction_tdat = get_interaction_matrix(customer_to_article_tdate, "customer_id", "article_id", "total_no_of_transactions", \
                                                            customer_to_index_mapping, article_to_index_mapping)

In [None]:
# Display the sparse matrix summary (shape, stored elements, format).
customer_to_article_interaction_tdat

## Hyperparameter Tuning using Random Search

In [None]:
def sample_hyperparameters():
    """Yield an endless stream of randomly drawn hyperparameter dicts.

    Every dict is drawn from numpy's global random state and contains, in
    this order: ``no_components``, ``learning_schedule``, ``loss``,
    ``learning_rate``, ``item_alpha``, ``user_alpha``, ``max_sampled`` and
    ``num_epochs``. Integer ranges exclude their upper bound.
    """
    rng = np.random
    while True:
        # The draws happen in a fixed order so that a seeded run is reproducible.
        candidate = {}
        candidate["no_components"] = rng.randint(16, 64)
        candidate["learning_schedule"] = rng.choice(["adagrad", "adadelta"])
        candidate["loss"] = rng.choice(["bpr", "warp", "warp-kos"])
        candidate["learning_rate"] = rng.exponential(0.05)
        candidate["item_alpha"] = rng.exponential(1e-8)
        candidate["user_alpha"] = rng.exponential(1e-8)
        candidate["max_sampled"] = rng.randint(5, 15)
        candidate["num_epochs"] = rng.randint(5, 50)
        yield candidate

### Sampling Hyperparameters Function

### Perform Random Search

The train and test interactions are passed to the function, along with the number of random samples to draw and the number of threads to use for model training.  

The output is the precision score, the set of hyperparameters, and the model.

In [None]:
def random_search(train_interactions, test_interactions, num_samples=50, num_threads=1):
    """Train one LightFM model per sampled hyperparameter set and score it.

    Parameters
    ----------
    train_interactions : sparse matrix
        Interactions used to fit each model; also handed to
        ``precision_at_k`` as ``train_interactions``.
    test_interactions : sparse matrix
        Interactions used to compute precision@12.
    num_samples : int, default 50
        Number of hyperparameter sets taken from ``sample_hyperparameters``.
    num_threads : int, default 1
        Thread count forwarded to both fitting and evaluation.

    Yields
    ------
    tuple
        ``(score, hyperparams, model)`` where ``score`` is the mean
        precision@12, ``hyperparams`` is the full sampled dict (including
        ``num_epochs``) and ``model`` is the fitted LightFM instance.
    """
    candidates = itertools.islice(sample_hyperparameters(), num_samples)
    for hyperparams in candidates:
        # num_epochs is a fit() argument, not a LightFM constructor argument.
        model_kwargs = dict(hyperparams)
        num_epochs = model_kwargs.pop("num_epochs")

        model = LightFM(**model_kwargs)
        model.fit(train_interactions, epochs=num_epochs, num_threads=num_threads)

        per_user_precision = precision_at_k(
            model,
            test_interactions,
            train_interactions=train_interactions,
            k=12,
            num_threads=num_threads,
        )
        score = per_user_precision.mean()

        print(score)

        yield (score, hyperparams, model)

### Initiating Storage Dictionary

In [None]:
# Collects the best score and hyperparameters per interaction weighting.
optimized_dict={}

### Splitting the primary dataset into train and test sets based on amount spent

In [None]:
# Random 80/20 split of the amount-spent interaction matrix (fixed seed 42).
sparse_customer_article_train, sparse_customer_article_test = random_train_test_split(customer_to_article_interaction_amt, test_percentage=0.2, random_state=42)

In [None]:
# Run the random search (default num_samples=50) and keep the tuple whose
# first element, the precision@12 score, is highest.
(score, hyperparams, model) = max(random_search(train_interactions = sparse_customer_article_train, 
                                                test_interactions = sparse_customer_article_test, 
                                                num_threads = 4), key=lambda x: x[0])

In [None]:
# Report the best score and the hyperparameters that produced it.
print("Best score {} at {}".format(score, hyperparams))

In [None]:
# Store the best result for the amount-spent weighting (the model itself is not stored).
optimized_dict['Amount_Spent'] = {'score': score, 
                                  'params': hyperparams}

### Splitting the primary dataset into train and test sets based on transaction count

In [None]:
# Random 80/20 split of the transaction-count interaction matrix (fixed seed 42).
# This rebinds the train/test names used for the amount-spent search.
sparse_customer_article_train, sparse_customer_article_test = random_train_test_split(customer_to_article_interaction_tdat, test_percentage=0.2, random_state=42)

In [None]:
# Display the train split summary.
sparse_customer_article_train

In [None]:
# Display the test split summary.
sparse_customer_article_test

In [None]:
# Repeat the random search on the transaction-count split; this overwrites
# score, hyperparams and model from the amount-spent search.
(score, hyperparams, model) = max(random_search(train_interactions = sparse_customer_article_train, 
                                                test_interactions = sparse_customer_article_test, 
                                                num_threads = 4), key=lambda x: x[0])

In [None]:
# Report the best score and hyperparameters for the transaction-count weighting.
print("Best score {} at {}".format(score, hyperparams))

In [None]:
# Store the best result for the transaction-count weighting.
optimized_dict['Transaction_Counts'] = {'score': score, 
                                       'params': hyperparams}

In [None]:
# Show the collected best results for both weightings.
print(optimized_dict)

## Saving the Optimized Params

In [None]:
# Persist the best scores and hyperparameters to the working directory.
# NOTE(review): the stored values come straight from numpy's samplers, so
# loading the pickle presumably requires numpy -- confirm for downstream use.
with open('optimized_dict.pkl', 'wb') as f:
    pickle.dump(optimized_dict, f)