In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# (A) - Model Building and Experiments

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM, evaluation, cross_validation
from sklearn.metrics.pairwise import cosine_similarity
import pickle as pkl

In [3]:
def create_interaction_matrix(df, customer_col, product_col, purchase_column, norm = False):
    '''
    Function to create an interaction matrix dataframe in a format required by LightFM.
    This function will take the below inputs and return a matrix of size M x N, where M 
    is the number of unique customers and N is the number of unique products.
    
    Inputs -
        - df = Pandas DataFrame containing customer-product interactions (user-item)
        - customer_col = column name containing customer's identifier
        - product_col = column name containing product's identifier
        - purchase_column col = column name containing customer purchase on interaction with a given product
        - norm (optional) = True if a normalization of purchases are needed

    Outputs - 
        - Pandas dataframe with customer-product interactions ready to be fed in a recommendation algorithm
    '''
    interactions = pd.pivot_table(df, values=purchase_column, index=customer_col, columns=product_col).fillna(0)
    if norm:
        interactions = (interactions-interactions.min())/(interactions.max()-interactions.min())

    return interactions


def create_customer_dict(interactions):
    '''
    Function to create a customer dictionary based on their 
    customer_ids and index position in interaction dataset
    
    Inputs - 
        interactions - dataset created by create_interaction_matrix() function
        
    Outputs -
        customer_dict - Dictionary type output with, key = customer_ids, values =  corresponding index position in the interaction matrix
    '''
    customer_id = list(interactions.index)
    customer_dict = {}
    counter = 0 
    for i in customer_id:
        customer_dict[i] = counter
        counter += 1
    return customer_dict


def create_product_dict(interactions):
    '''
    Function to create an product dictionary based on their product_id and product name
    
    Inputs - 
        interactions - dataset created by create_interaction_matrix() function
        
    Outputs -
        product_dict = Dictionary type output with, key = product_ids, values =  corresponding index position in list(columns) in the interaction matrix
    '''
    product_id = list(interactions.columns)
    product_dict = {}
    counter = 0 
    for i in product_id:
        product_dict[i] = counter
        counter += 1
    return product_dict


def train_MF_model(interactions, n_components=30, loss='warp', epoch=30, n_jobs = 4, learning_schedule = 'adadelta', learning_rate=0.0025):
    '''
    Function to run matrix-factorization algorithm on the prepared datasets.
    Inputs -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define product and customer
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
        
    Outputs  -
        Model - Trained model
    '''

    model = LightFM(no_components = n_components, 
                    learning_rate=0.0025, 
                    learning_schedule = learning_schedule,
                    loss=loss,
                    random_state = 42)
    
    model.fit(interactions, 
              epochs=epoch, 
              num_threads = n_jobs,
              verbose = False)
    
    return model


def load_dataset(df_path, date_column):
    """
    This function will be used to load the train and
    test datasets.
    
    Inputs - 
        - df_path - Path of the dataset
        - date_column - Column name of the date column for parsing
    
    Outputs -
        - df - returns a dataframe object
    """

    df = pd.read_csv(df_path, parse_dates=[date_column])
    
    return df


def sample_recommendation_of_products_to_customer(model, 
                                                  sparse_interactions, 
                                                  customer_id, 
                                                  customer_dict, 
                                                  X_train_interactions_columns, 
                                                  X_train_interactions_index,
                                                  product_dict, 
                                                  threshold = 0, 
                                                  top_k_products_to_recommend = 3, 
                                                  top_k_purchased_products = 3):
    '''
    Function to produce top K customer recommendations
    
    Inputs - 
        - model = Trained matrix factorization model
        - sparse_interactions = sparse dataset used for training the model
        - X_train_interactions_columns - columns of the original non csr interaction df
        - X_train_interactions_index - index of the original non csr interaction df
        - customer_id = customer ID for which we need to generate recommendation
        - customer_dict = Dictionary type input containing customer_ids as key and index positions as value
        - product_dict = Dictionary type input containing product_id as key and corresponding index position in list(columns) of interaction_matrix as value
        - threshold = value above which the purchase is favorable in new interaction matrix
        - top_k_products_to_recommend = Number of output recommendation needed
        - top_k_purchased_products = Number of top products the customaer has purchased
        
    Outputs - 
        - top_k_recommendations - returns the top k recommendations for a given customer
        - top_products_purchased_before_by_this_customer - returns top k products purchased by the customer
        
    '''
    
    interactions = pd.DataFrame.sparse.from_spmatrix(sparse_interactions, columns=X_train_interactions_columns)
    interactions.index = X_train_interactions_index
    interactions = interactions.sparse.to_dense()    
    n_customers, n_products = interactions.shape
    customer_index_in_dic = customer_dict[customer_id]
    scores = pd.Series(model.predict(customer_index_in_dic,np.arange(n_products)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    known_products = list(pd.Series(interactions.loc[customer_id,:] [interactions.loc[customer_id,:] > threshold].index).sort_values(ascending=False))
    scores = [x for x in scores if x not in known_products]
    
    top_k_recommendations = scores[0:top_k_products_to_recommend]
    top_products_purchased_before_by_this_customer = known_products[:top_k_purchased_products]
    
    return top_k_recommendations, top_products_purchased_before_by_this_customer


def create_customer_emdedding_distance_matrix(model, sparse_X_train_interactions, X_train_interactions_index, X_train_interactions_columns):
    '''
    Function to create customer-customer distance embedding matrix
    
    Inputs -
        - model = Trained matrix factorization model
        - sparse_X_train_interactions = dataset used for training the model
        - X_train_interactions_index - index of X_train_interactions
        - X_train_interactions_columns - columns of X_train_interactions
        
    Outputs -
        - customer_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w customers
    '''
    
    interactions = pd.DataFrame.sparse.from_spmatrix(sparse_X_train_interactions, columns=X_train_interactions_columns)
    interactions.index = X_train_interactions_index
    interactions = interactions.sparse.to_dense() 
    df_customer_norm_sparse = sparse.csr_matrix(model.user_embeddings)
    similarities = cosine_similarity(df_customer_norm_sparse)
    customer_emdedding_distance_matrix = pd.DataFrame(similarities)
    customer_emdedding_distance_matrix.columns = interactions.index
    customer_emdedding_distance_matrix.index = interactions.index
    return customer_emdedding_distance_matrix


def customer_customer_recommendation(customer_emdedding_distance_matrix, customer_id, customer_dict, n_customers = 5, show = True):
    '''
    Function to create customer-customer recommendation
    
    Inputs - 
        - customer_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w customers
        - customer_id  = customer ID for which we need to generate recommended customers
        - customer_dict = Dictionary type input containing customer_id as key and customer_name as value
        - n_customers = Number of customers needed as an output
    Outputs -
        - recommended_customers = List of recommended customers
    '''
    recommended_customers = list(pd.Series(customer_emdedding_distance_matrix.loc[customer_id,:].sort_values(ascending = False).head(n_customers+1).index[1:n_customers+1]))

    print("Top {} customer similar to the above customer: ".format(n_customers))
    print(recommended_customers)
    
    return recommended_customers


def return_common_products_in_train_and_holdout(X_train, X_holdout):
    '''
    Function to find out common product ids present across X_train
    and X_holdout datasets. This is done to create same features dimensions
    across both the matrices, otherwise I was not able to evaluate the
    holdout datasets.
    
    Inputs - 
        - X_train = Given dataset for training (in pandas DataFrame format)
        - X_holdout = Given dataset for holdout (in pandas DataFrame format)

    Outputs -
        - common_items = List of common products present in both the datasets
    '''
    items_train = list(X_train['Product_num'].unique())
    items_holdout = list(X_holdout['Product_num'].unique())
    common_items = list(set(items_train).intersection(set(items_holdout)))
    
    return common_items    

def evaluate_model(model, data, eval_stage = 'Training'):
    '''
    Function used for evaluating model performances on train set
    and cross validation dataset.
    
    Required Input - 
        - data = data should be in CSR format
        - eval_stage  = Which phase of experiment is this? Train, cross validation, holdout etc.
        - model = Dictionary type input containing customer_id as key and customer_name as value

    Expected Output -
        - prints the AUC, Recall@K and Precisio@K values
    '''
    
    print("{} AUC: ".format(eval_stage), evaluation.auc_score(model, data, num_threads=8).mean())
    print("{} Recall@K: ".format(eval_stage), evaluation.recall_at_k(model, data, k=3, num_threads=8).mean())
    print("{} Precision@K: ".format(eval_stage), evaluation.precision_at_k(model, data, k=3, num_threads=8).mean())
    print()

In [4]:
#X_test_holdout is a holdout dataset, shall be only used for final evaluation and should not be part of the training methodologies
train_df_path = 'data/prepared_datasets_for_training_and_evaluation/reco_assignment_training_merged_duplicates.csv'
date_column = 'Tran_dt'
X_train = load_dataset(train_df_path, date_column)

In [5]:
# Creating interaction matrix for X_train data
X_train_interactions = create_interaction_matrix(df = X_train,
                                         customer_col = 'Customer_num',
                                         product_col = 'Product_num',
                                         purchase_column = 'Total_Tran_qty')

X_train_interactions_columns = list(X_train_interactions.columns)
X_train_interactions_index = list(X_train_interactions.index)

In [6]:
print("Shape of X_train_interactions: ", X_train_interactions.shape)
X_train_interactions.head()

Shape of X_train_interactions:  (9835, 17411)


Product_num,P_10,P_1000,P_10002,P_10007,P_10008,P_10009,P_1001,P_10010,P_10011,P_10012,...,P_9978,P_9979,P_998,P_9980,P_9983,P_9984,P_9988,P_999,P_9991,P_9992
Customer_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_100082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Create Train customer Dict
customer_dict_train = create_customer_dict(interactions=X_train_interactions)

# Create Train product Dict
product_dict_train = create_product_dict(interactions=X_train_interactions)

In [8]:
pkl.dump(customer_dict_train, open('data/interaction_matrices/customer_dict_train.pkl','wb'))
pkl.dump(product_dict_train, open('data/interaction_matrices/product_dict_train.pkl','wb'))
pkl.dump(X_train_interactions_columns, open('data/interaction_matrices/X_train_interactions_columns.pkl','wb'))
pkl.dump(X_train_interactions_index, open('data/interaction_matrices/X_train_interactions_index.pkl','wb'))
sparse.save_npz("data/interaction_matrices/sparse_X_train_interactions.npz", sparse.csr_matrix(X_train_interactions.values))

In [9]:
#Create train and validation COO matrices from X_train_interactions, such that the train and validation coo matrices shapes are equal, i.e. (9835, 13082)
data_train, data_val = cross_validation.random_train_test_split(sparse.csr_matrix(X_train_interactions.values), test_percentage=0.2, random_state=None)

print("Shape of training data matrix: ", data_train.shape)
print("Shape of cross validation data matrix: ", data_val.shape)

Shape of training data matrix:  (9835, 17411)
Shape of cross validation data matrix:  (9835, 17411)


## Experiments with different models

### Model 1

In [10]:
#Training the recommendation model
model_1 = train_MF_model(data_train, n_components=100, loss='warp', 
                                epoch=100, n_jobs = 8, learning_schedule = 'adadelta', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_1, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_1, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_1, open('models/model_1.pkl','wb'))

Training AUC:  0.9995824
Training Recall@K:  0.17852685537424398
Training Precision@K:  0.6455358

Cross Validation AUC:  0.7101558
Cross Validation Recall@K:  0.0074578005403751115
Cross Validation Precision@K:  0.008595876



### Model 2

In [11]:
#Training the recommendation model
model_2 = train_MF_model(data_train, n_components=100, loss='warp', 
                                epoch=500, n_jobs = 8, learning_schedule = 'adadelta', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_2, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_2, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_2, open('models/model_2.pkl','wb'))

Training AUC:  0.99993265
Training Recall@K:  0.29730100909425083
Training Precision@K:  0.8859892

Cross Validation AUC:  0.6813255
Cross Validation Recall@K:  0.00208377488810257
Cross Validation Precision@K:  0.0021392454



### Model 3

In [12]:
#Training the recommendation model
model_3 = train_MF_model(data_train, n_components=100, loss='warp', 
                                epoch=1000, n_jobs = 8, learning_schedule = 'adadelta', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_3, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_3, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_3, open('models/model_3.pkl','wb'))

Training AUC:  0.99998754
Training Recall@K:  0.32349274027823655
Training Precision@K:  0.94191575

Cross Validation AUC:  0.66376823
Cross Validation Recall@K:  0.0009710452412422349
Cross Validation Precision@K:  0.0007779075



### Model 4

In [13]:
#Training the recommendation model
model_4 = train_MF_model(data_train, n_components=100, loss='logistic', 
                                epoch=1000, n_jobs = 8, learning_schedule = 'adadelta', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_4, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_4, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_4, open('models/model_4.pkl','wb'))

Training AUC:  0.86448205
Training Recall@K:  0.013351382661447683
Training Precision@K:  0.050926402

Cross Validation AUC:  0.8254915
Cross Validation Recall@K:  0.008848116832992147
Cross Validation Precision@K:  0.013691171



### Model 5

In [14]:
#Training the recommendation model
model_8 = train_MF_model(data_train, n_components=100, loss='warp', 
                                epoch=1000, n_jobs = 8, learning_schedule = 'adagrad', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_8, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_8, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_8, open('models/model_8.pkl','wb'))

Training AUC:  0.8827915
Training Recall@K:  0.021307612412536127
Training Precision@K:  0.14349806

Cross Validation AUC:  0.8412493
Cross Validation Recall@K:  0.021200752532024802
Cross Validation Precision@K:  0.036367174



### Model 6

In [15]:
#Training the recommendation model
model_6 = train_MF_model(data_train, n_components=100, loss='warp-kos', 
                                epoch=1000, n_jobs = 8, learning_schedule = 'adadelta', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_6, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_6, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_6, open('models/model_6.pkl','wb'))

Training AUC:  0.97013414
Training Recall@K:  0.32786652633241364
Training Precision@K:  0.95516974

Cross Validation AUC:  0.6499689
Cross Validation Recall@K:  0.0007130818099312848
Cross Validation Precision@K:  0.00035005834



### Model 7

In [16]:
#Training the recommendation model
model_7 = train_MF_model(data_train, n_components=100, loss='logistic', 
                                epoch=1000, n_jobs = 8, learning_schedule = 'adagrad', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_7, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_7, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_7, open('models/model_7.pkl','wb'))

Training AUC:  0.8530314
Training Recall@K:  0.0180677619825127
Training Precision@K:  0.113531284

Cross Validation AUC:  0.8377974
Cross Validation Recall@K:  0.019367296423577318
Cross Validation Precision@K:  0.032244265



### Model 8

In [18]:
#Training the recommendation model
model_5 = train_MF_model(data_train, n_components=100, loss='bpr', 
                                epoch=1000, n_jobs = 8, learning_schedule = 'adadelta', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(model_5, data_train, eval_stage = 'Training')

#Evaluation on Cross Validation Data
evaluate_model(model_5, data_val, eval_stage = 'Cross Validation')

#Save model to disk for offline usage
pkl.dump(model_5, open('models/model_5.pkl','wb'))

Training AUC:  0.9998785
Training Recall@K:  0.3289971369604419
Training Precision@K:  0.90068644

Cross Validation AUC:  0.5312512
Cross Validation Recall@K:  0.0006436677114821878
Cross Validation Precision@K:  0.0007027407



# (B) - Final Model Selection based on Cross Validation AUC


From the above experiments, I have selected the model 5 which is giving highest cross validation AUC. Now, we will use the whole train and cross validation data to build our final model, and set the hyperparameters value to the value of the best model obtained from above. We will basically refit the model with the whole training dataset that is available. We will use this model for completing our assignment task. 

NOTE: I did not use RandomizedSearchCV for hyperparamter tuning due to time constraints. I am doing the hyperparameter tuning manually.

In [17]:
sparse_X_train_interactions = sparse.load_npz('data/interaction_matrices/sparse_X_train_interactions.npz')

#Training the recommendation model on the whole dataset to create the final model (Based on Model 5's hyper-parameters)
final_trained_model = train_MF_model(sparse_X_train_interactions, n_components=100, loss='warp', 
                                epoch=1000, n_jobs = 8, learning_schedule = 'adagrad', learning_rate=0.0025)

#Evaluation on Training Data
evaluate_model(final_trained_model, sparse_X_train_interactions, eval_stage = 'Training')

#Save model to disk for offline usage
pkl.dump(final_trained_model, open('models/final_trained_model.pkl','wb'))

Training AUC:  0.87198097
Training Recall@K:  0.021025623321163815
Training Precision@K:  0.16790375



# (C) -  Assignment Tasks

## Load all the artefacts related to the final trained model

In [18]:
customer_dict_train = pkl.load(open('data/interaction_matrices/customer_dict_train.pkl','rb'))
product_dict_train = pkl.load(open('data/interaction_matrices/product_dict_train.pkl','rb'))
X_train_interactions_columns = pkl.load(open('data/interaction_matrices/X_train_interactions_columns.pkl','rb'))
X_train_interactions_index = pkl.load(open('data/interaction_matrices/X_train_interactions_index.pkl','rb'))
final_trained_model = pkl.load(open('models/final_trained_model.pkl','rb'))
sparse_X_train_interactions = sparse.load_npz('data/interaction_matrices/sparse_X_train_interactions.npz')

## 1. What are the next 3 products to be purchased by a customer?

In [19]:
def recommend_products(model, interactions, customer_id, 
                       customer_dict, product_dict, 
                       X_train_interactions_columns, X_train_interactions_index,
                       top_k_products_to_recommend=3, 
                       top_k_purchased_products=3):
    
    top_k_recommendations, top_products_purchased_before_by_this_customer = sample_recommendation_of_products_to_customer(model = model, 
                                                                                                                        sparse_interactions = interactions, 
                                                                                                                        customer_id = customer_id, 
                                                                                                                        customer_dict = customer_dict,
                                                                                                                        product_dict = product_dict, 
                                                                                                                        X_train_interactions_columns = X_train_interactions_columns,
                                                                                                                        X_train_interactions_index = X_train_interactions_index,
                                                                                                                        threshold = 0,
                                                                                                                        top_k_products_to_recommend = top_k_products_to_recommend,
                                                                                                                        top_k_purchased_products = top_k_purchased_products)
    
    print("Top {} products recommendations for customer id {}: ".format(top_k_products_to_recommend, customer_id), top_k_recommendations)
    print("Top {} most known products purchased by customer-id {} before: ".format(top_k_purchased_products, customer_id), top_products_purchased_before_by_this_customer)

#Randmoly pick up a customer ID from the holdout dataset
#customer_id = X_train_interactions['Customer_num'].sample(1).values[0]
customer_id = 'C_1005061'
model = final_trained_model
interactions = sparse_X_train_interactions
customer_dict = customer_dict_train
product_dict = product_dict_train
top_k_products_to_recommend = 3
top_k_purchased_products = 3

recommend_products(model = model, 
                    interactions = interactions, 
                    customer_id = customer_id, 
                    customer_dict = customer_dict,
                    product_dict = product_dict, 
                    X_train_interactions_columns = X_train_interactions_columns,
                    X_train_interactions_index = X_train_interactions_index,
                    top_k_products_to_recommend = top_k_products_to_recommend,
                    top_k_purchased_products = top_k_purchased_products)

Top 3 products recommendations for customer id C_1005061:  ['P_16717', 'P_7819', 'P_12777']
Top 3 most known products purchased by customer-id C_1005061 before:  ['P_9590', 'P_9300', 'P_9160']


## 2. Who are the top 5 similar customers to each customer?

In [20]:
customer_id = 'C_516469'
customer_emdedding_distance_matrix = create_customer_emdedding_distance_matrix(model, sparse_X_train_interactions, X_train_interactions_index, X_train_interactions_columns)
recommended_customers = customer_customer_recommendation(customer_emdedding_distance_matrix, customer_id, customer_dict_train, n_customers = 5, show = True)

Top 5 customer similar to the above customer: 
['C_2173177', 'C_2209071', 'C_15982', 'C_751312', 'C_807916']


## 3. Recall calculation on the basis of the shared holdout set.

NOTE : The shape of the interaction matrix of the training dataset is (9835, 17411), and the shape of the interaction matrix of the holdout dataset is (9835, 16230). For evaluation in LightFM, the above two shapes should be same. Hence I added another dummy matrix filled with zeroes to make the final shape of the holdout interaction matrix to be (9835, 17411). The dummy variables in the holdout dataset, can be intuitively thought of as new products being added, but no customer has purchased that product till now, hence the customer-product interaction will be zero.

In [21]:
#X_test_holdout is a holdout dataset, shall be only used for final evaluation and should not be part of the training methodologies
holdout_df_path = 'data/prepared_datasets_for_training_and_evaluation/reco_assignment_holdout_merged_duplicates.csv'
date_column = 'Tran_dt'
X_holdout = load_dataset(holdout_df_path, date_column)

# Creating interaction matrix for X_holdout data
X_holdout_interactions = create_interaction_matrix(df = X_holdout,
                                                    customer_col = 'Customer_num',
                                                    product_col = 'Product_num',
                                                    purchase_column = 'Total_Tran_qty')

X_holdout_interactions.shape

(9835, 16230)

In [22]:
items_train = X_train_interactions_columns
items_holdout = list(X_holdout_interactions.columns)

N = len(items_train) - len(items_holdout) #num_of_dummy_columns_to_add_to_holdout
M = X_holdout_interactions.shape[0]

dummy_columns = ["Dummy_{}".format(i) for i in range(1, N+1)]

zero_data = np.zeros(shape=(M,N))
dummy = pd.DataFrame(zero_data, columns=dummy_columns)

In [23]:
A = X_holdout_interactions.values
B = dummy.values

C = np.hstack([A,B])

X_holdout_interactions_modified = pd.DataFrame(C)
X_holdout_interactions_modified.columns = list(X_holdout_interactions.columns) + dummy_columns
X_holdout_interactions_modified.index = X_holdout_interactions.index
print("Shape of X_holdout_interactions_modified: ",X_holdout_interactions_modified.shape)
X_holdout_interactions_modified.head()

Shape of X_holdout_interactions_modified:  (9835, 17411)


Unnamed: 0_level_0,P_10,P_1000,P_10001,P_10002,P_10007,P_10010,P_10011,P_10012,P_10013,P_10014,...,Dummy_1172,Dummy_1173,Dummy_1174,Dummy_1175,Dummy_1176,Dummy_1177,Dummy_1178,Dummy_1179,Dummy_1180,Dummy_1181
Customer_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_100082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C_1001329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#AUC and Recall@K of the final trained model on the holdout dataset
evaluate_model(final_trained_model, sparse.csr_matrix(X_holdout_interactions_modified.values), eval_stage = 'Holdout Dataset')

Holdout Dataset AUC:  0.5201246
Holdout Dataset Recall@K:  0.00014755065136246246
Holdout Dataset Precision@K:  0.001084562

