In [1]:
## load libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sps
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

In [2]:
# load product and sample file
products = pd.read_csv('products.csv')

In [3]:
sample = pd.read_csv('sampleSubmission.csv')


In [4]:
products['transactionDate'] = pd.to_datetime(products['transactionDate'])
mask = (products['transactionDate'] >= '2016-12-01')
products = products.loc[mask]

In [5]:
products.sort_values('transactionDate',inplace=True)
products = products.reset_index(drop=True)

In [6]:
## take only those customers which are in sample submission file
products_2 = products[products['customerID'].isin(sample['customerID'])]

## remove missing values # 4
products_2 = products_2[~pd.isnull(products_2['product_code'])]

## convert type of product code
products_2['product_code'] = products_2['product_code'].astype(np.int64)

In [8]:
products_2.head()

Unnamed: 0,customerID,DOB,Gender,State,PinCode,transactionDate,store_code,store_description,till_no,transaction_number_by_till,promo_code,promotion_description,product_code,product_description,sale_price_after_promo,discountUsed
0,BBID_20482218,1945-10-22,male,PUNJAB,141012,2016-12-01,3692,BB-LUDHIANA-FEROZEPUR-MF,7,65291,NONPROMO,,108100362,ONION LOOSE,14.13,"BBProfitClub,Payback"
1,BBID_20411446,1964-11-20,female,MADHYA PRADESH,452001,2016-12-01,2906,BB-INDORE-TREASURE ISLAND,6,31320,NONPROMO,,300255445,SAFAL FROZEN PEAS GREEN 600g,75.0,Payback
2,BBID_20411446,1964-11-20,female,MADHYA PRADESH,452001,2016-12-01,2906,BB-INDORE-TREASURE ISLAND,6,31320,NONPROMO,,1000172126,PATANJALI BALM 25GM,40.0,Payback
3,BBID_20411446,1964-11-20,female,MADHYA PRADESH,452001,2016-12-01,2906,BB-INDORE-TREASURE ISLAND,6,31320,0001215948,IZ 6 RS OFF REG Health Oil,300551592,SUNDROP SUPERLITE SUN OIL PP 1L,184.0,Payback
4,BBID_20411446,1964-11-20,female,MADHYA PRADESH,452001,2016-12-01,2906,BB-INDORE-TREASURE ISLAND,6,31320,NONPROMO,,1000446410,PATANJALI BLACK PEPPER WHOLE 100g,120.0,Payback


In [9]:
## these customers are not in train, so we'll predict None for them at last
misfit_customers = list(set(sample['customerID']) - set(products_2['customerID']))

In [10]:
## create product list by customers
products_2 = products_2.groupby('customerID')['product_code'].apply(lambda x: x.tolist()).reset_index()

In [12]:
products_2.head()## remove duplicate products
products_2['product_code'] = products_2['product_code'].map(lambda x: list(set(x)))

Unnamed: 0,customerID,product_code
0,BBID_20410000,"[300600921, 300667170, 620001258]"
1,BBID_20410002,"[1000601263, 108100134, 530000111, 1000230660,..."
2,BBID_20410006,"[300481740, 300144674, 108024362, 108032048, 1..."
3,BBID_20410007,"[1000001730, 300413862, 1000214904, 1000626523..."
4,BBID_20410012,"[1000479471003, 300989207, 1000479347004, 1000..."


In [13]:
## remove duplicate products
products_2['product_code'] = products_2['product_code'].map(lambda x: list(set(x)))

In [14]:
## fix product max len to 20 (we'll pick the last 20 i.e most recent ones)
products_2['product_code'] = products_2['product_code'].map(lambda x: x[-20:])

In [15]:
### create a list of customers & products

customerIDs = []
product_codes = []

for index, row in products_2.iterrows():
    #if index % 10 == 0:
    #    print (index)
    ls_len = len(row['product_code'])
    customerIDs.extend(np.repeat(row['customerID'], ls_len))
    product_codes.extend(row['product_code'])

In [17]:
## encode values 

from sklearn.preprocessing import LabelEncoder

lbl = LabelEncoder() ## for customers
customerIDs = lbl.fit_transform(customerIDs)
 
lbl2 = LabelEncoder() ## for products
product_codes = lbl2.fit_transform(product_codes)

In [20]:
n_unique_users = len(set(customerIDs))
n_unique_products = len(set(product_codes))

row = customerIDs
col = product_codes

vals = np.repeat(1, len(row))

## this matrix has information about a user bought which all products.
user_product_matrix = sps.csr_matrix((vals, (row, col)), shape=(n_unique_users, n_unique_products))

## this matrix has information a product got bought along with which other products, and how many times
product_cooccurence_matrix = (user_product_matrix.T * user_product_matrix)

## set diagonal equals to zero since we are not interested in knowing the count of a product with itself
product_cooccurence_matrix.setdiag(np.repeat(0, product_cooccurence_matrix.shape[0]))

In [21]:
## create a data frame of encoded values
product_summary = pd.DataFrame({'customerID':customerIDs, 'product_code':product_codes})
product_summary = product_summary.groupby('customerID')['product_code'].agg(lambda x:x.tolist()).reset_index().rename(columns = {0:'product_collection'})

In [22]:
product_summary.head()

Unnamed: 0,customerID,product_collection
0,0,"[7239, 5039, 5373]"
1,1,"[5343, 4375, 4376, 3074, 11370, 2418, 12780, 6..."
2,2,"[3366, 7470, 8830, 2559, 1655, 1656, 4559, 217..."
3,3,"[6199, 3122, 2503, 5815, 14162, 4457, 32611, 7..."
4,4,"[5815, 31095, 37838, 5816, 37150, 37883, 34691..."


In [23]:
## how many recommendation to make
def take_top_(x):
    if x >= 20:
        return 1
    else:
        if x < 20:
            return int(np.round(20/x))
        
## get count of products per customer
product_summary['len_collection'] = product_summary['product_collection'].map(len)

## if a customer has 20 products in the list, we'll take 1 top most product per each product
product_summary['take_top'] = product_summary['len_collection'].map(lambda x: take_top_(x))

In [24]:
product_summary.head()

Unnamed: 0,customerID,product_collection,len_collection,take_top
0,0,"[7239, 5039, 5373]",3,7
1,1,"[5343, 4375, 4376, 3074, 11370, 2418, 12780, 6...",20,1
2,2,"[3366, 7470, 8830, 2559, 1655, 1656, 4559, 217...",12,2
3,3,"[6199, 3122, 2503, 5815, 14162, 4457, 32611, 7...",20,1
4,4,"[5815, 31095, 37838, 5816, 37150, 37883, 34691...",16,1


In [25]:
## recommendation function
def recommend_affinity(user):
    
    products_ = product_summary[product_summary['customerID'] == user]['product_collection'].iloc[0] # returns list
    take_top_ = product_summary[product_summary['customerID'] == user]['take_top'].iloc[0]
    
    recs = []
    
    if take_top_ == 0:
        return recs
    
    ## here we get the list of products which were bought the maximum number of times along with a particular productr
    for tt in products_:
        s = np.squeeze(np.asarray(product_cooccurence_matrix[tt].todense())) ## list of products bought with counts` with that product
        ll = s.argsort()[-int(take_top_):][::-1]
        recs.append(list(ll))

    recs = list(np.vstack(recs).flatten('F'))
 
    if not recs:
        return recs
    
    return recs

In [None]:
## recommendation for customers
unique_customers = []

for i in sample['customerID']:
    if i not in misfit_customers:
        unique_customers.append(i)

unique_customers = lbl.fit_transform(unique_customers)

In [None]:
from collections import defaultdict
out_dict = defaultdict(list)

nulls = []

for user in tqdm(unique_customers): #[:50]): ## i took first 50 customers. 
    rec = recommend_affinity(user)
    if not rec:
        nulls.append(user)
    out_dict[user] = rec
#     if i% 10 == 0:
#         print(i)

  1%|▊                                                           | 354/25754 [02:39<3:37:48,  1.94it/s]

In [None]:
## get unique items per customer
from collections import defaultdict

out_dict_2 = defaultdict(list)

for k,v in out_dict.items():
    out_dict_2[k] = list(set(v))