In [None]:
## load libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sps
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

In [None]:
# load product and sample file
products = pd.read_csv('products.csv')

In [None]:
sample = pd.read_csv('sampleSubmission.csv')

In [None]:
sample.head()

In [7]:
'BBID_20410043' in sample['customerID']

False

In [8]:

## number of unique customers
products['customerID'].nunique()

165055

In [9]:
products['transactionDate'] = pd.to_datetime(products['transactionDate'])
mask = (products['transactionDate'] >= '2016-01-01')
products = products.loc[mask]

In [10]:

products.sort_values('transactionDate',inplace=True)
products = products.reset_index(drop=True)

In [11]:
## take only those customers which are in sample submission file
products_2 = products[products['customerID'].isin(sample['customerID'])]

## remove missing values # 4
products_2 = products_2[~pd.isnull(products_2['product_code'])]

## convert type of product code
products_2['product_code'] = products_2['product_code'].astype(np.int64)

In [12]:

products_2 = products_2.loc[:,['customerID','product_code']]
products_2 = products_2.reset_index(drop=True)

In [13]:
## these customers are not in train, so we'll predict None for them at last
misfit_customers = list(set(sample['customerID']) - set(products_2['customerID']))

In [14]:
## create product list by customers
products_2 = products_2.groupby('customerID')['product_code'].apply(lambda x: x.tolist()).reset_index()

In [15]:
## remove duplicate products
products_2['product_code'] = products_2['product_code'].map(lambda x: list(set(x)))

In [16]:
## fix product max len to 20 (we'll pick the last 20 i.e most recent ones)
products_2['product_code'] = products_2['product_code'].map(lambda x: x[-20:])

In [17]:
### create a list of customers & products

customerIDs = []
product_codes = []

for index, row in products_2.iterrows():
    #if index % 10 == 0:
    #    print (index)
    ls_len = len(row['product_code'])
    customerIDs.extend(np.repeat(row['customerID'], ls_len))
    product_codes.extend(row['product_code'])

In [18]:

## encode values 

from sklearn.preprocessing import LabelEncoder

lbl = LabelEncoder() ## for customers
customerIDs = lbl.fit_transform(customerIDs)
 
lbl2 = LabelEncoder() ## for products
product_codes = lbl2.fit_transform(product_codes)

In [19]:

n_unique_users = len(set(customerIDs))
n_unique_products = len(set(product_codes))

row = customerIDs
col = product_codes

vals = np.repeat(1, len(row))

## this matrix has information about a user bought which all products.
user_product_matrix = sps.csr_matrix((vals, (row, col)), shape=(n_unique_users, n_unique_products))

## this matrix has information a product got bought along with which other products, and how many times
product_cooccurence_matrix = (user_product_matrix.T * user_product_matrix)

## set diagonal equals to zero since we are not interested in knowing the count of a product with itself
product_cooccurence_matrix.setdiag(np.repeat(0, product_cooccurence_matrix.shape[0]))

In [20]:

## create a data frame of encoded values
product_summary = pd.DataFrame({'customerID':customerIDs, 'product_code':product_codes})
product_summary = product_summary.groupby('customerID')['product_code'].agg(lambda x:x.tolist()).reset_index().rename(columns = {0:'product_collection'})

In [21]:

## how many recommendation to make
def take_top_(x):
    if x >= 20:
        return 1
    else:
        if x < 20:
            return int(np.round(20/x))
        
## get count of products per customer
product_summary['len_collection'] = product_summary['product_collection'].map(len)

## if a customer has 20 products in the list, we'll take 1 top most product per each product
product_summary['take_top'] = product_summary['len_collection'].map(lambda x: take_top_(x))

In [22]:
## recommendation function
def recommend_affinity(user):
    
    products_ = product_summary[product_summary['customerID'] == user]['product_collection'].iloc[0] # returns list
    take_top_ = product_summary[product_summary['customerID'] == user]['take_top'].iloc[0]
    
    recs = []
    
    if take_top_ == 0:
        return recs
    
    ## here we get the list of products which were bought the maximum number of times along with a particular productr
    for tt in products_:
        s = np.squeeze(np.asarray(product_cooccurence_matrix[tt].todense())) ## list of products bought with counts` with that product
        ll = s.argsort()[-int(take_top_):][::-1]
        recs.append(list(ll))

    recs = list(np.vstack(recs).flatten('F'))
 
    if not recs:
        return recs
    
    return recs

In [None]:

## recommendation for customers
unique_customers = []

for i in sample['customerID']:
    if i not in misfit_customers:
        unique_customers.append(i)

unique_customers = lbl.fit_transform(unique_customers)

In [None]:

from collections import defaultdict
out_dict = defaultdict(list)

nulls = []

for user in tqdm(unique_customers): #[:50]): ## i took first 50 customers. 
    rec = recommend_affinity(user)
    if not rec:
        nulls.append(user)
    out_dict[user] = rec
#     if i% 10 == 0:
#         print(i)

 16%|███████████▌                                                             | 4446/28192 [1:27:25<7:46:57,  1.18s/it]

In [24]:
## get unique items per customer
from collections import defaultdict

out_dict_2 = defaultdict(list)

for k,v in out_dict.items():
    out_dict_2[k] = list(set(v))


In [25]:
## convert key values to inverse customer codes

customer_codes = list(lbl.inverse_transform(customerIDs))
customer_maps = dict(zip(list(customerIDs), list(customer_codes)))

In [26]:

out_dict_3 = defaultdict(list)

for k,v in out_dict_2.items():
    out_dict_3[customer_maps[k]] = out_dict_2[k]

In [27]:
mis_dict = defaultdict(list)

for i in misfit_customers:
    mis_dict[i] = list(np.repeat('None', 20))
    

In [28]:
out_dict_3.update(mis_dict)

In [29]:
submission = pd.DataFrame(list(out_dict_3.items()), columns=['customerID','products'])

In [30]:
submission.head()

Unnamed: 0,customerID,products
0,BBID_204107517,"[6081, 14787, 40198, 438, 5815, 5816, 4184]"
1,BBID_20470035,"[6081, 2406, 2408, 2731, 5816, 5815, 2424, 2104]"
2,BBID_211419396,"[6081, 13444, 2120, 5450, 13965, 5816, 5815, 2..."
3,BBID_211411290,"[1504, 6081, 353, 2690, 2424, 13215, 11434, 13..."
4,BBID_20450592,"[None, None, None, None, None, None, None, Non..."


In [31]:
product_codes_inv = list(lbl2.inverse_transform(product_codes))
product_maps = dict(zip(list(product_codes), list(product_codes_inv)))

In [32]:
submission['products'] = submission['products'].map(lambda x: [product_maps.get(s) for s in x])

In [33]:
for index, row in submission.iterrows():
    max_len = 20
    if len(row['products']) < max_len:
        size = len(row['products'])
        diff_ = max_len - size
        a = row['products']
        a.extend(np.repeat('None', diff_))
        submission.loc[index, 'products'] = a
    else:
        submission.loc[index, 'products'] = row['products'][:20]

In [34]:
submission['products'] = submission['products'].map(lambda x: ','.join(str(e) for e in x))

In [35]:
submission.head()

Unnamed: 0,customerID,products
0,BBID_204107517,"300840018,1000681171,1000697272004,108001127,3..."
1,BBID_20470035,"300840018,108037499,108037501,108100362,300776..."
2,BBID_211419396,"300840018,1000588265,108030315,300676075,10006..."
3,BBID_211411290,"108020718,300840018,108000707,108100296,108037..."
4,BBID_20450592,"None,None,None,None,None,None,None,None,None,N..."


In [36]:
submission.to_csv('sub_0001.csv', index=False)