In [None]:
import numpy as np
import pandas as pd
import time
import pickle

from collections import Counter

In [None]:
# Load the raw transaction log; the path is relative to the notebook's
# working directory.
train = pd.read_csv(
    "h-and-m-personalized-fashion-recommendations/transactions_train.csv"
)

In [None]:
# Shrink the transaction frame to three compactly typed columns.
# customer_id: keep the last 16 hex characters and parse them as a base-16
# integer, then store the result as int64.
# NOTE(review): 16 hex digits can encode values above the signed 64-bit
# range -- confirm the int64 cast behaves as intended for such ids.
train['customer_id'] = (
    train['customer_id']
    .str[-16:]
    .apply(lambda hex_tail: int(hex_tail, 16))
    .astype('int64')
)
train['article_id'] = train['article_id'].astype('int32')
train['t_dat'] = pd.to_datetime(train['t_dat'])
# Every other column is dropped here.
train = train[['t_dat', 'customer_id', 'article_id']]

In [None]:
# Prototype on a 10% random sample of the transactions.
# The fixed random_state makes the sample -- and every result derived from
# it -- reproducible across kernel restarts.
sub_train = train.sample(frac=0.1, random_state=42)

In [None]:
# One row per customer: all article ids from the sampled transactions,
# collected into a Python list in the 'agg_articles' column.
agg_articles = (
    sub_train
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index(name='agg_articles')
)

In [None]:
# Co-purchase counts for the sampled customers.
# cnt[a][b] is the number of (a, b) position pairs found in the per-customer
# article lists; every pair is recorded in both directions. An article that
# appears twice in one list pairs with itself and adds 2 to cnt[a][a].
cnt = Counter()

# Iterate the list column directly instead of DataFrame.iterrows(), which
# builds a throwaway Series for every row just to read this one field.
for basket in agg_articles['agg_articles']:
    for pos, a1 in enumerate(basket):
        # Pair a1 with every article that comes after it in the list.
        for a2 in basket[pos + 1:]:
            # Inner counters are created lazily, only once an article takes
            # part in at least one pair.
            if a1 not in cnt:
                cnt[a1] = Counter()
            if a2 not in cnt:
                cnt[a2] = Counter()

            cnt[a1][a2] += 1
            cnt[a2][a1] += 1

In [None]:
# For every article, keep its 7 most frequent co-purchased articles as
# (article_id, count) tuples, highest count first. The sort is stable, so
# ties keep the order in which they appear in the counter.
most_common_purchase = {
    article: sorted(partners.items(), key=lambda pair: pair[1], reverse=True)[:7]
    for article, partners in cnt.items()
}

most_common_purchase

In the cells below, we compute, for each article, the items that are most frequently purchased together with it, using the full set of transactions.

In [None]:
# One row per customer over the full transaction set: all purchased article
# ids collected into a Python list in the 'agg_articles' column.
agg_total = (
    train
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index(name='agg_articles')
)

In [None]:
# Rebuild the co-purchase counts from every customer in agg_total.
# cnt[a][b] is the number of (a, b) position pairs found in the per-customer
# article lists; every pair is recorded in both directions.
cnt = Counter()
start = time.time()

# Iterate the list column directly instead of DataFrame.iterrows(), which
# builds a throwaway Series for every row just to read this one field.
# agg_total comes from reset_index(), so the enumerate() position equals the
# row label that iterrows() would have yielded.
for idx, basket in enumerate(agg_total['agg_articles']):
    for pos, a1 in enumerate(basket):
        # Pair a1 with every article that comes after it in the list.
        for a2 in basket[pos + 1:]:
            if a1 not in cnt:
                cnt[a1] = Counter()
            if a2 not in cnt:
                cnt[a2] = Counter()

            cnt[a1][a2] += 1
            cnt[a2][a1] += 1

    # Progress report every 10000 customers; the timer restarts after each
    # report, so "Time passed" is the duration of the latest batch only.
    if idx % 10000 == 0:
        end = time.time()
        print(f"Row {idx} processed")
        print(f"Time passed {end - start}")
        print(f"Current size of the cnt {len(cnt)}")
        start = time.time()

In [36]:
# Persist the self-filtered co-purchase table to disk.
# `pickle` is already imported in the notebook's first cell, so it is not
# re-imported here.
# NOTE(review): `most_common_purchase_new` is assigned in a cell further down
# (run as In [34], before this In [36] cell). On a fresh top-to-bottom run
# this cell raises NameError -- it should be moved below that cell.
with open('most_common_purchase.pkl', 'wb') as file:
    pickle.dump(most_common_purchase_new, file)

In [None]:
# For every article, keep its 3 most frequent co-purchased articles as
# (article_id, count) tuples, highest count first. The sort is stable, so
# ties keep the order in which they appear in the counter.
most_common_purchase = {
    article: sorted(partners.items(), key=lambda pair: pair[1], reverse=True)[:3]
    for article, partners in cnt.items()
}

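In the cells below, we take a 10% sample of the transactions and keep, for each customer, only the purchases made during their most recent week of activity (at most 6 days before their latest purchase).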

In [None]:
# From a 10% sample of the transactions, keep only the rows that fall within
# each customer's final week of activity in that sample.
# The fixed random_state makes the sample reproducible across kernel restarts.
sub_train = train.sample(frac=0.1, random_state=42)

# Latest sampled purchase date per customer, joined back onto every row.
tmp = sub_train.groupby('customer_id').t_dat.max().reset_index()
tmp.columns = ['customer_id', 'max_dat']
sub_train = sub_train.merge(tmp, on=['customer_id'], how='left')

# Whole days between each purchase and that customer's latest purchase.
sub_train['diff_dat'] = (sub_train.max_dat - sub_train.t_dat).dt.days

# Keep purchases at most 6 days before the latest one. The explicit .copy()
# yields an independent frame, so adding columns to sub_train later does not
# trigger pandas' SettingWithCopyWarning.
sub_train = sub_train.loc[sub_train['diff_dat'] <= 6].copy()

In [None]:
# Read the transactions file again, this time into an untouched frame with
# all original columns.
df = pd.read_csv(
    'h-and-m-personalized-fashion-recommendations/transactions_train.csv'
)

In [None]:
# Number of transaction rows per article id (value_counts orders the result
# from most to least frequent).
articles = df['article_id'].value_counts()

In [None]:
# Drop the counts: map each article to just the ids of its top co-purchased
# articles, in the same order as in most_common_purchase.
most_common_purchase_items = {
    article: [partner for partner, _ in pairs]
    for article, pairs in most_common_purchase.items()
}

In [None]:
# Attach to every row the list of articles most often bought together with
# that row's article; articles absent from the mapping get NaN.
sub_train['most_common_articles'] = sub_train['article_id'].map(
    most_common_purchase_items
)

In [34]:
# Remove self-pairs: an article must not be listed as its own co-purchase.
# The remaining (article_id, count) tuples keep their original order.
most_common_purchase_new = {
    article: [pair for pair in pairs if pair[0] != article]
    for article, pairs in most_common_purchase.items()
}

In [35]:
# Display the self-filtered co-purchase mapping as this cell's output.
most_common_purchase_new

{657510001: [(706016001, 398),
  (706016002, 220),
  (677711003, 208),
  (372860001, 190),
  (448509014, 182),
  (156231001, 181)],
 539723001: [(706016001, 7638),
  (539723005, 2661),
  (562245050, 2197),
  (706016002, 2163),
  (399223001, 1320),
  (539723003, 1263)],
 739373004: [(706016001, 65),
  (739373001, 50),
  (759871002, 45),
  (399223001, 44),
  (706016002, 43),
  (372860001, 41)],
 806012001: [(773955001, 328),
  (706016001, 277),
  (610776002, 178),
  (562245046, 161),
  (759871002, 155),
  (706016002, 152)],
 706016001: [(706016002, 26421),
  (706016003, 16038),
  (706016015, 12531),
  (706016006, 10746),
  (399223001, 9125),
  (554450001, 7914)],
 568597006: [(568601006, 4845),
  (695544001, 4037),
  (568597007, 1766),
  (751471001, 1635),
  (573716012, 1163),
  (524825011, 1134)],
 849493006: [(849493007, 90),
  (877224007, 76),
  (855793005, 53),
  (851802001, 48),
  (751471001, 47),
  (608945001, 44)],
 873657001: [(706016001, 26),
  (111586001, 25),
  (759871002, 24)