In [2]:
# Mount Google Drive inside the Colab VM; the data paths used in this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Re-ranking multiplier per integer event-type code. With the `type_labels`
# mapping used in this notebook the codes are 0 = clicks, 1 = carts, 2 = orders.
# `type_weight_multipliers` is an alias of the very same dict object.
type_weight_multipliers = type_weight = {0: 1, 1: 6, 2: 3}

# Version tag embedded in the candidate parquet filenames written below
# (e.g. val_clicks{CSV_VER}.pqt).
CSV_VER = 4

# Version tag embedded in the co-visitation matrix filenames that are read.
VER = 5

# Candidate-list length caps used by the suggest_* functions and when slicing
# the most popular items.
# NOTE(review): the original note said these are "same to matrix version" --
# presumably they must be kept in sync with the matrix build; confirm.
clicks_th = carts_th = orders_th = 50

In [4]:
import pandas as pd, numpy as np
import pickle, glob, gc

from collections import Counter
import itertools

# Multiprocessing setup: N_CORES is the upper bound on the number of worker
# processes that df_parallelize_run starts.
import psutil
N_CORES = psutil.cpu_count()     # CPU count as reported by psutil
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

N Cores : 8


# Validation

In [5]:
# Integer code assigned to each event-type string found in the raw parquet files.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test(files):
    """Load every parquet chunk matching a glob pattern into one DataFrame.

    Parameters
    ----------
    files : str
        Glob pattern selecting the parquet chunk files.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in sorted-path order with a fresh 0..n-1
        index. The `ts` column is divided by 1000 and stored as int32
        (milliseconds to seconds, assuming the raw column is in ms), and the
        `type` column is mapped through `type_labels` and stored as int8.

    Raises
    ------
    FileNotFoundError
        If the pattern matches no files. Without this check the failure would
        surface later as an opaque "No objects to concatenate" error.
    """
    # glob order depends on the file system; sorting makes the row order (and
    # anything derived from it, such as value_counts tie-breaks) reproducible.
    chunk_files = sorted(glob.glob(files))
    if not chunk_files:
        raise FileNotFoundError(f'No parquet files match pattern: {files!r}')

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

# Load the chunks under otto-validation/test_parquet into a single frame; it is
# used further down to pick the most popular clicked / ordered items.
valid = load_test('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-validation/test_parquet/*')
print('Valid data has shape',valid.shape)

Valid data has shape (7683577, 4)


In [6]:
%%time

# Number of on-disk pieces the clicks and carts-orders matrices are read from
# (file suffixes _0 .. _{DISK_PIECES-1}).
DISK_PIECES = 4
# LOAD THREE CO-VISITATION MATRICES
def pqt_to_dict(df):
    """Turn an (aid_x, aid_y) pair table into a dict {aid_x: [aid_y, ...]}.

    Rows are grouped on `aid_x`, and each group's `aid_y` values are collected
    into a plain list in their row order.
    """
    neighbours_by_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return neighbours_by_aid.to_dict()

# Clicks matrix (validation build): start from piece 0, then merge the
# remaining pieces into the same dict.
top_20_clicks = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/val/top_20_clicks_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/val/top_20_clicks_v{VER}_{k}.pqt') ) )


# Carts-orders matrix. Note the naming mismatch: the variable is called
# top_20_* while the files on disk are named top_15_*.
top_20_buys = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/val/top_15_carts_orders_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/val/top_15_carts_orders_v{VER}_{k}.pqt') ) )
    
# Buy2buy matrix: only the _0 file is read (no loop over DISK_PIECES).
# NOTE(review): presumably this matrix was saved as a single piece -- confirm.
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/val/top_15_buy2buy_v{VER}_0.pqt') )

# Most frequent clicked (type 0) and ordered (type 2) aids in the `valid`
# frame, truncated to clicks_th / orders_th entries. The suggest_* functions
# use them as fallback fill.
top_clicks = valid.loc[valid['type']==0, 'aid'].value_counts().index.values[:clicks_th]
top_orders = valid.loc[valid['type']==2, 'aid'].value_counts().index.values[:orders_th]

print('Here are size of our 3 co-visitation matrices:')
# Printed in the order: clicks, buy2buy, carts-orders.
print( len( top_20_clicks ), len( top_20_buy2buy ), len( top_20_buys ) )

Here are size of our 3 co-visitation matrices:
1812132 1055146 1812132
CPU times: user 1min 47s, sys: 12.9 s, total: 2min
Wall time: 2min 27s


In [7]:
def df_parallelize_run(func, t_split):
    """Apply `func` to every element of `t_split` using a process pool.

    Parameters
    ----------
    func : callable
        Function applied to each work item inside the worker processes.
    t_split : sequence
        Work items; must support ``len()``.

    Returns
    -------
    list
        One result per item, in the same order as `t_split`. An empty input
        yields an empty list without starting any process.
    """
    n_items = len(t_split)
    if n_items == 0:
        # Pool(0) raises ValueError, and there is nothing to compute anyway.
        return []

    # Never start more workers than there are items. `N_CORES or 1` covers a
    # CPU count reported as None, and int() keeps numpy scalars away from Pool.
    num_cores = int(min(N_CORES or 1, n_items))

    # The context manager tears the workers down even when `func` raises, so a
    # failed run does not leave stray processes behind in the kernel.
    with Pool(num_cores) as pool:
        return pool.map(func, t_split)

In [8]:
%%time
# Number of pickle parts the per-session validation records are stored in.
PIECES = 5

# Read the parts in order and concatenate them into one flat list. Each element
# is later handed to suggest_clicks / suggest_buys, which read it as
# (session, aids, types).
valid_bysession_list = []
for PART in range(PIECES):
    part_path = f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-valid-test-list/valid_group_tolist_{PART}_1.pkl'
    with open(part_path, 'rb') as f:
        valid_bysession_list += pickle.load(f)
print(len(valid_bysession_list))

1801251
CPU times: user 12.9 s, sys: 728 ms, total: 13.6 s
Wall time: 17 s


In [9]:
def suggest_clicks(df):
    """Build the click candidate list for a single session.

    Parameters
    ----------
    df : sequence
        One per-session record: ``df[0]`` is the session id, ``df[1]`` the
        session's aids and ``df[2]`` the matching event-type codes.

    Returns
    -------
    tuple
        ``(session, candidates)``.

        * If the session has at least `clicks_th` distinct aids, the
          candidates are the session's own aids ranked by a position-based
          weight times the event-type multiplier (top `clicks_th`).
        * Otherwise the candidates are the session's distinct aids
          (last-seen first), followed by the most frequent neighbours from the
          clicks co-visitation dict, followed by entries of `top_clicks` until
          `clicks_th` items are reached.

    NOTE(review): the `top_clicks` fill is not de-duplicated against the items
    already chosen, so the returned list may contain repeats.
    """
    session, aids, types = df[0], df[1], df[2]

    # Distinct aids, ordered by last occurrence (reverse of the order in `aids`).
    unique_aids = list(dict.fromkeys(aids[::-1]))

    if len(unique_aids) >= clicks_th:
        # Long session: rank its own items; later positions get larger weights.
        position_weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        scores = Counter()
        for aid, weight, event_type in zip(aids, position_weights, types):
            scores[aid] += weight * type_weight_multipliers[event_type]
        return session, [aid for aid, _ in scores.most_common(clicks_th)]

    # Short session: gather co-visited neighbours of every item seen.
    covisited = []
    for aid in unique_aids:
        if aid in top_20_clicks:
            covisited.extend(top_20_clicks[aid])

    # Keep the most frequent neighbours that are not already in the session.
    ranked_new = [
        candidate
        for candidate, _ in Counter(covisited).most_common(clicks_th)
        if candidate not in unique_aids
    ]
    result = unique_aids + ranked_new[:clicks_th - len(unique_aids)]

    # Pad with the globally most clicked items up to clicks_th entries.
    n_missing = clicks_th - len(result)
    return session, result + list(top_clicks)[:n_missing]

In [10]:
%%time

# Run suggest_clicks over every validation session in the worker pool.
temp = df_parallelize_run(suggest_clicks, valid_bysession_list)
# Flatten to one row per (session, candidate): the index repeats the session id
# once per candidate and the single column 'item' holds the candidate aid.
val_clicks = pd.DataFrame(
    [aid for _, candidates in temp for aid in candidates],
    index=[session for session, candidates in temp for _ in candidates],
    columns=['item'],
)
val_clicks.to_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/clicks/val_clicks{CSV_VER}.pqt')

CPU times: user 1min 31s, sys: 11 s, total: 1min 42s
Wall time: 1min 56s


In [11]:
def suggest_buys(df):
    """Build the cart/order candidate list for a single session.

    Parameters
    ----------
    df : sequence
        One per-session record: ``df[0]`` is the session id, ``df[1]`` the
        session's aids and ``df[2]`` the matching event-type codes.

    Returns
    -------
    tuple
        ``(session, candidates)``.

        * If the session has at least `carts_th` distinct aids, the session's
          own aids are scored by position weight times event-type multiplier,
          every buy2buy neighbour of a carted/ordered aid gets +0.1 per
          occurrence, and the top `carts_th` scores are returned.
        * Otherwise the candidates are the session's distinct aids
          (last-seen first), followed by the most frequent neighbours from the
          carts-orders and buy2buy dicts, followed by entries of `top_orders`
          until `carts_th` items are reached.

    NOTE(review): the `top_orders` fill is not de-duplicated against the items
    already chosen, so the returned list may contain repeats.
    """
    session, aids, types = df[0], df[1], df[2]

    # Distinct aids, ordered by last occurrence (reverse of the order in `aids`).
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # Same ordering, restricted to events whose type code is 1 or 2.
    bought = [aids[i] for i in range(len(aids)) if types[i] in (1, 2)]
    unique_buys = list(dict.fromkeys(bought[::-1]))

    # Buy2buy neighbours of the carted/ordered items; needed by both branches.
    buy2buy_neighbours = []
    for aid in unique_buys:
        if aid in top_20_buy2buy:
            buy2buy_neighbours.extend(top_20_buy2buy[aid])

    if len(unique_aids) >= carts_th:
        # Long session: rank its own items; later positions get larger weights.
        position_weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        scores = Counter()
        for aid, weight, event_type in zip(aids, position_weights, types):
            scores[aid] += weight * type_weight_multipliers[event_type]
        # Small bonus for every buy2buy neighbour occurrence.
        for neighbour in buy2buy_neighbours:
            scores[neighbour] += 0.1
        return session, [aid for aid, _ in scores.most_common(carts_th)]

    # Short session: gather carts-orders neighbours of every item seen.
    cart_order_neighbours = []
    for aid in unique_aids:
        if aid in top_20_buys:
            cart_order_neighbours.extend(top_20_buys[aid])

    # Keep the most frequent neighbours that are not already in the session.
    neighbour_counts = Counter(cart_order_neighbours + buy2buy_neighbours)
    ranked_new = [
        candidate
        for candidate, _ in neighbour_counts.most_common(carts_th)
        if candidate not in unique_aids
    ]
    result = unique_aids + ranked_new[:carts_th - len(unique_aids)]

    # Pad with the globally most ordered items up to carts_th entries.
    n_missing = carts_th - len(result)
    return session, result + list(top_orders)[:n_missing]

In [12]:
%%time

# Run suggest_buys over every validation session in the worker pool.
temp = df_parallelize_run(suggest_buys, valid_bysession_list)
# Flatten to one row per (session, candidate): the index repeats the session id
# once per candidate and the single column 'item' holds the candidate aid.
val_carts = pd.DataFrame(
    [aid for _, candidates in temp for aid in candidates],
    index=[session for session, candidates in temp for _ in candidates],
    columns=['item'],
)
val_carts.to_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/carts/val_carts{CSV_VER}.pqt')

CPU times: user 1min 34s, sys: 11.2 s, total: 1min 45s
Wall time: 2min 12s


In [13]:
%%time

# `temp` is not recomputed here: when cells run in order it still holds the
# suggest_buys output produced for the carts file, so the orders candidates are
# the same lists written under a different name.
val_orders = pd.DataFrame(
    [aid for _, candidates in temp for aid in candidates],
    index=[session for session, candidates in temp for _ in candidates],
    columns=['item'],
)
val_orders.to_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/orders/val_orders{CSV_VER}.pqt')

CPU times: user 1min 1s, sys: 1.39 s, total: 1min 3s
Wall time: 1min 3s


In [14]:
# Drop the per-session prediction list now that all three validation files are
# written; assigning the gc.collect() result to `_` keeps the cell output empty.
del temp
_ = gc.collect()

In [15]:
# FREE MEMORY: drop every validation-stage object before the test stage rebinds
# the same names (top_20_clicks, top_20_buys, top_20_buy2buy, top_clicks,
# top_orders) from the test files.
del valid_bysession_list, val_clicks, val_carts,val_orders
del top_20_clicks, top_20_buy2buy, top_20_buys, top_clicks, top_orders, valid
_ = gc.collect()

# Test

In [16]:
# Load the chunks under original/test_parquet into a single frame; it is used
# further down to pick the most popular clicked / ordered items for the test run.
test = load_test('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/original/test_parquet/*')
print('Test data has shape',test.shape)

Test data has shape (6928123, 4)


In [17]:
%%time

# Test-stage matrices. This cell relies on pqt_to_dict and DISK_PIECES defined
# in the validation section and rebinds the same global names, which the
# suggest_* functions read.
# Clicks matrix: start from piece 0, then merge the remaining pieces.
top_20_clicks = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/test/top_20_clicks_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/test/top_20_clicks_v{VER}_{k}.pqt') ) )

# Carts-orders matrix. Note the naming mismatch: the variable is called
# top_20_* while the files on disk are named top_15_*.
top_20_buys = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/test/top_15_carts_orders_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/test/top_15_carts_orders_v{VER}_{k}.pqt') ) )
    
# Buy2buy matrix: only the _0 file is read (no loop over DISK_PIECES).
# NOTE(review): presumably this matrix was saved as a single piece -- confirm.
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/co-visitation matrix/test/top_15_buy2buy_v{VER}_0.pqt') )

# Most frequent clicked (type 0) and ordered (type 2) aids in the `test` frame,
# truncated to clicks_th / orders_th entries; used as fallback fill.
top_clicks = test.loc[test['type']==0, 'aid'].value_counts().index.values[:clicks_th]
top_orders = test.loc[test['type']==2, 'aid'].value_counts().index.values[:orders_th]

print('Here are size of our 3 co-visitation matrices:')
# Printed in the order: clicks, buy2buy, carts-orders.
print( len( top_20_clicks ), len( top_20_buy2buy ), len( top_20_buys ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166
CPU times: user 1min 52s, sys: 12 s, total: 2min 4s
Wall time: 2min 29s


In [18]:
%%time
# Number of pickle parts the per-session test records are stored in.
PIECES = 5

# Read the parts in order and concatenate them into one flat list. Each element
# is later handed to suggest_clicks / suggest_buys, which read it as
# (session, aids, types).
test_bysession_list = []
for PART in range(PIECES):
    part_path = f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-valid-test-list/test_group_tolist_{PART}_1.pkl'
    with open(part_path, 'rb') as f:
        test_bysession_list += pickle.load(f)
print(len(test_bysession_list))

1671803
CPU times: user 13.2 s, sys: 629 ms, total: 13.8 s
Wall time: 17.4 s


In [19]:
%%time

# Run suggest_clicks over every test session in the worker pool.
temp = df_parallelize_run(suggest_clicks, test_bysession_list)
# Flatten to one row per (session, candidate): the index repeats the session id
# once per candidate and the single column 'item' holds the candidate aid.
test_clicks = pd.DataFrame(
    [aid for _, candidates in temp for aid in candidates],
    index=[session for session, candidates in temp for _ in candidates],
    columns=['item'],
)
test_clicks.to_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/clicks/test_clicks{CSV_VER}.pqt')

CPU times: user 1min 27s, sys: 7.34 s, total: 1min 34s
Wall time: 1min 51s


In [20]:
%%time

# Run suggest_buys over every test session in the worker pool.
temp = df_parallelize_run(suggest_buys, test_bysession_list)
# Flatten to one row per (session, candidate): the index repeats the session id
# once per candidate and the single column 'item' holds the candidate aid.
test_carts = pd.DataFrame(
    [aid for _, candidates in temp for aid in candidates],
    index=[session for session, candidates in temp for _ in candidates],
    columns=['item'],
)
test_carts.to_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/carts/test_carts{CSV_VER}.pqt')

CPU times: user 1min 28s, sys: 8.74 s, total: 1min 36s
Wall time: 1min 49s


In [21]:
%%time

# `temp` is not recomputed here: when cells run in order it still holds the
# suggest_buys output produced for the carts file, so the orders candidates are
# the same lists written under a different name.
test_orders = pd.DataFrame(
    [aid for _, candidates in temp for aid in candidates],
    index=[session for session, candidates in temp for _ in candidates],
    columns=['item'],
)
test_orders.to_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/orders/test_orders{CSV_VER}.pqt')

CPU times: user 55.8 s, sys: 699 ms, total: 56.5 s
Wall time: 56.6 s
