In [6]:
import pandas as pd, numpy as np

import glob


import warnings
warnings.filterwarnings("ignore")

from datetime import datetime

from tqdm import tqdm

type_labels = {'clicks':1, 'carts':2, 'orders':3}

In [13]:
def load(which):    
    dfs = []

    test_files = glob.glob('C:/Users/mrun7/Downloads/kaggle/'+which+'_parquet/*')
    
    for e, chunk_file in enumerate(test_files):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
        
    return pd.concat(dfs).reset_index(drop=True)

In [14]:
test_df = load('test')
train_df = load('train')

In [15]:
train_df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,1
1,0,1563459,1659304904,1
2,0,1309446,1659367439,1
3,0,16246,1659367719,1
4,0,1781822,1659367871,1
...,...,...,...,...
216716091,12899776,1737908,1661723987,1
216716092,12899777,384045,1661723976,1
216716093,12899777,384045,1661723986,1
216716094,12899778,561560,1661723983,1


In [16]:
datetime.strptime("2022-08-25 00:00:00", "%Y-%m-%d %H:%M:%S").timestamp()

1661410800.0

In [17]:
sum(train_df['ts']>=1661385600)

31007748

In [18]:
train_df = train_df[train_df['ts']>=1661385600].reset_index(drop=True)

In [19]:
train_df['datetime'] = train_df['ts'].apply(lambda x: datetime.fromtimestamp(x))

In [20]:
train_df.shape

(31007748, 5)

In [21]:
train_df['datetime'].dt.day.unique()

array([25, 26, 27, 28, 24], dtype=int32)

In [22]:
train_df['day'] = train_df['datetime'].dt.day

In [23]:
df_val_offline = train_df[train_df['day']==28].reset_index(drop=True)
df_train_offline = train_df[train_df['day']<28].reset_index(drop=True)

In [24]:
aid_pop_sta = df_train_offline.groupby(['aid','type'],as_index=False)['session'].count()

In [25]:
aid_pop_sta.columns = ['aid','type','pop']

In [26]:
aid_pop_sta

Unnamed: 0,aid,type,pop
0,0,1,5
1,1,1,2
2,2,1,2
3,3,1,582
4,3,2,59
...,...,...,...
1845735,1855598,1,1
1845736,1855599,1,1
1845737,1855600,1,6
1845738,1855600,2,1


In [32]:
aid_pop_sta = aid_pop_sta.sort_values('pop', ascending=False).reset_index(drop=True)

In [33]:
aid_pop_sta

Unnamed: 0,aid,type,pop
0,876493,1,22747
1,171982,1,15032
2,1460571,1,14825
3,108125,1,13706
4,373490,1,11622
...,...,...,...
1845735,292803,1,1
1845736,292802,1,1
1845737,292798,1,1
1845738,1242437,1,1


In [34]:
top200_aid_click = list(aid_pop_sta.loc[aid_pop_sta['type']==1,'aid'][:200])
top200_aid_cart = list(aid_pop_sta.loc[aid_pop_sta['type']==2,'aid'][:200])
top200_aid_order = list(aid_pop_sta.loc[aid_pop_sta['type']==3,'aid'][:200])

In [35]:
target_users = list( df_val_offline['session'].unique() )

In [36]:
len(target_users)

1260204

In [37]:
session_his = df_train_offline.loc[df_train_offline['session'].isin(target_users)].groupby(['session','type'],as_index=False)['aid'].agg(list)
session_his

Unnamed: 0,session,type,aid
0,0,1,"[1436439, 102416, 190818, 1157411, 138431, 543..."
1,0,2,"[275288, 974651, 543308, 543308, 442293, 15496..."
2,0,3,"[1199474, 543308]"
3,1,1,[1464360]
4,13,1,[1670763]
...,...,...,...
743392,12648846,1,[645240]
743393,12648847,1,[1157604]
743394,12648848,1,[1100860]
743395,12648849,1,[360363]


In [38]:
session_his['session'].nunique()

534174

In [39]:
his_click_dict = dict(zip(session_his.loc[session_his['type']==1,'session'],session_his.loc[session_his['type']==1,'aid']))
his_cart_dict = dict(zip(session_his.loc[session_his['type']==2,'session'],session_his.loc[session_his['type']==2,'aid']))
his_order_dict = dict(zip(session_his.loc[session_his['type']==3,'session'],session_his.loc[session_his['type']==3,'aid']))

In [61]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix, csr_matrix

row, col = df_train_offline['session'].values, df_train_offline['aid'].values
data = np.ones(len(df_train_offline))

# Construct sparse matrix (COO format) and convert to CSR for efficient operations
interaction_matrix = coo_matrix((data, (row, col)))
sparse_matrix = interaction_matrix.tocsr()

# Step 2: Compute Cosine Similarity for Aids (Items)
item_similarity = cosine_similarity(sparse_matrix.T, dense_output=False)

# Map aid indices to IDs
unique_aids = np.unique(df_train_offline['aid'])
aid_to_index = {aid: idx for idx, aid in enumerate(unique_aids)}
index_to_aid = {idx: aid for aid, idx in aid_to_index.items()}

In [65]:
def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0

def collaborative_recommendations(aid_history, top_n=20):
    # Ensure dimensions match
    num_aids = item_similarity.shape[0]
    related_scores = np.zeros(num_aids)

    for aid in aid_history:
        if aid in aid_to_index:
            aid_idx = aid_to_index[aid]
            
            # Ensure aid_idx is within bounds of item_similarity
            if aid_idx < num_aids:
                # Access sparse similarity row
                similarity_row = item_similarity[aid_idx]
                # Use sparse addition
                related_scores += similarity_row.toarray().ravel()

    # Rank recommendations
    recommended_indices = np.argsort(-related_scores)[:top_n]
    return [index_to_aid[idx] for idx in recommended_indices if idx in index_to_aid]

# Generate recommendations with Jaccard similarity for orders
res = []

for each_session in tqdm(df_val_offline['session'].unique()):
    # Clicks
    if each_session in his_click_dict:
        his_click_tmp = set(his_click_dict[each_session])
        res.append([str(each_session) + '_clicks'] +
                   [' '.join([str(x) for x in top200_aid_click if x not in his_click_tmp][:20])])
    else:
        res.append([str(each_session) + '_clicks'] +
                   [' '.join([str(x) for x in top200_aid_click][:20])])
    
    # Carts
    if each_session in his_cart_dict:
        his_cart_tmp = set(his_cart_dict[each_session])
        res.append([str(each_session) + '_carts'] +
                   [' '.join([str(x) for x in top200_aid_cart if x not in his_cart_tmp][:20])])
    else:
        res.append([str(each_session) + '_carts'] +
                   [' '.join([str(x) for x in top200_aid_cart][:20])])
    
    # Orders with collaborative filtering and Jaccard similarity
    if each_session in his_order_dict:
        his_order_tmp = set(his_order_dict[each_session])
        jaccard_scores = [(aid, jaccard_similarity(his_order_tmp, {aid})) for aid in top200_aid_order]
        jaccard_sorted = [aid for aid, score in sorted(jaccard_scores, key=lambda x: x[1], reverse=True) if aid not in his_order_tmp]
        
        # Merge with collaborative filtering recommendations
        collaborative_rec = collaborative_recommendations(his_order_tmp, top_n=20)
        combined_recs = list(set(jaccard_sorted[:10] + collaborative_rec[:10]))
        res.append([str(each_session) + '_orders'] + [' '.join(map(str, combined_recs))])
    else:
        res.append([str(each_session) + '_orders'] +
                   [' '.join([str(x) for x in top200_aid_order][:20])])

100%|██████████| 1260204/1260204 [2:38:06<00:00, 132.84it/s]   


In [66]:
df_res = pd.DataFrame(res)
df_res

Unnamed: 0,0,1
0,0_clicks,876493 1460571 108125 373490 184976 1116095 15...
1,0_carts,876493 122983 166037 554660 1531805 373490 171...
2,0_orders,667791 166037 1089436 1531805 13344 923948 995...
3,1_clicks,876493 171982 1460571 108125 373490 184976 111...
4,1_carts,876493 122983 166037 554660 1531805 373490 171...
...,...,...
3780607,12899777_carts,876493 122983 166037 554660 1531805 373490 171...
3780608,12899777_orders,876493 122983 171982 373490 1531805 1445562 16...
3780609,12899778_clicks,876493 171982 1460571 108125 373490 184976 111...
3780610,12899778_carts,876493 122983 166037 554660 1531805 373490 171...


In [67]:
res_dict = dict(zip(df_res[0],df_res[1]))

In [68]:
session_items_val = df_val_offline.groupby(['session','type'],as_index=False)['aid'].agg(set)

In [69]:
session_items_val 

Unnamed: 0,session,type,aid
0,0,1,"{843110, 543308, 1228848, 161938, 219925, 9380..."
1,1,1,"{207905, 497868, 376932, 1628317}"
2,2,1,"{808782, 690631, 485582, 1605711, 526287, 4653..."
3,2,2,{161269}
4,13,1,{22195}
...,...,...,...
1604121,12899774,1,"{33035, 1399483}"
1604122,12899775,1,"{1760714, 1743151}"
1604123,12899776,1,"{1737908, 548599}"
1604124,12899777,1,{384045}


In [70]:
def getScore(df, type_, suffix):
    df_tmp = df[df['type']==type_].reset_index(drop=True)
    
    score_Numerator = 0
    score_Denominator = 0

    for i,row in tqdm( df_tmp.iterrows(),total=len(df_tmp) ):
        recom_tmp = [int(x) for x in res_dict[str(row['session'])+suffix].split(' ')]
        score_Numerator += len(row['aid'] & set(recom_tmp))
        score_Denominator += min(20,len(row['aid']))
        
    return score_Numerator/score_Denominator

In [71]:
score_click = getScore(session_items_val,1,'_clicks')

100%|██████████| 1257089/1257089 [00:43<00:00, 28623.93it/s]


In [72]:
score_cart = getScore(session_items_val,2,'_carts')

100%|██████████| 253412/253412 [00:08<00:00, 28898.11it/s]


In [73]:
score_order = getScore(session_items_val,3,'_orders')

100%|██████████| 93625/93625 [00:03<00:00, 27154.36it/s]


In [74]:
score_click * 0.1 + score_cart * 0.3 + score_order * 0.6

0.00976382719440361