In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/otto-recommender-system/sample_submission.csv
/kaggle/input/otto-recommender-system/test.jsonl
/kaggle/input/otto-recommender-system/train.jsonl


In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [3]:
chunksize = 150000
num_rows = 0

with open('/kaggle/input/otto-recommender-system/train.jsonl') as f:
    num_rows = sum(1 for line in f)
    print(f'num_rows: {num_rows}')
    print(f'number of chunks: {np.ceil(num_rows/chunksize)}')

num_rows: 12899779
number of chunks: 86.0


In [4]:
chunks = pd.read_json('/kaggle/input/otto-recommender-system/train.jsonl',lines=True,chunksize=chunksize)

In [5]:
for i,chunk in enumerate(chunks):
    if i < 86:
        event_dict = {'session':[],'aid':[],'ts':[],'type':[]}
        
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_dict['session'].append(session)
                event_dict['aid'].append(event['aid'])
                event_dict['ts'].append(event['ts'])
                event_dict['type'].append(event['type'])
        
        train = pd.DataFrame(event_dict)
    else:
        break

train.reset_index(drop=True,inplace=True)

In [6]:
train

Unnamed: 0,session,aid,ts,type
0,12750000,485849,1661691789284,clicks
1,12750000,1772818,1661691895472,clicks
2,12750000,485849,1661691957656,clicks
3,12750001,593230,1661691789526,clicks
4,12750001,1583419,1661691981789,clicks
...,...,...,...,...
1097136,12899776,1737908,1661723987073,clicks
1097137,12899777,384045,1661723976974,clicks
1097138,12899777,384045,1661723986800,clicks
1097139,12899778,561560,1661723983611,clicks


In [7]:
train['minutes'] = train[['session','ts']].groupby('session').diff(-1) * (-1/1000/60)

In [8]:
train

Unnamed: 0,session,aid,ts,type,minutes
0,12750000,485849,1661691789284,clicks,1.769800
1,12750000,1772818,1661691895472,clicks,1.036400
2,12750000,485849,1661691957656,clicks,
3,12750001,593230,1661691789526,clicks,3.204383
4,12750001,1583419,1661691981789,clicks,
...,...,...,...,...,...
1097136,12899776,1737908,1661723987073,clicks,
1097137,12899777,384045,1661723976974,clicks,0.163767
1097138,12899777,384045,1661723986800,clicks,
1097139,12899778,561560,1661723983611,clicks,0.188750


In [9]:
temp = train.groupby(['type','aid'])['session'].agg('count').reset_index()
temp.columns = ['type','aid','count']

In [10]:
temp

Unnamed: 0,type,aid,count
0,carts,3,9
1,carts,17,1
2,carts,133,1
3,carts,204,3
4,carts,220,1
...,...,...,...
364089,orders,1855206,1
364090,orders,1855280,1
364091,orders,1855339,1
364092,orders,1855547,1


In [11]:
order_df = temp.loc[temp.type == 'orders',:]
order_df = order_df.sort_values(['count'],ascending=False).reset_index()

In [12]:
order_df

Unnamed: 0,index,type,aid,count
0,342299,orders,122983,31
1,358896,orders,1445562,21
2,359095,orders,1462420,17
3,359075,orders,1460752,17
4,359072,orders,1460571,17
...,...,...,...,...
23282,349439,orders,699033,1
23283,349438,orders,699021,1
23284,349436,orders,698928,1
23285,349434,orders,698825,1


In [13]:
order_df.aid = ' ' + order_df.aid.astype('str')
best_sold_list = order_df[:20].aid.sum()

In [14]:
best_sold_list

' 122983 1445562 1462420 1460752 1460571 1006198 1531805 1639484 1043508 332654 1125638 162064 1520039 1281615 409620 1534690 1257293 752652 1689044 876129'

In [15]:
test_df = pd.DataFrame()

chunks_test = pd.read_json('/kaggle/input/otto-recommender-system/test.jsonl',lines=True,chunksize=chunksize)

for i,chunk in enumerate(chunks_test):
    if i < 100:
        event_dict = {'session':[],'aid':[],'ts':[],'type':[]}

        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_dict['session'].append(session)
                event_dict['aid'].append(event['aid'])
                event_dict['ts'].append(event['ts'])
                event_dict['type'].append(event['type'])

        chunk_session = pd.DataFrame(event_dict)
        test_df = pd.concat([test_df,chunk_session])
    else:
        break
    
test_df = test_df.reset_index(drop=True)

In [16]:
test_df

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000278,clicks
1,12899780,1142000,1661724000378,clicks
2,12899780,582732,1661724058352,clicks
3,12899780,973453,1661724109199,clicks
4,12899780,736515,1661724136868,clicks
...,...,...,...,...
6928118,14571577,1141710,1662328774770,clicks
6928119,14571578,519105,1662328775009,clicks
6928120,14571579,739876,1662328775605,clicks
6928121,14571580,202353,1662328781067,clicks


In [17]:
test_df['minutes'] = test_df[['session','ts']].groupby('session').diff(-1) * (-1/1000/60)
test_df = test_df.sort_values(by='minutes',ascending=False)

In [18]:
test_action_df = test_df.assign(aid=test_df['aid'].astype(str).radd(' ')).groupby(['session','type'])['aid'].sum().reset_index()

In [19]:
test_action_df

Unnamed: 0,session,type,aid
0,12899779,clicks,59625
1,12899780,clicks,1142000 582732 973453 736515 1142000
2,12899781,carts,199008
3,12899781,clicks,199008 194067 199008 199008 199008 199008 573...
4,12899782,carts,1494780 834354 975116 127404 413962 595994 13...
...,...,...,...
1948868,14571577,clicks,1141710
1948869,14571578,clicks,519105
1948870,14571579,clicks,739876
1948871,14571580,clicks,202353


In [20]:
next_orders_df = pd.DataFrame(test_action_df.loc[(test_action_df.type == 'carts'),:])
next_orders_df['type'] = 'orders'

In [21]:
next_orders_df

Unnamed: 0,session,type,aid
2,12899781,orders,199008
4,12899782,orders,1494780 834354 975116 127404 413962 595994 13...
10,12899786,orders,955252
12,12899787,orders,1682750 1682750 1682750
16,12899790,orders,1830166 1219653
...,...,...,...
1948716,14571430,orders,903014
1948730,14571443,orders,942326
1948774,14571486,orders,350578
1948788,14571499,orders,1132907


In [22]:
next_carts_df = pd.DataFrame(test_action_df.loc[(test_action_df.type == 'clicks'),:])
next_carts_df['type'] = 'carts'

In [23]:
next_carts_df

Unnamed: 0,session,type,aid
0,12899779,carts,59625
1,12899780,carts,1142000 582732 973453 736515 1142000
3,12899781,carts,199008 194067 199008 199008 199008 199008 573...
5,12899782,carts,603159 779477 1299062 602722 413962 975116 16...
7,12899783,carts,607638 1729553 255297 300127 1754419 1216820 ...
...,...,...,...
1948868,14571577,carts,1141710
1948869,14571578,carts,519105
1948870,14571579,carts,739876
1948871,14571580,carts,202353


In [24]:
next_clicks_df = pd.DataFrame(test_action_df.loc[(test_action_df.type == 'clicks'),:]).copy()

In [25]:
next_clicks_df

Unnamed: 0,session,type,aid
0,12899779,clicks,59625
1,12899780,clicks,1142000 582732 973453 736515 1142000
3,12899781,clicks,199008 194067 199008 199008 199008 199008 573...
5,12899782,clicks,603159 779477 1299062 602722 413962 975116 16...
7,12899783,clicks,607638 1729553 255297 300127 1754419 1216820 ...
...,...,...,...
1948868,14571577,clicks,1141710
1948869,14571578,clicks,519105
1948870,14571579,clicks,739876
1948871,14571580,clicks,202353


In [26]:
next_orders_df = pd.merge(next_orders_df,next_clicks_df[['session','aid']],on='session',how='left')

In [27]:
next_orders_df['aid'] = next_orders_df['aid_x'] + next_orders_df['aid_y']
next_orders_df = next_orders_df.drop(['aid_x','aid_y'],axis=1)

In [28]:
recommend_df = pd.concat([next_carts_df,next_clicks_df,next_orders_df],axis=0)
recommend_df['session_type'] = recommend_df['session'].astype('str') + '_' + recommend_df['type'].astype('str')
recommend_df['aid'] = recommend_df['aid'] + best_sold_list
recommend_df['aid'].fillna(best_sold_list,inplace=True)
recommend_df['aid'] = recommend_df['aid'].str.strip()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  recommend_df['aid'].fillna(best_sold_list,inplace=True)


In [29]:
recommend_df

Unnamed: 0,session,type,aid,session_type
0,12899779,carts,59625 122983 1445562 1462420 1460752 1460571 1...,12899779_carts
1,12899780,carts,1142000 582732 973453 736515 1142000 122983 14...,12899780_carts
3,12899781,carts,199008 194067 199008 199008 199008 199008 5731...,12899781_carts
5,12899782,carts,603159 779477 1299062 602722 413962 975116 166...,12899782_carts
7,12899783,carts,607638 1729553 255297 300127 1754419 1216820 3...,12899783_carts
...,...,...,...,...
242828,14571430,orders,903014 903014 1162324 122983 1445562 1462420 1...,14571430_orders
242829,14571443,orders,942326 1407032 942326 568535 122983 1445562 14...,14571443_orders
242830,14571486,orders,350578 350578 350578 122983 1445562 1462420 14...,14571486_orders
242831,14571499,orders,1132907 1132907 122983 1445562 1462420 1460752...,14571499_orders


In [30]:
final_df = recommend_df[['session_type','aid']]
final_df = final_df.rename(columns={'aid':'labels'})

In [31]:
final_df

Unnamed: 0,session_type,labels
0,12899779_carts,59625 122983 1445562 1462420 1460752 1460571 1...
1,12899780_carts,1142000 582732 973453 736515 1142000 122983 14...
3,12899781_carts,199008 194067 199008 199008 199008 199008 5731...
5,12899782_carts,603159 779477 1299062 602722 413962 975116 166...
7,12899783_carts,607638 1729553 255297 300127 1754419 1216820 3...
...,...,...
242828,14571430_orders,903014 903014 1162324 122983 1445562 1462420 1...
242829,14571443_orders,942326 1407032 942326 568535 122983 1445562 14...
242830,14571486_orders,350578 350578 350578 122983 1445562 1462420 14...
242831,14571499_orders,1132907 1132907 122983 1445562 1462420 1460752...


In [32]:
final_df.to_csv('submission.csv',index=False)