In [1]:
# Shell escape that asks pip to install the `cudf` package into the current environment.
# NOTE(review): the install is unpinned; consider pinning a version and using `%pip`
# so the package lands in the kernel's own environment — confirm against the target runtime.
!pip install cudf



In [2]:
import numpy as np
import pandas as pd 
from datetime import datetime, timedelta
import gc

import cudf

In [3]:
# Number of articles kept for the general (fallback) prediction list; passed to
# `nlargest(N)` when the top-scoring articles are selected further down.
N = 12

In [4]:
# Read only the three columns used by this notebook from the transactions file.
df = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'article_id': 'int32', 't_dat': 'string', 'customer_id': 'string'},
)

# Keep the last 16 characters of each customer id string and store them as int64
# (parsed as hexadecimal by `hex_to_int`) instead of the full string.
df['customer_id'] = (
    df['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)

# Parse the date strings and remember the most recent transaction date.
df['t_dat'] = cudf.to_datetime(df['t_dat'])
last_ts = df['t_dat'].max()

In [5]:
%%time
# Label every transaction with `ldbw`: the first Tuesday on or after its date
# (e.g. Thursday 2018-09-20 -> Tuesday 2018-09-25), which identifies its sales week.
#
# `to_pandas()` already produces a host-side copy, so no extra device `.copy()` is needed.
tmp = df[['t_dat']].to_pandas()
tmp['dow'] = tmp['t_dat'].dt.dayofweek  # Monday=0 ... Sunday=6

# Days to move forward to reach Tuesday (dow == 1): Mon -> 1, Tue -> 0, Wed -> 6, ... Sun -> 2.
# A single modular expression replaces the former "subtract (dow - 1), then add 7 days
# where dow >= 2" two-step, and `pd.to_timedelta` replaces the deprecated
# `pd.TimedeltaIndex(..., unit='D')` constructor form.
days_to_week_end = (1 - tmp['dow']) % 7
tmp['ldbw'] = tmp['t_dat'] + pd.to_timedelta(days_to_week_end, unit='D')

# Copy the result back to the GPU frame positionally (row order is unchanged).
df['ldbw'] = tmp['ldbw'].values

CPU times: user 6.44 s, sys: 1.58 s, total: 8.02 s
Wall time: 7.91 s


In [6]:
# Sanity check: day-of-week of the first five transactions.
tmp['dow'].head(5)

0    3
1    3
2    3
3    3
4    3
Name: dow, dtype: int64

In [7]:
# Sanity check: week label (`ldbw`) assigned to the first five transactions.
tmp['ldbw'].head(5)

0   2018-09-25
1   2018-09-25
2   2018-09-25
3   2018-09-25
4   2018-09-25
Name: ldbw, dtype: datetime64[ns]

In [8]:
# Inspect the 7-day shift that applies to rows falling after Tuesday (dow >= 2): one
# '7 days' entry per such row. Built with `pd.to_timedelta` because the `unit` keyword of
# the `pd.TimedeltaIndex` constructor is deprecated, and counted with a boolean sum so no
# throw-away filtered frame is materialised.
pd.to_timedelta(np.full(int((tmp['dow'] >= 2).sum()), 7), unit='D')

TimedeltaIndex(['7 days', '7 days', '7 days', '7 days', '7 days', '7 days',
                '7 days', '7 days', '7 days', '7 days',
                ...
                '7 days', '7 days', '7 days', '7 days', '7 days', '7 days',
                '7 days', '7 days', '7 days', '7 days'],
               dtype='timedelta64[ns]', length=23535640, freq=None)

In [9]:
# Count transactions per (week label, article). After dropping `customer_id` the only
# remaining non-key column is `t_dat`, whose per-group count is renamed to `count`.
weekly_sales = (
    df.drop(columns=['customer_id'])
    .groupby(['ldbw', 'article_id'])
    .count()
    .reset_index()
    .rename(columns={'t_dat': 'count'})
)
weekly_sales.head()

Unnamed: 0,ldbw,article_id,count
0,2018-12-18,568652020,1
1,2019-05-14,560222012,5
2,2019-08-20,746260001,4
3,2020-04-28,831644001,28
4,2019-02-12,693614004,3


In [10]:
# Attach to every transaction the sales count of its article within its own week.
week_article_keys = ['ldbw', 'article_id']
df = df.merge(weekly_sales, how='left', on=week_article_keys)
df.head()

Unnamed: 0,t_dat,customer_id,article_id,ldbw,count
0,2018-09-20,-2324921015060372460,664319005,2018-09-25,27
1,2018-09-20,-2793344822301059978,562252010,2018-09-25,117
2,2018-09-20,9018061579597113584,670233004,2018-09-25,9
3,2018-09-20,5375564477674474076,464297007,2018-09-25,59
4,2018-09-20,-2324921015060372460,626316003,2018-09-25,5


In [11]:
# Attach `count_targ`: the article's sales count in the week labelled `last_ts`.
#
# The join is done on plain columns (no reset_index/set_index round trip, which only added
# a stray `index` column) and the join type is spelled out. It is an INNER join, as before:
# transactions of articles with no sales in the `last_ts` week are dropped here.
df = df.merge(
    weekly_sales.loc[weekly_sales['ldbw'] == last_ts, ['article_id', 'count']],
    on='article_id', how='inner', suffixes=("", "_targ"))

# Assign the result instead of calling fillna(inplace=True) on a column selection, which
# is not guaranteed to write back to `df`. With the inner join above no nulls are expected;
# this only matters if the join is ever switched to how='left'.
df['count_targ'] = df['count_targ'].fillna(0)
df.head()

Unnamed: 0,t_dat,customer_id,article_id,ldbw,count,count_targ
0,2018-09-20,2946259474250443951,537119003,2018-09-25,43,3
1,2018-09-20,-6688571413816940867,599502001,2018-09-25,197,3
2,2018-09-20,719197904754536089,493810014,2018-09-25,15,1
3,2018-09-20,-1648628336416815094,554598003,2018-09-25,56,46
4,2018-09-20,716806085948807908,688873001,2018-09-25,265,1


In [12]:
# The weekly counts have been merged into `df`; drop the name so the frame can be freed.
del weekly_sales

In [13]:
# Per-transaction ratio: the article's sales count in the `last_ts` week (`count_targ`)
# divided by its sales count in the week of this transaction (`count`).
df['quotient'] = df['count_targ'] / df['count']

In [14]:
# Score each article by the sum of `quotient` over all of its transactions and keep the
# N highest-scoring articles as the general (fallback) prediction.
# (Selecting the `quotient` column directly makes a prior drop of `customer_id` unnecessary.)
target_sales = df.groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.to_pandas().tolist()

# Article ids are written as 10-character, zero-padded strings. `zfill` pads to the full
# width whatever the number of digits, instead of assuming exactly one missing zero.
general_pred = [str(article_id).zfill(10) for article_id in general_pred]
general_pred_str = ' '.join(general_pred)
del target_sales

In [15]:
# Display the fallback article ids as a list of zero-prefixed strings.
general_pred 

['0448509014',
 '0573085028',
 '0751471001',
 '0706016001',
 '0673677002',
 '0715624001',
 '0706016003',
 '0158340001',
 '0579541001',
 '0372860001',
 '0372860002',
 '0706016002']

In [16]:
# Display the same fallback ids joined into one space-separated prediction string.
general_pred_str

'0448509014 0573085028 0751471001 0706016001 0673677002 0715624001 0706016003 0158340001 0579541001 0372860001 0372860002 0706016002'

In [17]:
%%time
# Build per-customer predictions: score every (customer, article) pair from the customer's
# own purchase history, keep the best-ranked articles and join them into one string.
purchase_dict = {}  # not used by the code below; kept only so the name stays defined

# `to_pandas()` already yields a host-side copy, so no extra device `.copy()` is needed.
tmp = df.to_pandas()

# x: age of the transaction in whole days relative to `last_ts`, floored at 1 so that the
# 1/sqrt(x) term below is always finite.
tmp['x'] = (last_ts - tmp['t_dat']).dt.days.clip(lower=1)

# y: time-decay weight a/sqrt(x) + b*exp(-c*x) - d, floored at 0 so old purchases
# contribute nothing rather than a negative amount.
a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
tmp['y'] = (a / np.sqrt(tmp['x']) + b * np.exp(-c * tmp['x']) - d).clip(lower=0)
tmp['value'] = tmp['quotient'] * tmp['y']

# Total score per (customer, article) pair.
tmp = tmp.groupby(['customer_id', 'article_id']).agg({'value': 'sum'}).reset_index()

# Keep pairs scoring above 100, then the N best dense ranks per customer (ties share a
# rank, so a customer may keep more than N rows here). The rank is held in a separate
# Series rather than assigned onto a filtered slice, and N replaces the literal 12.
tmp = tmp.loc[tmp['value'] > 100]
value_rank = tmp.groupby('customer_id')['value'].rank(method='dense', ascending=False)
tmp = tmp.loc[value_rank <= N]

# Order rows so that, within each customer, the highest-valued article comes first;
# groupby keeps that within-group order when the ids are joined.
purchase_df = tmp.sort_values(['customer_id', 'value'], ascending=False).reset_index(drop=True)

# Article ids as 10-character zero-padded strings, joined with single spaces.
# `' '.join` replaces the builtin `sum` string concatenation followed by a strip.
purchase_df['prediction'] = purchase_df['article_id'].astype(str).str.zfill(10)
purchase_df = purchase_df.groupby('customer_id')['prediction'].agg(' '.join).reset_index()
purchase_df = cudf.DataFrame(purchase_df)

CPU times: user 17.9 s, sys: 3.5 s, total: 21.4 s
Wall time: 21.3 s


In [18]:
%%time
def _final_prediction(personal, fallback_ids, n):
    """Return up to `n` distinct article ids as one space-separated string.

    personal     -- the customer's own prediction string, or a missing value (NaN/None)
                    when the customer has no personal prediction.
    fallback_ids -- list of general article ids used to fill the remaining slots.
    n            -- maximum number of ids in the result.

    Personal ids come first, in their given order; fallback ids follow, skipping any id
    that is already present.
    """
    personal_ids = personal.split() if isinstance(personal, str) else []
    # dict.fromkeys de-duplicates while preserving first-seen order.
    distinct_ids = list(dict.fromkeys(personal_ids + fallback_ids))
    return ' '.join(distinct_ids[:n])


sub = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
                    usecols=['customer_id'],
                    dtype={'customer_id': 'string'})

# Same id compression as applied to the transactions, kept in a separate column so the
# original string id is still available for the output file.
sub['customer_id2'] = sub['customer_id'].str[-16:].str.hex_to_int().astype('int64')

# Left join: customers without a personal prediction get a missing `prediction`.
sub = sub.merge(purchase_df, left_on='customer_id2', right_on='customer_id', how='left',
                suffixes=('', '_ignored'))

sub = sub.to_pandas()

# Fill every prediction up to N ids from the general list. Counting ids (instead of cutting
# the string at 131 characters) does not depend on the id width, and de-duplicating keeps
# a fallback id from repeating one the customer already has.
fallback_ids = general_pred_str.split()
sub['prediction'] = [_final_prediction(personal, fallback_ids, N) for personal in sub['prediction']]

sub = sub[['customer_id', 'prediction']]
sub.to_csv('submission_weekly.csv', index=False)

CPU times: user 14.1 s, sys: 967 ms, total: 15 s
Wall time: 17.7 s
