В данном ноутбуке создаются простые признаки на основе популярности.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../src/')

In [3]:
import numpy as np
import popularity
import datetime as dt
import pandas as pd; pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [4]:
# from google.colab import drive
# drive.mount('/content/gdrive')

### Считывание и предобработка данных

In [5]:
# Load the August user action log and shrink it to compact dtypes.
user_actions = pd.read_parquet('../data/user_actions_august.parquet')

# Fail loudly on unknown or missing action types. The previous
# map({...}).astype('bool') turned any such value into NaN, and NaN casts to
# True, so it was silently counted as an add-to-cart event.
unexpected_actions = set(user_actions['action_type'].unique()) - {'view', 'to_cart'}
if unexpected_actions:
    raise ValueError(
        f'Unexpected action_type values: {sorted(map(str, unexpected_actions))}'
    )

user_actions['clientid'] = user_actions['clientid'].astype('uint32')
user_actions['itemid'] = user_actions['itemid'].astype('uint32')
# Boolean encoding: False = 'view', True = 'to_cart'.
user_actions['action_type'] = user_actions['action_type'].eq('to_cart')

# Calendar date as a proleptic Gregorian ordinal (same integers that
# datetime.date.toordinal returns), computed without a Python-level apply.
# tz_localize(None) keeps the local wall-clock date, matching `.dt.date`.
unix_epoch = pd.Timestamp('1970-01-01')
user_actions['timestamp'] = (
    (user_actions['timestamp'].dt.tz_localize(None) - unix_epoch).dt.days
    + unix_epoch.toordinal()
).astype('uint32')

print(user_actions.shape)
user_actions.head()

(57585220, 4)


Unnamed: 0,clientid,itemid,action_type,timestamp
0,179153,135391270,False,737277
1,179153,135391272,False,737277
2,179153,135391270,False,737279
3,179153,142132354,False,737291
4,179153,140487634,False,737294


In [6]:
# Load the train target and cast the id / label columns to compact integer
# dtypes in a single pass; all other columns are left as read.
train_target = (
    pd.read_csv("../data/target_train.csv")
    .astype({
        'clientid': 'uint32',
        'itemid': 'uint32',
        'jointitemid': 'uint32',
        'label': 'int8',
    })
)

print(train_target.shape)

(1390438, 5)


In [7]:
# Load the test target and cast the id / label columns to compact integer
# dtypes in a single pass; all other columns are left as read.
test_target = (
    pd.read_csv("../data/target_test.csv")
    .astype({
        'clientid': 'uint32',
        'itemid': 'uint32',
        'jointitemid': 'uint32',
        'label': 'int8',
    })
)

print(test_target.shape)

(597158, 5)


### Генерация признаков

In [8]:
def generate_features(user_actions: pd.DataFrame,
                      target: pd.DataFrame) -> pd.DataFrame:
    '''
    Build the popularity-based feature set for a target frame.

    Runs the feature builders from the ``popularity`` module in a fixed
    order, feeding the result of each builder into the next one.

    Args:
        user_actions: Dataframe with the August user actions; passed
            unchanged to every builder except the final ``relations`` step.
        target: Dataframe containing the target; input to the first builder.

    Returns:
        The result of the final ``popularity.relations`` call.
    '''

    # (builder, extra positional args) in execution order. The order is kept
    # exactly as before: later builders may rely on columns added by earlier
    # ones (not verifiable from this notebook).
    pipeline = (
        (popularity.popularity, ('view',)),
        (popularity.popularity, ('to_cart',)),
        (popularity.ctr, ()),
        (popularity.date_first_view, ()),
        (popularity.mean_amount_per_day, ('view',)),
        (popularity.mean_amount_per_day, ('to_cart',)),
        (popularity.views_last_day, ()),
        (popularity.cart_add_last_day, ()),
    )

    features = target
    for build, extra_args in pipeline:
        features = build(user_actions, features, *extra_args)

    # The last step only needs the accumulated frame, not the action log.
    return popularity.relations(features)

In [9]:
# Popularity features for the train target; show the shape and a two-row preview.
train = generate_features(user_actions=user_actions, target=train_target)
print(train.shape)
train.head(2)

(1390438, 15)


Unnamed: 0,clientid,itemid,jointitemid,label,timestamp,view_cnt,to_cart_cnt,ctr,novelty_cnt,mean_amount_per_day_view,mean_amount_per_day_to_cart,last_day_views_cnt,last_day_to_cart_cnt,relation_ldv_mean,relation_ldtocart_mean
0,7833842,31499843,138176581,1,2019-09-07 20:11:01,31.0,9.0,0.28125,737272.0,1.722222,1.5,0.0,0.0,0.0,0.0
1,19548158,147389610,148381589,0,2019-08-31 22:32:31,24.0,9.0,0.36,737272.0,1.6,1.125,0.0,0.0,0.0,0.0


In [10]:
# Popularity features for the test target; show the shape and a two-row preview.
test = generate_features(user_actions=user_actions, target=test_target)
print(test.shape)
test.head(2)

(597158, 15)


Unnamed: 0,clientid,itemid,jointitemid,label,timestamp,view_cnt,to_cart_cnt,ctr,novelty_cnt,mean_amount_per_day_view,mean_amount_per_day_to_cart,last_day_views_cnt,last_day_to_cart_cnt,relation_ldv_mean,relation_ldtocart_mean
0,8081929,152898248,152875664,0,2019-09-06 18:57:23,10.0,2.0,0.181818,737276.0,2.0,1.0,0.0,0.0,0.0,0.0
1,33378638,144847078,140715321,0,2019-09-02 07:09:31,211.0,19.0,0.089623,737272.0,7.814815,1.727273,11.0,1.0,1.407583,0.578947


### Коэффициенты МНК (метод наименьших квадратов)

In [11]:
# One daily_views_to_cart call per action kind, 'view' first, then 'to_cart'
# (presumably per-item daily counts; confirm in popularity.daily_views_to_cart).
item_day_view, item_day_to_cart = (
    popularity.daily_views_to_cart(user_actions, action_kind)
    for action_kind in ('view', 'to_cart')
)

In [12]:
# One mnk_coef call per action kind, each on its matching daily frame
# (presumably least-squares coefficients; confirm in popularity.mnk_coef).
mnk_view, mnk_to_cart = (
    popularity.mnk_coef(daily_frame, action_kind)
    for daily_frame, action_kind in (
        (item_day_view, 'view'),
        (item_day_to_cart, 'to_cart'),
    )
)

In [15]:
print(train.shape)

# Left-join the view coefficients, then the to_cart coefficients, onto train.
# Each coefficient frame is keyed by `itemid`, renamed here to match the
# `jointitemid` column. Exact duplicate rows are dropped after each join.
for mnk_coefs in (mnk_view, mnk_to_cart):
    mnk_by_joint_item = mnk_coefs.rename(columns={'itemid': 'jointitemid'})
    train = train.merge(mnk_by_joint_item, on='jointitemid', how='left')
    train = train.drop_duplicates().reset_index(drop=True)

# Every remaining missing value in the frame is replaced with 0.
train = train.fillna(0)
print(train.shape)

(1390438, 15)
(1390438, 17)


In [16]:
print(test.shape)

# Left-join the view coefficients, then the to_cart coefficients, onto test.
# Each coefficient frame is keyed by `itemid`, renamed here to match the
# `jointitemid` column. Exact duplicate rows are dropped after each join.
for mnk_coefs in (mnk_view, mnk_to_cart):
    mnk_by_joint_item = mnk_coefs.rename(columns={'itemid': 'jointitemid'})
    test = test.merge(mnk_by_joint_item, on='jointitemid', how='left')
    test = test.drop_duplicates().reset_index(drop=True)

# Every remaining missing value in the frame is replaced with 0.
test = test.fillna(0)
print(test.shape)

(597158, 15)
(597158, 17)


### Сохраняем признаки

In [21]:
# Write both feature frames to CSV without the row index.
for feature_frame, csv_path in (
    (train, '../data/train_with_features.csv'),
    (test, '../data/test_with_features.csv'),
):
    feature_frame.to_csv(csv_path, index=False)