In [19]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

In [20]:
# Загрузка данных
triggers = pd.read_csv('triggers.csv')
actions = pd.read_csv('actions.csv')



In [21]:
triggers.head()

Unnamed: 0,guid,date,trigger,type
0,018bdd75-83fc-2510-50e7-153f7b5e4b64,2024-05-31 23:55:27,2047,2
1,0187a472-becb-7994-5e1e-f179b60bc141,2024-05-31 23:55:27,168,2
2,018ba175-9e02-8c7a-e522-6e5fec6949a0,2024-05-31 23:55:26,1807,2
3,018ba175-9e02-8c7a-e522-6e5fec6949a0,2024-05-31 23:55:26,2048,2
4,018e9cfb-c8bc-f38b-e209-5a77c5324a14,2024-05-31 23:55:26,1773,2


In [22]:
actions.head()

Unnamed: 0,guid,date,result
0,0187a45c-6784-7e2f-5d84-f3c89dee6a60,2024-05-20 08:28:13,0
1,0187a45d-650b-4a4f-ea59-9432556c9b1d,2024-05-31 08:19:10,0
2,018ba1bd-3c62-0269-e77f-655655f10b3e,2024-05-13 09:01:37,0
3,018efb50-1738-e3af-58e8-0550e9a272ca,2024-05-22 05:18:46,0
4,0187a45c-7aa1-3ec6-3c68-c86cb9172299,2024-05-10 16:18:53,0


In [23]:
triggers['date'] = pd.to_datetime(triggers['date'])
actions['date'] = pd.to_datetime(actions['date'])

- Сортировка данных по столбцам 'date' и 'guid' для корректного объединения данных.
- Необходима для работы метода merge_asof, который требует отсортированных данных.


In [24]:
triggers = triggers.sort_values(['date', 'guid'], ascending=[True, True]).reset_index(drop=True)
actions = actions.sort_values(['date', 'guid'], ascending=[True, True]).reset_index(drop=True)

- Объединение данных triggers и actions по 'guid' и ближайшей дате события с использованием pd.merge_asof.
- direction='forward' находит ближайшее будущее действие после триггера.
- allow_exact_matches=True позволяет включать совпадающие даты.

In [25]:
merged = pd.merge_asof(
    triggers,
    actions,
    by='guid',
    on='date',
    direction='forward',
    suffixes=('', '_action'),
    allow_exact_matches=True
)
merged['result'] = merged['result'].fillna(0)

#### Вычисление времени (в днях) с момента последнего триггера для каждого пользователя.

In [26]:
merged['time_since_last_trigger'] = merged.groupby('guid')['date'].diff().dt.total_seconds() / (3600*24)
merged['time_since_last_trigger'] = merged['time_since_last_trigger'].fillna(0)

#### Подсчет кумулятивного количества триггеров для каждого пользователя, чтобы отразить активность пользователя.

In [27]:
merged['trigger_count'] = merged.groupby('guid').cumcount()

#### Подсчет количества предыдущих успешных взаимодействий для каждого пользователя.
#### Этот признак показывает склонность пользователя к положительным результатам.

In [28]:
merged['prev_successes'] = merged.groupby('guid')['result'].cumsum() - merged['result']

#### Срезание части строк, чтобы не перегружать процессор

In [29]:
train_merge = merged.iloc[:5000000]

## Разбиение и обучение модели

In [30]:
split_date = train_merge['date'].quantile(0.8)
train_data = train_merge[train_merge['date'] <= split_date].reset_index(drop=True)
val_data = train_merge[train_merge['date'] > split_date].reset_index(drop=True)

In [31]:
features = ['trigger', 'type', 'time_since_last_trigger', 'trigger_count', 'prev_successes']
categorical_features = ['trigger', 'type']

In [32]:
features = ['trigger', 'type', 'time_since_last_trigger', 'trigger_count', 'prev_successes']
target = 'result'

In [33]:
train_pool = Pool(train_data[features], label=train_data['result'], cat_features=categorical_features)
val_pool = Pool(val_data[features], label=val_data['result'], cat_features=categorical_features)

In [34]:
categorical_features = ['trigger', 'type']
numerical_features = ['time_since_last_trigger', 'trigger_count', 'prev_successes']


In [35]:
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    eval_metric='AUC',
    random_seed=42,
    use_best_model=True
)

In [36]:
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=100)

0:	test: 0.9960389	best: 0.9960389 (0)	total: 2.61s	remaining: 43m 28s
100:	test: 0.9989268	best: 0.9989268 (100)	total: 3m 16s	remaining: 29m 10s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9989590774
bestIteration = 140

Shrink model to first 141 iterations.


<catboost.core.CatBoostClassifier at 0x20a803730b0>

In [37]:
val_pred = model.predict_proba(val_pool)[:, 1]

In [38]:
roc_auc_score(val_data['result'], val_pred)

0.9989590774144912

# Создание копии данных валидации для инференса

In [61]:
new_triggers = val_data.copy()

In [62]:
last_interactions = actions.groupby('guid')['date'].max().reset_index()
last_interactions.columns = ['guid', 'last_action_date']

In [63]:
new_triggers['guid'] = new_triggers['guid'].astype(str)
last_interactions['guid'] = last_interactions['guid'].astype(str)

In [64]:
new_triggers = new_triggers.merge(last_interactions, on='guid', how='left')

In [65]:
new_triggers['time_since_last_action'] = (new_triggers['date'] - new_triggers['last_action_date']).dt.total_seconds() / (3600*24)
new_triggers['time_since_last_action'] = new_triggers['time_since_last_action'].fillna(float('inf'))

In [66]:
new_triggers = new_triggers[new_triggers['time_since_last_action'] > 14]

In [67]:
new_triggers = new_triggers.sort_values(['guid', 'date']).reset_index(drop=True)
new_triggers['time_since_last_trigger'] = new_triggers.groupby('guid')['date'].diff().dt.total_seconds() / (3600*24)
new_triggers['time_since_last_trigger'] = new_triggers['time_since_last_trigger'].fillna(float('inf'))
new_triggers['trigger_count'] = new_triggers.groupby('guid').cumcount()

In [68]:
prev_successes = merged.groupby('guid')['result'].sum().reset_index()
prev_successes.columns = ['guid', 'prev_successes']
prev_successes['guid'] = prev_successes['guid'].astype(str)

In [69]:
new_triggers = new_triggers.merge(prev_successes, on='guid', how='left')

In [70]:
new_triggers['prev_successes'] = new_triggers['prev_successes_x']
new_triggers.drop(['prev_successes_x', 'prev_successes_y'], axis=1, inplace=True)

In [71]:
inference_pool = Pool(new_triggers[features], cat_features=categorical_features)

In [72]:
new_triggers['predicted_probability'] = model.predict_proba(inference_pool)[:, 1]

In [73]:
new_triggers[['guid', 'date', 'trigger', 'type', 'predicted_probability']]

Unnamed: 0,guid,date,trigger,type,predicted_probability
0,018780a7-2870-b8c0-8fd7-d8621b6bdd65,2024-05-04 08:12:43,1811,2,0.010847
1,018780a8-f642-fb39-d2ac-4b3fb38f70e2,2024-05-04 07:57:38,243,1,0.004844
2,018780a8-f642-fb39-d2ac-4b3fb38f70e2,2024-05-04 07:57:45,243,1,0.000770
3,018780a8-f642-fb39-d2ac-4b3fb38f70e2,2024-05-04 07:57:51,243,1,0.000096
4,018783bd-1443-546f-0e9a-f4083a01db2c,2024-05-04 07:24:37,1815,2,0.010847
...,...,...,...,...,...
323179,018f42de-95da-8289-5b57-5aa27b65896b,2024-05-04 09:10:26,187,1,0.004793
323180,018f42de-9866-b6f9-e09d-217467112f23,2024-05-04 09:10:27,105,1,0.004844
323181,018f42de-9866-b6f9-e09d-217467112f23,2024-05-04 09:10:29,105,1,0.000801
323182,018f42de-9fa2-fedb-5069-5170e9cb6aff,2024-05-04 09:10:28,2096,1,0.005388
