In [1]:
from functools import partial
from multiprocessing import Pool

import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, precision_score, recall_score

# Reading data

Demographics and category data (`semrush_cup_categories_and_demo.csv`) are read but not used in the basic solution.

In [2]:
# Labelled events: used for the lookup statistics, model fitting and local testing.
train = pd.read_csv('semrush_cup_train_data.csv')

# Events to predict. create_features() selects an `is_referrer` column, so the
# frame gets a constant placeholder label to run through the same code path.
val = pd.read_csv('semrush_cup_test_data.csv').assign(is_referrer=False)

# Categories and demographics: loaded here, but not referenced by the code below.
cats_and_demo = pd.read_csv('semrush_cup_categories_and_demo.csv')

In [3]:
# Split by device, not by row, so all events of one device end up in one subset.
uids_list = train['device_id'].unique()

# 70% of the devices feed the lookup statistics; the remaining 30% are split
# 70/30 again into the model-fitting and the local-testing devices.
stat_uids, rest_uids = train_test_split(uids_list, train_size=0.7, random_state=0)
train_uids, test_uids = train_test_split(rest_uids, train_size=0.7, random_state=0)

# Materialise the three event subsets from their device id sets.
stat_data, train_data, test_data = (
    train[train['device_id'].isin(uids)]
    for uids in (stat_uids, train_uids, test_uids)
)

# Calculation of statistics

Here we calculate statistics for traffic sources.  
Most of the data (70% of the devices) is used for this calculation, because these statistics are the basis for the features of the basic solution.

In [4]:
# All three lookups are plain dicts built from the statistics subset only.
_by_referrer = stat_data.groupby('referrer_domain')

# (domain, referrer_domain) -> number of distinct devices that produced the pair.
pair_devices_count = (
    stat_data
    .groupby(['domain', 'referrer_domain'])['device_id']
    .nunique()
    .to_dict()
)

# referrer_domain -> number of distinct devices it appeared on as a referrer.
is_referrer_devices_count = _by_referrer['device_id'].nunique().to_dict()

# referrer_domain -> number of distinct domains it was a referrer for.
is_referrer_count = _by_referrer['domain'].nunique().to_dict()

# Creating features

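Based on these statistics, we generate the following features for each candidate event (an event preceding the target event): 
- `is_referrer_count` – The number of distinct domains for which the candidate's domain acted as a referrer.
- `is_referrer_devices_count` – The number of distinct devices on which the candidate's domain acted as a referrer, i.e. how popular it is as a referrer.
- `pair_devices_count` – The number of distinct devices on which the candidate's domain acted as a referrer for the target event's domain.
- `is_referrer_prob` – `pair_devices_count / is_referrer_devices_count`: the "probability" that the candidate's domain is the referrer of the target event.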

Other features:
- `delta_time` – The time between the candidate event and the target event (target timestamp minus candidate timestamp).




In [5]:
def create_features(event_group, pair_devices_count, is_referrer_devices_count, is_referrer_count):
    """Build one feature row per non-target event of a single event group.

    Parameters
    ----------
    event_group : pandas.DataFrame
        Events of one group. The columns read here are `device_id`,
        `event_group_id`, `timestamp`, `domain`, `referrer_num` and
        `is_referrer`. The row whose `referrer_num` equals 'target' is the
        event a referrer is being predicted for.
    pair_devices_count : dict
        Looked up with the key `(target domain, candidate domain)`.
    is_referrer_devices_count : dict
        Looked up with the candidate domain.
    is_referrer_count : dict
        Looked up with the candidate domain.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by `timestamp`, target row removed, index reset. Columns:
        the four identifier/label columns followed by `delta_time`,
        `is_referrer_count`, `is_referrer_devices_count`,
        `pair_devices_count` and `is_referrer_prob`.

    Raises
    ------
    IndexError
        If the group contains no row with `referrer_num == 'target'`.
    """
    ordered = event_group.sort_values('timestamp')
    is_target = ordered['referrer_num'] == 'target'

    # The first row flagged as target defines the reference domain and time.
    target_events = ordered[is_target]
    target_domain = target_events['domain'].values[0]
    target_time = target_events['timestamp'].values[0]

    candidate_domains = ordered['domain']

    # Keys missing from a lookup count as 0.
    features = ordered[['device_id', 'event_group_id', 'referrer_num', 'is_referrer']].assign(
        delta_time=target_time - ordered['timestamp'],
        is_referrer_count=candidate_domains.map(
            lambda d: is_referrer_count.get(d, 0)),
        is_referrer_devices_count=candidate_domains.map(
            lambda d: is_referrer_devices_count.get(d, 0)),
        pair_devices_count=candidate_domains.map(
            lambda d: pair_devices_count.get((target_domain, d), 0)),
    )

    # A domain absent from both lookups yields 0/0, which pandas stores as NaN.
    features['is_referrer_prob'] = (
        features['pair_devices_count'] / features['is_referrer_devices_count']
    )

    # Drop the event the referrer is predicted for; only candidates remain.
    return features[~is_target].reset_index(drop=True)

# Bind the three lookup dicts once, leaving a one-argument callable that takes
# a single event group. A partial of a module-level function (rather than a
# lambda) is used because Pool.map has to pickle the callable.
_lookup_tables = {
    'pair_devices_count': pair_devices_count,
    'is_referrer_devices_count': is_referrer_devices_count,
    'is_referrer_count': is_referrer_count,
}
func = partial(create_features, **_lookup_tables)

In [6]:
%%time

# Number of worker processes; tune to the machine this notebook runs on.
N_WORKERS = 32

# Columns emitted by create_features that identify an event or carry the label.
# Every other column is a model input.
NON_FEATURE_COLUMNS = ['device_id', 'event_group_id', 'referrer_num', 'is_referrer']


def build_features(pool, events):
    """Apply `func` to every (device_id, event_group_id) group of `events`.

    Parameters
    ----------
    pool : multiprocessing.pool.Pool
        Open worker pool used to process the groups.
    events : pandas.DataFrame
        Event rows; must contain `device_id` and `event_group_id` plus the
        columns `func` reads.

    Returns
    -------
    pandas.DataFrame
        The per-group feature frames concatenated in group order. Each group
        keeps its own 0-based index, so index labels repeat across groups.
    """
    # The group list is local, so it can be released as soon as this call
    # returns instead of lingering in a notebook-level variable.
    groups = [group for _, group in events.groupby(['device_id', 'event_group_id'])]
    return pd.concat(pool.map(func, groups))


with Pool(N_WORKERS) as pool:
    train_features = build_features(pool, train_data)
    test_features = build_features(pool, test_data)
    val_features = build_features(pool, val)

# Select model inputs by name rather than by position, so reordering or adding
# identifier columns in create_features cannot silently change the feature set.
feature_names = train_features.columns.drop(NON_FEATURE_COLUMNS)

CPU times: user 22min 4s, sys: 2min 7s, total: 24min 11s
Wall time: 23min 44s


# Model training


Before training, we balance the data by downsampling, so that referrer and non-referrer rows are equally represented.  
The closer two events are, the more likely there was a referral.

In [7]:
# Binary classifier: is a given candidate event the referrer of its target event?
clf = xgb.XGBClassifier(max_depth=3, n_estimators=111,  
                        use_label_encoder=False, random_state=0, nthread=32)

# Balance the two `is_referrer` classes by downsampling each one to the size
# of the rarer class. The sampling is seeded so that the training set, and
# therefore the fitted model and the score below, are the same on every
# Restart & Run All.
min_size = train_features.groupby('is_referrer').size().min()
stupidly_balanced = (
    train_features
    .groupby('is_referrer')
    .sample(n=int(min_size), random_state=0)
)
train_X = stupidly_balanced[feature_names]
train_y = stupidly_balanced['is_referrer']

clf = clf.fit(train_X, train_y)

# Scored on the same rows the model was fitted on: a sanity check only.
print('train f1:', f1_score(train_y, clf.predict(train_X)))

train f1: 0.8436456831136696


# Testing

Since the model was trained for binary classification, within each event group we take as the answer the event with the maximum score among the 10 preceding events.

In [8]:
# Score every candidate row of the held-out devices with P(is_referrer).
predict = test_features[['event_group_id', 'is_referrer']].assign(
    predict=clf.predict_proba(test_features[feature_names])[:, 1]
)

# NOTE(review): features were built per (device_id, event_group_id) but rows
# are grouped here by event_group_id alone. This assumes event_group_id is
# unique across devices; confirm against the data.
by_group = predict.groupby('event_group_id')

# argmax() yields the 0-based position inside the group: the position of the
# first True label, and the position of the highest score.
# NOTE(review): a group without any True label yields position 0; verify that
# every group contains a referrer.
true_values = by_group['is_referrer'].apply(lambda flags: flags.argmax()).values
predict_values = by_group['predict'].apply(lambda scores: scores.argmax()).values

In [9]:
# Macro average: the per-position F1 scores are averaged with equal weight.
macro_f1 = f1_score(true_values, predict_values, average='macro')
print(macro_f1)

0.6596251632148219


In [10]:
# One score per predicted position (average=None), rounded to two decimals.
f1, precision, recall = (
    np.round(metric(true_values, predict_values, average=None), 2)
    for metric in (f1_score, precision_score, recall_score)
)

# One output line per position, in the order the scorers return the classes.
for position in range(len(f1)):
    print('f1/precision/recall:', f1[position], precision[position], recall[position])

f1/precision/recall: 0.59 0.6 0.59
f1/precision/recall: 0.6 0.61 0.59
f1/precision/recall: 0.62 0.63 0.6
f1/precision/recall: 0.61 0.61 0.6
f1/precision/recall: 0.62 0.63 0.61
f1/precision/recall: 0.64 0.64 0.64
f1/precision/recall: 0.65 0.66 0.65
f1/precision/recall: 0.68 0.67 0.69
f1/precision/recall: 0.72 0.7 0.74
f1/precision/recall: 0.87 0.87 0.86


# Submit

In [11]:
# For every event group of the data to predict, keep the 0-based position of
# the candidate with the highest predicted score.
# NOTE(review): grouping uses event_group_id alone, and the value stored as
# `referrer_num` is a position within the timestamp-ordered group rather than
# the original `referrer_num` column; confirm both match the expected format.
ans = (
    val_features[['event_group_id']]
    .assign(predict=clf.predict_proba(val_features[feature_names])[:, 1])
    .groupby('event_group_id')
    .apply(lambda group: group['predict'].argmax())
    .reset_index()
    .rename(columns={0: 'referrer_num'})
)