In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
import catboost

In [2]:
train = pd.read_pickle('dataset/train-catboost.pk')
val = pd.read_pickle('dataset/val-catboost.pk')
test = pd.read_pickle('dataset/test-catboost.pk')

In [3]:
train.shape, test.shape

((29370, 32), (13184, 31))

In [4]:
train.select_dtypes(exclude='number')

Unnamed: 0_level_0,campaign_var_1_trim,campaign_var_2_trim,user_act_trim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18378,others,7,000000010
36502,6,7,000100010
34367,8,5,100000000
35466,4,5,000011000
8924,6,5,000011000
...,...,...,...
37112,9,5,000011000
30399,5,10,000110010
8517,5,3,000010000
5557,4,7,000000100


In [5]:
val.select_dtypes(exclude='number')

Unnamed: 0_level_0,campaign_var_1_trim,campaign_var_2_trim,user_act_trim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14234,9,6,100000110
39076,11,10,000010000
18900,others,8,000001010
38889,10,9,000010010
24408,6,others,100000000
...,...,...,...
30129,3,7,010010000
2024,2,3,010000000
34983,11,others,000100000
32726,10,10,010001000


In [6]:
test.select_dtypes(exclude='number')

Unnamed: 0_level_0,campaign_var_1_trim,campaign_var_2_trim,user_act_trim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
39162,2,others,100010010
39163,4,7,100000100
39164,8,7,000110000
39165,9,8,others
39166,4,5,100000000
...,...,...,...
52341,others,others,000011000
52342,others,8,100010000
52343,others,others,010001000
52344,others,10,000001010


In [7]:
cols = ['campaign_var_1_trim', 'campaign_var_2_trim', 'user_act_trim']

In [8]:
train.isna().sum()

campaign_var_1              0
campaign_var_2              0
products_purchased          0
user_activity_var_1         0
user_activity_var_2         0
user_activity_var_3         0
user_activity_var_4         0
user_activity_var_5         0
user_activity_var_6         0
user_activity_var_7         0
user_activity_var_8         0
user_activity_var_9         0
user_activity_var_10        0
user_activity_var_11        0
user_activity_var_12        0
buy                         0
campaign_var_1_trim         0
campaign_var_2_trim         0
user_act_trim               0
created_at_month            0
signup_date_presence        0
diff                        0
campaign_var_1_trim_freq    0
campaign_var_2_trim_freq    0
user_act_trim_freq          0
products_purchased_freq     0
created_at_month_freq       0
campaign_var_1_trim_tme     0
campaign_var_2_trim_tme     0
user_act_trim_tme           0
products_purchased_tme      0
created_at_month_tme        0
dtype: int64

In [9]:
target = 'buy'

In [10]:
y_train = train.pop(target)
X_train = train.copy()

In [11]:
y_val = val.pop(target)
X_val = val.copy()

In [12]:
# y_test = test.pop(target)
X_test = test.copy()

In [13]:
# X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, stratify=y, random_state=42)
# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [14]:
total = y_train.shape[0]
pos = y_train.sum()
neg = total-pos

class_weights = [neg/total, pos/total]
class_weights

[0.9489955737146748, 0.051004426285325165]

In [15]:
pos_weight = neg/pos
pos_weight

18.60614152202937

In [16]:
X_train.dtypes

campaign_var_1                int64
campaign_var_2                int64
products_purchased          float64
user_activity_var_1           int64
user_activity_var_2           int64
user_activity_var_3           int64
user_activity_var_4           int64
user_activity_var_5           int64
user_activity_var_6           int64
user_activity_var_7           int64
user_activity_var_8           int64
user_activity_var_9           int64
user_activity_var_10          int64
user_activity_var_11          int64
user_activity_var_12          int64
campaign_var_1_trim          object
campaign_var_2_trim          object
user_act_trim                object
created_at_month              int64
signup_date_presence          int64
diff                        float64
campaign_var_1_trim_freq    float64
campaign_var_2_trim_freq    float64
user_act_trim_freq          float64
products_purchased_freq     float64
created_at_month_freq       float64
campaign_var_1_trim_tme     float64
campaign_var_2_trim_tme     

In [17]:
categorical_var = X_train.select_dtypes(exclude='number').columns
categorical_var

Index(['campaign_var_1_trim', 'campaign_var_2_trim', 'user_act_trim'], dtype='object')

In [18]:
cat_idx = np.where(X_train.columns.isin(categorical_var))[0]
cat_idx

array([15, 16, 17])

In [19]:
model = catboost.CatBoostClassifier(class_weights=class_weights, verbose=2)
model

<catboost.core.CatBoostClassifier at 0x7f6b720bcd10>

In [20]:
model.fit(X_train, y_train, cat_features=cat_idx)

Learning rate set to 0.043624
0:	learn: 0.5733937	total: 60.8ms	remaining: 1m
2:	learn: 0.3888251	total: 86.6ms	remaining: 28.8s
4:	learn: 0.2621753	total: 113ms	remaining: 22.6s
6:	learn: 0.1780163	total: 135ms	remaining: 19.1s
8:	learn: 0.1230135	total: 154ms	remaining: 16.9s
10:	learn: 0.0879205	total: 175ms	remaining: 15.7s
12:	learn: 0.0633985	total: 196ms	remaining: 14.9s
14:	learn: 0.0480216	total: 220ms	remaining: 14.5s
16:	learn: 0.0380121	total: 246ms	remaining: 14.2s
18:	learn: 0.0304895	total: 267ms	remaining: 13.8s
20:	learn: 0.0250640	total: 285ms	remaining: 13.3s
22:	learn: 0.0211496	total: 304ms	remaining: 12.9s
24:	learn: 0.0184081	total: 324ms	remaining: 12.6s
26:	learn: 0.0165851	total: 342ms	remaining: 12.3s
28:	learn: 0.0151601	total: 361ms	remaining: 12.1s
30:	learn: 0.0140962	total: 379ms	remaining: 11.9s
32:	learn: 0.0132896	total: 396ms	remaining: 11.6s
34:	learn: 0.0126930	total: 413ms	remaining: 11.4s
36:	learn: 0.0122555	total: 436ms	remaining: 11.4s
38:	lea

318:	learn: 0.0092893	total: 3.41s	remaining: 7.28s
320:	learn: 0.0092840	total: 3.43s	remaining: 7.26s
322:	learn: 0.0092795	total: 3.46s	remaining: 7.24s
324:	learn: 0.0092789	total: 3.47s	remaining: 7.21s
326:	learn: 0.0092715	total: 3.49s	remaining: 7.19s
328:	learn: 0.0092681	total: 3.52s	remaining: 7.17s
330:	learn: 0.0092680	total: 3.54s	remaining: 7.14s
332:	learn: 0.0092674	total: 3.56s	remaining: 7.12s
334:	learn: 0.0092611	total: 3.58s	remaining: 7.1s
336:	learn: 0.0092552	total: 3.6s	remaining: 7.09s
338:	learn: 0.0092509	total: 3.63s	remaining: 7.08s
340:	learn: 0.0092435	total: 3.66s	remaining: 7.08s
342:	learn: 0.0092434	total: 3.69s	remaining: 7.07s
344:	learn: 0.0092432	total: 3.72s	remaining: 7.06s
346:	learn: 0.0092430	total: 3.74s	remaining: 7.03s
348:	learn: 0.0092374	total: 3.76s	remaining: 7.01s
350:	learn: 0.0092365	total: 3.78s	remaining: 6.98s
352:	learn: 0.0092351	total: 3.8s	remaining: 6.96s
354:	learn: 0.0092344	total: 3.82s	remaining: 6.93s
356:	learn: 0.0

636:	learn: 0.0090138	total: 6.79s	remaining: 3.87s
638:	learn: 0.0090134	total: 6.81s	remaining: 3.85s
640:	learn: 0.0090090	total: 6.83s	remaining: 3.83s
642:	learn: 0.0090085	total: 6.85s	remaining: 3.8s
644:	learn: 0.0090080	total: 6.87s	remaining: 3.78s
646:	learn: 0.0090079	total: 6.89s	remaining: 3.76s
648:	learn: 0.0090079	total: 6.91s	remaining: 3.73s
650:	learn: 0.0090057	total: 6.93s	remaining: 3.72s
652:	learn: 0.0090056	total: 6.95s	remaining: 3.69s
654:	learn: 0.0090025	total: 6.97s	remaining: 3.67s
656:	learn: 0.0089988	total: 6.99s	remaining: 3.65s
658:	learn: 0.0089987	total: 7.01s	remaining: 3.63s
660:	learn: 0.0089987	total: 7.03s	remaining: 3.6s
662:	learn: 0.0089976	total: 7.05s	remaining: 3.58s
664:	learn: 0.0089932	total: 7.07s	remaining: 3.56s
666:	learn: 0.0089931	total: 7.09s	remaining: 3.54s
668:	learn: 0.0089931	total: 7.11s	remaining: 3.52s
670:	learn: 0.0089923	total: 7.13s	remaining: 3.49s
672:	learn: 0.0089922	total: 7.14s	remaining: 3.47s
674:	learn: 0.

968:	learn: 0.0087659	total: 10.4s	remaining: 332ms
970:	learn: 0.0087646	total: 10.4s	remaining: 310ms
972:	learn: 0.0087643	total: 10.4s	remaining: 289ms
974:	learn: 0.0087601	total: 10.4s	remaining: 267ms
976:	learn: 0.0087598	total: 10.4s	remaining: 246ms
978:	learn: 0.0087598	total: 10.5s	remaining: 225ms
980:	learn: 0.0087594	total: 10.5s	remaining: 203ms
982:	learn: 0.0087594	total: 10.5s	remaining: 182ms
984:	learn: 0.0087563	total: 10.5s	remaining: 160ms
986:	learn: 0.0087563	total: 10.5s	remaining: 139ms
988:	learn: 0.0087560	total: 10.6s	remaining: 117ms
990:	learn: 0.0087560	total: 10.6s	remaining: 96ms
992:	learn: 0.0087553	total: 10.6s	remaining: 74.7ms
994:	learn: 0.0087553	total: 10.6s	remaining: 53.4ms
996:	learn: 0.0087550	total: 10.6s	remaining: 32ms
998:	learn: 0.0087536	total: 10.7s	remaining: 10.7ms
999:	learn: 0.0087535	total: 10.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f6b720bcd10>

In [21]:
feat_imp = pd.Series(model.feature_importances_, index=X_train.columns)
feat_imp.sort_values(ascending=False)

signup_date_presence        30.789326
user_activity_var_9         12.290671
diff                        12.214790
user_activity_var_11        10.697111
user_activity_var_8          6.636452
user_activity_var_2          5.905280
user_activity_var_5          4.354587
user_activity_var_4          4.085095
campaign_var_1_trim_freq     2.115251
campaign_var_1               1.396540
campaign_var_2               1.272769
user_activity_var_1          1.196992
user_act_trim_tme            0.930837
products_purchased_freq      0.720986
products_purchased           0.719498
created_at_month_freq        0.588547
campaign_var_2_trim_tme      0.533114
campaign_var_2_trim_freq     0.508565
campaign_var_2_trim          0.499274
products_purchased_tme       0.489511
created_at_month_tme         0.441156
created_at_month             0.337106
campaign_var_1_trim_tme      0.266414
user_act_trim                0.192148
user_activity_var_6          0.189647
user_act_trim_freq           0.175272
user_activit

In [22]:
feat_imp.nlargest(15)*100

signup_date_presence        3078.932620
user_activity_var_9         1229.067126
diff                        1221.478968
user_activity_var_11        1069.711133
user_activity_var_8          663.645171
user_activity_var_2          590.528030
user_activity_var_5          435.458736
user_activity_var_4          408.509492
campaign_var_1_trim_freq     211.525063
campaign_var_1               139.653988
campaign_var_2               127.276883
user_activity_var_1          119.699223
user_act_trim_tme             93.083724
products_purchased_freq       72.098603
products_purchased            71.949781
dtype: float64

# Train scoring

In [23]:
y_train_prob = model.predict_proba(X_train)
y_train_prob

array([[9.99901742e-01, 9.82577961e-05],
       [9.87027419e-01, 1.29725807e-02],
       [9.99954260e-01, 4.57402501e-05],
       ...,
       [9.99706405e-01, 2.93595365e-04],
       [9.99644545e-01, 3.55455095e-04],
       [9.79278737e-01, 2.07212631e-02]])

In [24]:
y_train_pred = model.predict(X_train)
y_train_pred

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
metrics.f1_score(y_train, y_train_pred)

0.6666666666666666

# Validation scoring

In [26]:
y_val_prob = model.predict_proba(X_val)
y_val_prob

array([[9.98847960e-01, 1.15204045e-03],
       [9.99750996e-01, 2.49004216e-04],
       [9.98911051e-01, 1.08894932e-03],
       ...,
       [9.99945189e-01, 5.48106535e-05],
       [9.99993967e-01, 6.03348276e-06],
       [9.99981450e-01, 1.85502567e-05]])

In [27]:
y_val_pred = model.predict(X_val)
y_val_pred

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
metrics.f1_score(y_val, y_val_pred)

0.6675531914893617

# Test scoring

In [29]:
y_test_prob = model.predict_proba(X_test)
y_test_prob

array([[4.77786027e-03, 9.95222140e-01],
       [9.99347876e-01, 6.52123608e-04],
       [9.99844708e-01, 1.55291754e-04],
       ...,
       [9.99732293e-01, 2.67707069e-04],
       [9.99945975e-01, 5.40248941e-05],
       [9.99952999e-01, 4.70011284e-05]])

In [30]:
y_test_pred = model.predict(X_test)
y_test_pred

array([1, 0, 0, ..., 0, 0, 0])

In [31]:
test_pred = pd.Series(y_test_pred, index=X_test.index, name=target)
test_pred

id
39162    1
39163    0
39164    0
39165    1
39166    0
        ..
52341    0
52342    0
52343    0
52344    0
52345    0
Name: buy, Length: 13184, dtype: int64

In [32]:
test_pred.to_csv("Outputs/test-cat-v3.csv")