In [1]:
!pip install optuna 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Scientific 
import numpy as np

# Machine learning

import xgboost as xgb
from sklearn.metrics import roc_auc_score,f1_score,confusion_matrix,accuracy_score

# Hyperparameters tuning
import optuna

In [3]:
# Load the training and test CSVs from /content and preview the first training rows.
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")
train.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,1,2021-01-01,1,2,2.0,2020-09-24,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2,2021-01-01,2,1,2.0,2020-09-19,1,0,1,0,0,0,1,0,0,0,0,0,0
2,3,2021-01-01,9,3,3.0,2021-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,2021-01-01,6,7,2.0,2017-10-04,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,2021-01-01,4,6,,2020-06-08,0,0,0,0,0,0,1,0,0,0,1,0,0


In [4]:
# Compare (rows, columns) of the two frames.
# NOTE(review): test has one column fewer — presumably the `buy` target; confirm.
train.shape,test.shape

((39161, 19), (13184, 18))

In [5]:
# Inspect the distinct values taken by user_activity_var_1 and user_activity_var_5.
train['user_activity_var_1'].unique(),train['user_activity_var_5'].unique()

(array([0, 1, 2, 3]), array([0, 1, 2]))

In [6]:
# Inspect the distinct values taken by user_activity_var_8 and user_activity_var_6.
train['user_activity_var_8'].unique(),train['user_activity_var_6'].unique()

(array([1, 0, 2]), array([0, 1, 2, 3]))

In [7]:
# List every column of the training frame before choosing the model features.
train.columns

Index(['id', 'created_at', 'campaign_var_1', 'campaign_var_2',
       'products_purchased', 'signup_date', 'user_activity_var_1',
       'user_activity_var_2', 'user_activity_var_3', 'user_activity_var_4',
       'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
       'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10',
       'user_activity_var_11', 'user_activity_var_12', 'buy'],
      dtype='object')

In [8]:
# Model features: the two campaign variables followed by user_activity_var_1 ... user_activity_var_12.
# The remaining columns (id, created_at, products_purchased, signup_date, buy) are left out.
feature_columns = ['campaign_var_1', 'campaign_var_2']
feature_columns += [f'user_activity_var_{idx}' for idx in range(1, 13)]
X = train[feature_columns]

In [9]:
# Target labels for the classifiers: the `buy` column of the training frame.
y = train['buy']

In [10]:
def objective(trial,data=X,target=y):
    """Optuna objective for XGBoost: fit with sampled hyperparameters, return hold-out F1.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : features to split into train/validation parts (defaults to the global ``X``).
    target : labels aligned with ``data`` (defaults to the global ``y``).

    Returns
    -------
    float
        F1 score of the fitted classifier on the 20% validation split.
    """
    # Stratify on the `target` argument rather than the global `y`, so the function
    # also works when called with data other than the defaults.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.2, random_state=42, stratify=target
    )
    param = {
        'tree_method': 'gpu_hist',  # train on the GPU to speed up each trial
        # suggest_float(..., log=True) is the non-deprecated spelling of suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        # Upper bound only: early stopping below decides how many trees are actually kept.
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        # Single-choice categorical: the seed is fixed but still recorded in the trial params.
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBClassifier(**param)

    # Stop adding trees once the validation metric has not improved for 100 rounds.
    model.fit(train_x,train_y,eval_set=[(valid_x,valid_y)],early_stopping_rounds=100,verbose=False)

    preds = model.predict(valid_x)

    return f1_score(valid_y, preds)

In [11]:
# Run the hyperparameter search for the XGBoost objective (maximising its returned score).
# The sampler is seeded so the sequence of sampled hyperparameters is repeatable across runs;
# TPESampler is also what Optuna uses when no sampler is given.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))

print('Best trial:', study.best_trial.params)

[32m[I 2022-06-05 15:16:30,988][0m A new study created in memory with name: no-name-5d29e51a-081b-4e45-8029-deea65559488[0m
[32m[I 2022-06-05 15:16:31,832][0m Trial 0 finished with value: 0.0 and parameters: {'lambda': 0.1401219180767386, 'alpha': 1.4080662224850853, 'colsample_bytree': 0.4, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 150}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-06-05 15:16:32,301][0m Trial 1 finished with value: 0.0 and parameters: {'lambda': 0.5792382461740426, 'alpha': 0.001992142525738094, 'colsample_bytree': 0.3, 'subsample': 0.7, 'learning_rate': 0.012, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 90}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-06-05 15:16:32,685][0m Trial 2 finished with value: 0.3505154639175258 and parameters: {'lambda': 0.10700644215687179, 'alpha': 1.0561568644917088, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.018, 'max_depth': 7

Number of finished trials: 100
Best trial: {'lambda': 0.4817771686936106, 'alpha': 0.01233924222140631, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.018, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 13}


In [12]:
# Stratified 80/20 split used to fit and score the final models.
# NOTE(review): test_size, random_state and stratification match the split inside the tuning
# objectives, so the scores below are measured on data that also guided hyperparameter selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42,stratify=y)

In [13]:
model = xgb.XGBClassifier( alpha=0.01233924222140631,
                            colsample_bytree= 0.9, 
                            subsample= 1.0, 
                            learning_rate= 0.018, max_depth= 17, 
                            random_state= 2020, min_child_weight=13)
%time model.fit(X_train,y_train)

CPU times: user 5.04 s, sys: 18.9 ms, total: 5.06 s
Wall time: 5.89 s


XGBClassifier(alpha=0.01233924222140631, colsample_bytree=0.9,
              learning_rate=0.018, max_depth=17, min_child_weight=13,
              random_state=2020, subsample=1.0)

In [14]:
# Class predictions of the XGBoost model on the hold-out split.
pred = model.predict(X_test)

In [15]:
# Hold-out accuracy of the XGBoost model.
accuracy_score(y_test, pred)

0.974466998595685

In [16]:
# Hold-out F1 of the XGBoost model (the metric the XGBoost tuning objective maximised).
f1_score(y_test, pred)

0.6784565916398714

# CatBoost Classifier: hyperparameter tuning

In [17]:
!pip install CatBoost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
from catboost import CatBoostClassifier

In [19]:
def objective(trial,data=X,target=y):
    """Optuna objective for CatBoost: fit with sampled hyperparameters, return hold-out accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : features to split into train/validation parts (defaults to the global ``X``).
    target : labels aligned with ``data`` (defaults to the global ``y``).

    Returns
    -------
    float
        Accuracy of the fitted classifier on the 20% validation split.
    """
    # Split the function's own arguments (not the globals X / y) so that callers passing
    # different data actually get it used; stratify on the same labels.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.2, random_state=42, stratify=target
    )

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    # These two options are only sampled for the bootstrap type they are paired with here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = CatBoostClassifier(**param)

    # Stop boosting once the validation metric has not improved for 100 rounds.
    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    # NOTE(review): this objective maximises accuracy while the XGBoost objective in this
    # notebook maximises F1 — confirm the two searches are meant to optimise different metrics.
    return accuracy_score(valid_y, pred_labels)

In [20]:
# Run the hyperparameter search for the CatBoost objective (`objective` as most recently defined).
# The sampler is seeded so the sequence of sampled hyperparameters is repeatable across runs;
# TPESampler is also what Optuna uses when no sampler is given.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))

print('Best trial:', study.best_trial.params)

[32m[I 2022-06-05 15:20:35,650][0m A new study created in memory with name: no-name-8a629d84-fb48-43f4-a904-e7611859c413[0m
[32m[I 2022-06-05 15:20:51,462][0m Trial 0 finished with value: 0.9745946636027065 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.04399207528468413, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9745946636027065.[0m
[32m[I 2022-06-05 15:21:16,076][0m Trial 1 finished with value: 0.9751053236307928 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.06660509682846195, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.740562594384812}. Best is trial 1 with value: 0.9751053236307928.[0m
[32m[I 2022-06-05 15:21:34,144][0m Trial 2 finished with value: 0.9749776586237712 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.0685502093695178, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample'

Number of finished trials: 100
Best trial: {'objective': 'Logloss', 'colsample_bylevel': 0.09411857312151965, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}


# CatBoost Classifier: final model

In [21]:
# Refit CatBoost on the training split using the best trial reported by the CatBoost study.
catboost_best_params = {
    'objective': 'Logloss',
    'colsample_bylevel': 0.09411857312151965,
    'depth': 12,
    'boosting_type': 'Plain',
    'bootstrap_type': 'MVS',
}
gbm = CatBoostClassifier(**catboost_best_params)

gbm.fit(X_train, y_train)

Learning rate set to 0.044843
0:	learn: 0.6359696	total: 4.86ms	remaining: 4.86s
1:	learn: 0.5858363	total: 21ms	remaining: 10.5s
2:	learn: 0.5340733	total: 30.9ms	remaining: 10.3s
3:	learn: 0.4952985	total: 42.2ms	remaining: 10.5s
4:	learn: 0.4549613	total: 45.9ms	remaining: 9.13s
5:	learn: 0.4172103	total: 59.8ms	remaining: 9.91s
6:	learn: 0.3914779	total: 64.8ms	remaining: 9.19s
7:	learn: 0.3687688	total: 78.7ms	remaining: 9.76s
8:	learn: 0.3417559	total: 91ms	remaining: 10s
9:	learn: 0.3242768	total: 104ms	remaining: 10.3s
10:	learn: 0.3088440	total: 110ms	remaining: 9.92s
11:	learn: 0.2862951	total: 126ms	remaining: 10.4s
12:	learn: 0.2651894	total: 134ms	remaining: 10.1s
13:	learn: 0.2553128	total: 137ms	remaining: 9.63s
14:	learn: 0.2465593	total: 142ms	remaining: 9.3s
15:	learn: 0.2369225	total: 146ms	remaining: 8.98s
16:	learn: 0.2286891	total: 151ms	remaining: 8.73s
17:	learn: 0.2181641	total: 155ms	remaining: 8.47s
18:	learn: 0.2129031	total: 161ms	remaining: 8.29s
19:	learn

<catboost.core.CatBoostClassifier at 0x7fad9057b790>

In [22]:
# Class predictions of the CatBoost model on the hold-out split.
# This rebinds `pred`, replacing the XGBoost predictions stored under the same name.
pred = gbm.predict(X_test)

In [23]:
# Hold-out accuracy of the CatBoost model.
accuracy_score(y_test, pred)

0.9757436486659007

In [24]:
# Hold-out F1 of the CatBoost model.
f1_score(y_test, pred)

0.6964856230031949

# Test Data

In [25]:
# Select the submission features by reusing the training frame's column list, so the test
# matrix always has exactly the same features, in the same order, as the data the models saw.
Xtest = test[list(X.columns)]

In [26]:
# Predict labels for the submission features with `model`, the XGBoost classifier.
# NOTE(review): in the recorded run the CatBoost model `gbm` had the higher hold-out F1
# (0.696 vs 0.678) — confirm that XGBoost is the intended submission model.
prediction = model.predict(Xtest)
prediction

array([1, 0, 0, ..., 0, 0, 0])

In [27]:
# Submission table: the test ids alongside the predicted `buy` label for each row.
solution = test[['id']].assign(buy=prediction)
solution.head()

Unnamed: 0,id,buy
0,39162,1
1,39163,0
2,39164,0
3,39165,1
4,39166,0


In [28]:
# Sanity check: how many rows were predicted as each class.
solution['buy'].value_counts()

0    12861
1      323
Name: buy, dtype: int64

In [30]:
# Write the submission file to the current working directory, without the DataFrame index.
solution.to_csv("Solution.csv",index=False)