In [1]:
import xgboost as xgb

In [2]:
import pandas as pd

In [3]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [4]:
from scipy import stats
from scipy.stats import randint

In [5]:
data_path = '..\\data\\'

In [6]:
dataset = pd.read_csv(data_path + 'train_val_dataset.csv')

In [7]:
dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8474661 entries, 0 to 8474660
Data columns (total 11 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   user_id                8474661 non-null  int64  
 1   product_id             8474661 non-null  int64  
 2   ipu_total_bought       8474661 non-null  int64  
 3   uxp_reorder_ratio      8474661 non-null  float64
 4   times_last5            8474661 non-null  float64
 5   u_total_orders         8474661 non-null  int64  
 6   u_reordered_ratio      8474661 non-null  float64
 7   last_order_size        8474661 non-null  int64  
 8   items_total_purchases  8474661 non-null  int64  
 9   items_reorder_ratio    8474661 non-null  float64
 10  reordered              8474661 non-null  float64
dtypes: float64(5), int64(6)
memory usage: 711.2 MB


In [8]:
dataset.head()

Unnamed: 0,user_id,product_id,ipu_total_bought,uxp_reorder_ratio,times_last5,u_total_orders,u_reordered_ratio,last_order_size,items_total_purchases,items_reorder_ratio,reordered
0,1,196,10,1.0,5.0,10,0.694915,9,35791,0.77648,1.0
1,1,10258,9,1.0,5.0,10,0.694915,9,1946,0.713772,1.0
2,1,10326,1,0.166667,0.0,10,0.694915,9,5526,0.652009,0.0
3,1,12427,10,1.0,5.0,10,0.694915,9,6476,0.740735,0.0
4,1,13032,3,0.333333,2.0,10,0.694915,9,3751,0.657158,1.0


In [9]:
dataset = dataset.drop(['user_id', 'product_id'], axis=1)

Разбиение датасета на обучение+валидацию и тест

In [10]:
X_train_val, X_holdout, y_train_val, y_holdout = train_test_split(dataset.drop(['reordered'], axis=1),
                                                                          dataset['reordered'].values,
                                                                          test_size=0.2,
                                                                          shuffle=True,
                                                                          random_state=42,
                                                                          stratify=dataset['reordered'].values)

In [11]:
features = dataset.drop(['reordered'], axis=1).columns.values

Подбор гиперпараметров случайным поиском на кросс-валидации

In [None]:
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
param_dist = {'n_estimators': stats.randint(150, 1000),
              'learning_rate': stats.uniform(0.01, 0.6),
              'subsample': stats.uniform(0.3, 0.9),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.9),
              'min_child_weight': [1, 2, 3, 4]
             }

cv_obj = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

clf = RandomizedSearchCV(clf_xgb, param_distributions = param_dist, 
                         n_iter = 5, scoring = 'f1', error_score = 0, 
                         verbose = 3, n_jobs = -1)
clf.fit(X_train_val[features], y_train_val)
print(clf.best_score_)
print(clf.best_params_)

Обучение модели

In [13]:
parameters = {'n_estimators': '200',
              'eval_metric':'logloss', 
              'max_depth':'5', 
              'colsample_bytree':'0.4',
              'subsample':'0.75',
              'learning_rate': '0.1',
              'min_child_weight': '2'
             }

xgbc = xgb.XGBClassifier(objective='binary:logistic', parameters=parameters, num_boost_round=10)

model = xgbc.fit(X_train_val, y_train_val)
y_pred_train = model.predict(X_train_val[features])
precision_value_tr = precision_score(y_train_val, y_pred_train)
recall_value_tr = recall_score(y_train_val, y_pred_train)
f1_score_value_tr = f1_score(y_train_val, y_pred_train)

y_pred_proba = model.predict_proba(X_holdout[features])
y_pred = model.predict(X_holdout[features])

precision_value = precision_score(y_holdout, y_pred)
recall_value = recall_score(y_holdout, y_pred)
f1_score_value = f1_score(y_holdout, y_pred)
roc_auc_score_value = roc_auc_score(y_holdout, y_pred_proba[:, 1])

print('Precision train = {0:.2f}; hold_out = {1:.2f}'.format(precision_value_tr, precision_value))
print('Recall train = {0:.2f}; hold_out = {1:.2f}'.format(recall_value_tr, recall_value))
print('F1 train = {0:.2f}; hold_out = {1:.2f}'.format(f1_score_value_tr, f1_score_value))
print('ROC_AUC = {0:.2f}'.format(roc_auc_score_value))



Parameters: { "num_boost_round", "parameters" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Precision train = 0.64; hold_out = 0.63
Recall train = 0.17; hold_out = 0.17
F1 train = 0.27; hold_out = 0.27
ROC_AUC = 0.83


Получение скора для тренировочной выборки

In [18]:
y_pred_proba_train = model.predict_proba(X_train_val[features])

Подбор порога опредления границы класса

In [20]:
for t in np.linspace(0.1, 0.9, num=9):
    y_pred_train = [1 if x >= t else 0 for x in y_pred_proba_train[:, 1]]
    precision_value_tr = precision_score(y_train_val, y_pred_train)
    recall_value_tr = recall_score(y_train_val, y_pred_train)
    f1_score_value_tr = f1_score(y_train_val, y_pred_train)
    
    y_pred = [1 if x >= t else 0 for x in y_pred_proba[:, 1]]
    precision_value = precision_score(y_holdout, y_pred)
    recall_value = recall_score(y_holdout, y_pred)
    f1_score_value = f1_score(y_holdout, y_pred)
    print('T={}'.format(t))
    print('Precision train = {0:.2f}; hold_out = {1:.2f}'.format(precision_value_tr, precision_value))
    print('Recall train = {0:.2f}; hold_out = {1:.2f}'.format(recall_value_tr, recall_value))
    print('F1 train = {0:.2f}; hold_out = {1:.2f}'.format(f1_score_value_tr, f1_score_value))
    print('===========')

T=0.1
Precision train = 0.25; hold_out = 0.25
Recall train = 0.74; hold_out = 0.74
F1 train = 0.38; hold_out = 0.37
T=0.2
Precision train = 0.37; hold_out = 0.37
Recall train = 0.52; hold_out = 0.52
F1 train = 0.43; hold_out = 0.43
T=0.30000000000000004
Precision train = 0.47; hold_out = 0.47
Recall train = 0.37; hold_out = 0.37
F1 train = 0.42; hold_out = 0.41
T=0.4
Precision train = 0.56; hold_out = 0.56
Recall train = 0.26; hold_out = 0.25
F1 train = 0.35; hold_out = 0.35
T=0.5
Precision train = 0.64; hold_out = 0.63
Recall train = 0.17; hold_out = 0.17
F1 train = 0.27; hold_out = 0.27
T=0.6
Precision train = 0.71; hold_out = 0.71
Recall train = 0.10; hold_out = 0.10
F1 train = 0.18; hold_out = 0.18
T=0.7000000000000001
Precision train = 0.79; hold_out = 0.78
Recall train = 0.05; hold_out = 0.05
F1 train = 0.10; hold_out = 0.09
T=0.8
Precision train = 0.86; hold_out = 0.84
Recall train = 0.02; hold_out = 0.02
F1 train = 0.03; hold_out = 0.03
T=0.9
Precision train = 0.97; hold_out = 

Обучение модели на всем тренировочном наборе

In [21]:
parameters = {'n_estimators': '200',
              'eval_metric':'logloss', 
              'max_depth':'5', 
              'colsample_bytree':'0.4',
              'subsample':'0.75',
              'learning_rate': '0.1',
              'min_child_weight': '2'
             }

xgbc = xgb.XGBClassifier(objective='binary:logistic', parameters=parameters, num_boost_round=10)

In [22]:
model = xgbc.fit(dataset.drop(['reordered'], axis=1), dataset['reordered'].values)



Parameters: { "num_boost_round", "parameters" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [23]:
test_dataset = pd.read_csv(data_path + 'test_dataset.csv')

In [25]:
y_pred_proba_test = model.predict_proba(test_dataset[features])

In [26]:
y_pred_test = [1 if x >= 0.25 else 0 for x in y_pred_proba_test[:, 1]]

In [27]:
test_dataset['prediction'] = y_pred_test

In [28]:
test_dataset.head()

Unnamed: 0,user_id,product_id,ipu_total_bought,uxp_reorder_ratio,times_last5,u_total_orders,u_reordered_ratio,last_order_size,items_total_purchases,items_reorder_ratio,prediction
0,3,248,1,0.090909,0.0,12,0.625,6,6371,0.400251,0
1,3,1005,1,0.333333,1.0,12,0.625,6,463,0.440605,0
2,3,1819,3,0.333333,0.0,12,0.625,6,2424,0.492162,0
3,3,7503,1,0.1,0.0,12,0.625,6,12474,0.553551,0
4,3,8021,1,0.090909,0.0,12,0.625,6,27864,0.591157,0


In [29]:
submit_data = test_dataset[['product_id', 'user_id', 'prediction']]

In [30]:
orders = pd.read_csv(data_path + 'orders.csv' )

In [31]:
orders_test = orders.loc[orders.eval_set=='test',("user_id", "order_id") ]
orders_test.head()

Unnamed: 0,user_id,order_id
38,3,2774568
44,4,329954
53,6,1528013
96,11,1376945
102,12,1356845


In [32]:
submit_data = submit_data.merge(orders_test, on='user_id', how='left')
submit_data.head()

Unnamed: 0,product_id,user_id,prediction,order_id
0,248,3,0,2774568
1,1005,3,0,2774568
2,1819,3,0,2774568
3,7503,3,0,2774568
4,8021,3,0,2774568


In [33]:
submit_data = submit_data.drop('user_id', axis=1)

In [34]:
submit_data['product_id'] = submit_data['product_id'].astype(int)

In [35]:
submit_data.head()

Unnamed: 0,product_id,prediction,order_id
0,248,0,2774568
1,1005,0,2774568
2,1819,0,2774568
3,7503,0,2774568
4,8021,0,2774568


Формирование сабмита

In [37]:
d = dict()
for row in submit_data.itertuples():
    if row.prediction== 1:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in submit_data.order_id:
    if order not in d:
        d[order] = 'None'

In [38]:
sub = pd.DataFrame.from_dict(d, orient='index')

#Reset index
sub.reset_index(inplace=True)
#Set column names
sub.columns = ['order_id', 'products']

sub.head()

Unnamed: 0,order_id,products
0,2774568,17668 18599 21903 39190 43961 47766
1,1528013,21903
2,1376945,8309 13176 14947 27959 28465 34658 35948 44632
3,1356845,7076 10863 13176 14992
4,2161313,196 12427 14715 27839 37710


In [39]:
sub.shape

(75000, 2)

In [40]:
sub.to_csv(data_path + 'submissions\\xgb.csv', index=False)

Полученный результат этого сабмита: 0.36653 Public; 0.36335 Private