reference:  
https://www.kaggle.com/pointerfly/easy-catboost

+ CatBoost 算法在进行预测时可能会超出内存限制（memory limit）

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import warnings
import gc
import time
import itertools
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, log_loss, roc_auc_score

In [None]:
train_file = '../input/avazu-ctr-prediction/train.gz'
test_file = '../input/avazu-ctr-prediction/test.gz'

# Stream the training file in chunks of one million rows; the test file is
# read in one go.
reader = pd.read_csv(train_file, chunksize=10**6, iterator=True)
test = pd.read_csv(test_file, compression='gzip')

start = time.time()

# Build a class-balanced training set chunk by chunk: keep 65% of each chunk,
# then keep all of its clicks (positives) and an equally sized random sample
# of its non-clicks (negatives).
# The pieces are collected in a list and concatenated once after the loop;
# concatenating onto a growing DataFrame inside the loop re-copies every
# previously accumulated row on each iteration.
# (Original author's note: the result is roughly 8 million rows.)
balanced_parts = []
for i, chunk in enumerate(reader):
    chunk = chunk.sample(frac=.65, replace=False, random_state=516)
    pos_samp = chunk[chunk['click'] == 1]
    neg_pool = chunk[chunk['click'] == 0]
    # Never request more negatives than the chunk holds; sampling without
    # replacement would raise otherwise.
    neg_samp = neg_pool.sample(n=min(len(pos_samp), len(neg_pool)), random_state=2021)
    balanced_parts.extend([neg_samp, pos_samp])
    if i % 20 == 0:
        print('Processing Chunk No. {}'.format(i))

# Same row order as before: negatives then positives, chunk after chunk.
train = pd.concat(balanced_parts, axis=0)

print('the program costs %.2f seconds'%(time.time() - start))

# Drop the per-chunk pieces now that `train` owns its own copy of the rows.
del balanced_parts, chunk, pos_samp, neg_pool, neg_samp
gc.collect()

print('train has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
print('test has {} rows and {} columns'.format(test.shape[0], test.shape[1]))
train['click'].value_counts()

In [None]:
# Compare, column by column, how many distinct values (after casting to str)
# appear in the training set versus the test set.
# NOTE(review): a smaller count in test does not by itself prove that every
# test value also occurs in train; confirm with an explicit set comparison
# before relying on that conclusion.
def _distinct_count(frame, column):
    """Return the number of distinct string-cast values in frame[column]."""
    return len(frame[column].astype(str).value_counts())


for col in test.columns:
    n_train = _distinct_count(train, col)
    n_test = _distinct_count(test, col)
    print(col, 'train: {} test：{}'.format(n_train, n_test))

In [None]:
# Target is the `click` column (kept as a one-column DataFrame); the features
# are everything except `click` and `id`. The test set only loses `id`.
y_train = train[['click']]
x_train = train.drop(columns=['click', 'id'])
x_test = test.drop(columns=['id'])

# Hold out 20% of the rows for validation, stratified on the target.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.20,
    stratify=y_train,
    random_state=256,
)

# The full training frame is no longer referenced after the split.
del train
gc.collect()

x_train.info()

In [None]:
# Hyper-parameters for the GPU CatBoost classifier, gathered in one place.
catboost_params = {
    'iterations': 25,
    'learning_rate': 0.9,
    'task_type': 'GPU',
    'loss_function': 'Logloss',
    'gpu_ram_part': 0.9,
    'boosting_type': 'Plain',
    'max_ctr_complexity': 2,
    'depth': 6,
    'gpu_cat_features_storage': 'CpuPinnedMemory',
}
model = CatBoostClassifier(**catboost_params)

# Feature columns at positions 0..20 are passed to CatBoost as categorical.
cat_features = list(range(21))

# Train with the hold-out split as the evaluation set, logging every 10 iterations.
model.fit(
    x_train,
    y_train,
    eval_set=(x_val, y_val),
    cat_features=cat_features,
    verbose=10,
)

In [None]:
def plot_catboost(y_label, train_dir='./catboost_info'):
    '''Plot the CatBoost learning curve from the training logs.

    Reads `learn_error.tsv` and `test_error.tsv` from `train_dir` and plots
    the first logged metric of each against the iteration number.

    Parameters
    ----------
    y_label : str
        Label for the y axis (e.g. the metric name).
    train_dir : str, optional
        Directory holding the two `.tsv` log files. Defaults to
        './catboost_info', the path used previously.
    '''
    learn_error = pd.read_csv('{}/learn_error.tsv'.format(train_dir), sep='\t')
    test_error = pd.read_csv('{}/test_error.tsv'.format(train_dir), sep='\t')
    # Keep only the iteration column and the first metric column of the learn
    # log, plus the first metric column of the test log, so the three column
    # names below always match even when more metrics were logged.
    metric = pd.concat([learn_error.iloc[:, :2], test_error.iloc[:, 1]], axis=1)
    metric.columns = ['iterations','learn','test']
    plt.rcParams['figure.facecolor'] = 'white'
    metric.plot(x='iterations',y=['learn','test'])
    plt.ylabel(y_label)
    plt.show()

plot_catboost('logloss')

In [None]:
# NOTE(review): the `del` below is disabled, so x_train / y_train stay in
# memory; this cell only runs a garbage-collection pass.
#del x_train, y_train
gc.collect()

In [None]:
# get_best_iteration() returns a zero-based iteration index, whereas ntree_end
# is an exclusive upper bound on the trees used. Passing the index directly
# would drop the best tree itself, so use index + 1.
# If no best iteration is available (None), fall back to 0, which is
# CatBoost's default for ntree_end and means "use all trees".
best_iteration = model.get_best_iteration()
ntree_end = 0 if best_iteration is None else best_iteration + 1

# Class probabilities for the test set; column 1 is used later as P(click).
y_test = model.predict(x_test,
                       prediction_type='Probability',
                       ntree_start=0,
                       ntree_end=ntree_end,
                       thread_count=-1,
                       verbose=None)
gc.collect()

# Class probabilities for the hold-out validation set, using the same trees.
y_val_pred = model.predict(x_val,
                           prediction_type='Probability',
                           ntree_start=0,
                           ntree_end=ntree_end,
                           thread_count=-1,
                           verbose=None)

In [None]:
# Column 1 of the probability matrix is the predicted probability of a click.
val_click_proba = y_val_pred[:, 1]

# Hard class labels at a 0.5 threshold.
y_val_class = np.where(val_click_proba > 0.5, 1, 0)

# Score the hold-out predictions.
val_logloss = log_loss(y_val, val_click_proba)
val_auc = roc_auc_score(y_val, val_click_proba)
print('Out of folds logloss is {:.4f}'.format(val_logloss))
print('Out of folds roc_auc_score is {:.4f}'.format(val_auc))

In [None]:
# NOTE(review): the `del` below is disabled, so x_val stays in memory;
# this cell only runs a garbage-collection pass.
#del x_val
gc.collect()

In [None]:
# Take the ids from the sample submission and pair them with the predicted
# click probability (column 1 of y_test), then write the submission file.
sample_submission_path = '../input/avazu-ctr-prediction/sampleSubmission.gz'
submission = pd.read_csv(sample_submission_path, compression='gzip')
submission_ids = submission['id']
test_click_proba = y_test[:, 1]
submission = pd.DataFrame({'id': submission_ids, 'click': test_click_proba})
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix',cmap=plt.cm.Blues):
    '''Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array of counts, indexed as cm[true, predicted].
    classes   -- tick labels, one per class.
    normalize -- when True, divide each row by its row sum before
                 printing and plotting.
    title     -- plot title.
    cmap      -- matplotlib colormap for the heat map.
    '''
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    # Heat map with one tick per class on both axes.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes)
    plt.yticks(tick_positions, classes)

    # Write each cell's value on top of it; cells above half of the maximum
    # get white text, the others black.
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[row, col]
        text_color = "white" if value > half_max else "black"
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Draw the row-normalized confusion matrix of the hold-out predictions.
plt.rcParams['figure.facecolor'] = 'white'
class_names = ['0','1']
# The model being evaluated is the CatBoost classifier, so the title names it
# (the previous title said "Xgboost").
plot_confusion_matrix(confusion_matrix(y_val, y_val_class),
                      classes=class_names,
                      normalize=True,
                      title='Normalized Confusion Matrix: CatBoost')