In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from catboost import Pool, CatBoostClassifier
from tqdm import tqdm
root = Path("../input")

In [2]:
train = pd.read_csv(root.joinpath("train.csv"))
test = pd.read_csv(root.joinpath("test.csv"))

In [3]:
train_df = train.copy()
test_df = test.copy()
train_df.drop(columns=["ID_code", "target"], inplace=True)
test_df.drop(columns=["ID_code"], inplace=True)
target = train.target

In [4]:
 def augment_train(df_train, y_train):   
    t0 = df_train[y_train == 0].copy()
    t1 = df_train[y_train == 1].copy()
    i = 0
    N = 3
    for I in range(0):  # augment data into 2x
        for col in df_train.columns:
            i = i + 1000
            np.random.seed(i)
            np.random.shuffle(t0[col].values)
            np.random.shuffle(t1[col].values)
        df_train = pd.concat([df_train, t0.copy()])
        df_train = pd.concat([df_train, t1.copy()])
        y_train = pd.concat([y_train, pd.Series([0] * t0.shape[0]), pd.Series([1] * t1.shape[0])])
    return df_train, y_train

In [21]:
model = CatBoostClassifier(subsample=0.36, #rawdata 0.5  ×2 0.45 ×3 0.36
                            custom_loss='Logloss',
                           random_strength = 0,
                           max_depth=3,
                           eval_metric="AUC",
                           learning_rate=0.02,
                           iterations=60000,
                           #class_weights=[1,2],
                           bootstrap_type='Bernoulli',
                           #rsm=0.045,
                            l2_leaf_reg=0.3,
                           task_type="GPU",
                           random_seed=432013,
                           od_type="Iter",
                           border_count=128
                           #has_time= True 
                          )

In [6]:
 def run_cat(model,  trt, tst, tar,n_splits=5, plot=False):   
    kf = KFold(n_splits=n_splits, random_state=432013, shuffle=True)
    oof = np.zeros(len(trt))
    feature_importance_df = pd.DataFrame()
    y_valid_pred = 0 * tar
    y_test_pred = 0
    for n_fold, (train_index, valid_index) in enumerate(kf.split(trt, tar)):
        y_train, y_valid = tar.iloc[train_index], tar.iloc[valid_index]
        X_train, X_valid = trt.iloc[train_index,:], trt.iloc[valid_index,:]
        X_train, y_train = augment_train(X_train, y_train)
        X_train,X_valid = generate_fe(trn=X_train,tst=X_valid)
        _train = Pool(X_train, label=y_train)
        _valid = Pool(X_valid, label=y_valid)
        print( "Fold ", n_fold)
        fit_model = model.fit(_train,
                              verbose_eval=1000, 
                              early_stopping_rounds=1000,
                              eval_set=[_valid],
                              use_best_model=True,
                              plot=False,
                                            
                             )
        pred = fit_model.predict_proba(X_valid)[:,1]
        oof[valid_index] = pred
        print( "auc = ", roc_auc_score(y_valid, pred) )
        y_valid_pred.iloc[valid_index] = pred
        y_test_pred += fit_model.predict_proba(test_fe)[:,1]
    y_test_pred /= n_splits
    print("average auc:", roc_auc_score(tar, oof))
    return y_test_pred, oof

0.1： 

In [19]:
def generate_fe(trn, tst):
    #tst,target=augment_train(tst,y_train=target)
    real,syn = detect_test(test_df[features])
    al = pd.concat([trn,tst,test_df.iloc[real]],axis=0)
    for c in features:
        trn[c+"_test"]=trn[c].map(al[c].value_counts())
        trn[c+"_test"] = trn[c+"_test"]*trn[c]
        tst[c+"_test"]=tst[c].map(al[c].value_counts())
        tst[c+"_test"] = tst[c+"_test"]*tst[c]
    return trn, tst

In [8]:
features = [c for c in train_df.columns if c not in ["ID_code","target"]]

In [9]:
def detect_test(test_df):
    df_test=test_df.values
    unique_count = np.zeros_like(df_test)
    for feature in tqdm(range(df_test.shape[1])):
        _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
        unique_count[index_[count_ == 1], feature] += 1

    # Samples which have unique values are real the others are fake
    real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
    synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]
    return real_samples_indexes,synthetic_samples_indexes

In [20]:
def generate_fe_test(tst):
    re,sy =  detect_test(tst[features])
    al = pd.concat([train_df,test_df.iloc[re]],axis=0)
    for c in features:
        tst[c+"_test"]=tst[c].map(al[c].value_counts())
        tst[c+"_test"] = tst[c+"_test"]*tst[c]
    return tst
test_fe = generate_fe_test(test_df[features])

100%|██████████| 200/200 [00:05<00:00, 36.59it/s]


In [None]:
y_test_pred, oof = run_cat(model,train_df, test_df, target)#0.5619

100%|██████████| 200/200 [00:05<00:00, 36.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

Fold  0
0:	learn: 0.5888016	test: 0.5893505	best: 0.5893505 (0)	total: 10ms	remaining: 10m
1000:	learn: 0.8681041	test: 0.8593479	best: 0.8593479 (1000)	total: 9.92s	remaining: 9m 44s
2000:	learn: 0.8948845	test: 0.8832502	best: 0.8832502 (2000)	total: 19.8s	remaining: 9m 32s
3000:	learn: 0.9064035	test: 0.8932089	best: 0.8932089 (3000)	total: 29.1s	remaining: 9m 12s
4000:	learn: 0.9136930	test: 0.8990362	best: 0.8990362 (4000)	total: 38.7s	remaining: 9m 1s
5000:	learn: 0.9187568	test: 0.9027106	best: 0.9027106 (5000)	total: 48.5s	remaining: 8m 53s
6000:	learn: 0.9226040	test: 0.9051123	best: 0.9051130 (5999)	total: 57.9s	remaining: 8m 40s
7000:	learn: 0.9258427	test: 0.9065894	best: 0.9065926 (6997)	total: 1m 7s	remaining: 8m 28s
8000:	learn: 0.9285458	test: 0.9078759	best: 0.9078759 (8000)	total: 1m 16s	remaining: 8m 17s
9000:	learn: 0.9309670	test: 0.9088680	best: 0.9088685 (8999)	total: 1m 25s	remaining: 8m 6s
10000:	learn: 0.9332166	test: 0.9096599	best: 0.9096622 (9998)	total: 1m

100%|██████████| 200/200 [00:05<00:00, 38.02it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

Fold  1
0:	learn: 0.5838959	test: 0.5862864	best: 0.5862864 (0)	total: 11.6ms	remaining: 11m 37s
1000:	learn: 0.8674328	test: 0.8618434	best: 0.8618434 (1000)	total: 9.37s	remaining: 9m 12s
2000:	learn: 0.8935156	test: 0.8862798	best: 0.8862798 (2000)	total: 18.7s	remaining: 9m 1s
3000:	learn: 0.9058127	test: 0.8969876	best: 0.8969876 (3000)	total: 29.6s	remaining: 9m 22s
4000:	learn: 0.9129488	test: 0.9024207	best: 0.9024207 (4000)	total: 39.3s	remaining: 9m 9s
5000:	learn: 0.9178894	test: 0.9060916	best: 0.9060916 (5000)	total: 48.9s	remaining: 8m 57s
6000:	learn: 0.9218625	test: 0.9082207	best: 0.9082207 (6000)	total: 58s	remaining: 8m 41s
7000:	learn: 0.9251400	test: 0.9099163	best: 0.9099181 (6999)	total: 1m 7s	remaining: 8m 29s
8000:	learn: 0.9279084	test: 0.9112275	best: 0.9112275 (8000)	total: 1m 16s	remaining: 8m 16s
9000:	learn: 0.9303537	test: 0.9122301	best: 0.9122301 (9000)	total: 1m 25s	remaining: 8m 5s
10000:	learn: 0.9326256	test: 0.9129178	best: 0.9129202 (9989)	total:

100%|██████████| 200/200 [00:05<00:00, 38.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

In [None]:
submission = pd.read_csv(root.joinpath("sample_submission.csv"))
submission['target'] = y_test_pred
pd.Series(oof).to_csv("Cat_oof.csv", index = False)
submission.to_csv('submission_cb_light_0.8999.csv', index=False)