In [1]:
print(open("run_mice.R").read())

library(mice)

args <- commandArgs(trailingOnly=TRUE)
infile = args[1]
outfile = args[2]

data_with_nan <- read.csv(infile, sep=",")
data <- complete(mice(data_with_nan, printFlag=FALSE, method = "cart"))

write.csv(data, file=outfile, row.names=FALSE, col.names=FALSE, sep=",")



### Note: use read.table and write.table instead of read.csv and write.csv.

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from imblearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB


In [15]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

X = df_train.drop(['target', 'id'], axis=1)
y = df_train['target']

X_test = df_test.drop('id', axis=1)



In [4]:
hex_columns = ['f2', 'f3', 'f13', 'f18', 'f20', 'f26']
ordinal_columns = ['f6', 'f4', 'f8', 'f16', 'f17', 'f19', 'f21', 'f25']
categorical_columns = ['f1', 'f5', 'f7', 'f9', 'f11', 'f24']
ordinal_cat_columns = ['f0', 'f12', 'f23', 'f27']
binary_columns = ['f10', 'f28', 'f22', 'f14']
all_columns = ['f' + str(i) for i in range(0, 29)]
removed_cols = []

In [17]:
def conv_hex(df: pd.DataFrame) -> None:
    def conv_hex_map(x):
        try:
            return int(x, 16)
        except ValueError as e:
            return np.nan
        except TypeError as e:
            return np.nan

    for col in hex_columns:
        if col not in list(df.columns):
            continue
        col_loc = list(df.columns).index(col)
        df[col] = df[col].apply(lambda x: conv_hex_map(x))


def conv_bool(df: pd.DataFrame):
    def conv_bool_map(x):
        try:
            if not type(x) == str:
                return x
            if x.lower() == 'f':
                return 0
            elif x.lower() == 't':
                return 1
            return x
        except Exception as e:
            return np.nan
    df['f14'] = df['f14'].apply(lambda x: conv_bool_map(x))


def conv_binary(df: pd.DataFrame):
    def binaryToDecimal(binary):
        try:
            binary = int(binary)
        except:
            return np.nan

        binary1 = binary
        decimal, i, n = 0, 0, 0
        while(binary != 0):
            dec = binary % 10
            decimal = decimal + dec * pow(2, i)
            binary = binary//10
            i += 1
        return decimal
    for col in binary_columns:
        if col not in list(df.columns):
            continue
    df[col] = df[col].apply(lambda x: binaryToDecimal(x))
    
def remove_duplicate_columns(df: pd.DataFrame) -> None:
    cols_to_drop = []
    cols = list(df.columns)
    for col in df.columns:
        if col in cols:
            cols.remove(col)
        for col2 in cols:
            if df[col].equals(df[col2]):
                cols_to_drop.append(col2)

    df.drop(columns=cols_to_drop, inplace=True)

def fix_f9(df: pd.DataFrame) -> None:
    def test(x : str):
        if type(x) != str:
            return x
        if len(x) != 2:
            return x
        return x[0] + " " + x[1]

    df['f9'] = df['f9'].apply(lambda x: test(x))
    df[['f9_1', 'f9_2']] = df['f9'].str.split(' ', 1, expand=True)

    df.drop(columns=['f9'], inplace=True)

    
def transform_categorical(df: pd.DataFrame, test=False) -> None:
    cols = list(set(df.columns).intersection(set(categorical_columns).union(set(ordinal_cat_columns)))) + ['f9_1', 'f9_2']
    for col in cols:
        df[col] = df[col].astype('category')
        df[col] = df[col].cat.codes

def remove_duplicates(df: pd.DataFrame) -> None:
    df.drop_duplicates(inplace=True)
    
def conv_columns(df: pd.DataFrame, test=False) -> None:
    conv_hex(df)
    conv_bool(df)
    conv_binary(df)
    fix_f9(df)
    return transform_categorical(df, test)

In [17]:
def run_mice(infile, outfile):
    # np.savetxt(infile, data.to_numpy().tolist(), delimiter=",")
    os.system('"C:\Program Files\R\R-4.1.1\bin\Rscript.exe" --vanilla run_mice.R %s %s' % (infile, outfile))
    data_imputed = pd.read_csv(outfile)
    print(data_imputed.head())

In [6]:
def preprocess(df, test=False):
    if test:
        df.loc[26648, 'f9'] = np.nan
        df.loc[20956, 'f15'] = np.nan
        df.loc[21034, 'f15'] = np.nan
        
    remove_duplicate_columns(df)
    remove_duplicates(df)

    conv_columns(df, test)
    
    if test:
        df.to_csv('./test_processed_f9_split.csv', index=False)
    else:
        df.to_csv('./train_processed_f9_split.csv', index=False)

In [18]:
preprocess(df_train)
preprocess(df_test, True)

In [20]:
df_test.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f21,f22,f23,f24,f25,f26,f27,f28,f9_1,f9_2
0,50000,23,-1,44081270000.0,67744950000.0,7.0036,0,0.0,2,5.0,...,3.0,1.0,4,0,104.0,,14,1.0,18,49
1,50001,2,1,,24545580000.0,6.952,0,0.0,2,4.0,...,1.0,0.0,4,0,20.0,12951280000.0,7,0.0,1,11
2,50002,16,1,56770980000.0,29071570000.0,6.8457,0,0.0,2,1.0,...,2.0,0.0,5,0,28.0,8959571000.0,10,0.0,19,25
3,50003,7,-1,67501890000.0,29509480000.0,6.9464,1,0.0,2,4.0,...,2.0,1.0,4,1,64.0,7502526000.0,0,1.0,0,6
4,50004,2,-1,34496320000.0,34981760000.0,7.24,0,1.0,1,1.0,...,1.0,1.0,0,0,76.0,35832720000.0,3,1.0,17,49


In [18]:
run_mice('./train_processed_f9_split.csv', './train_imputed_f9_split.csv')
run_mice('./test_processed_f9_split.csv', './test_imputed_f9_split.csv')

FileNotFoundError: [Errno 2] No such file or directory: './train_imputed_f9_split.csv'

In [None]:
round(100*(X_test_pre.isnull().sum()/len(X_test_pre.index)),2)

In [None]:
from imblearn.over_sampling import SMOTE

for col in X_train_pre:
    if X_train_pre[col].isnull().sum() > 0:
        X_train_pre[col].fillna(X_train_pre[col].mean(), inplace=True)

sm = SMOTE(sampling_strategy='minority', random_state=7)
X_cols = X_train_pre.columns

X_train_pre, y_train_pre = sm.fit_resample(X_train_pre, y_train)

X_train = pd.DataFrame(X_train, columns=X_cols)

y_train.value_counts()

In [None]:
X_test.head()

In [23]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def split(X : pd.DataFrame, y : pd.DataFrame, size=.2):
    return train_test_split(X, y, test_size=size, stratify=y)

def fit(data : pd.DataFrame, labels : pd.DataFrame, classifier : any, eval_pool = None) -> None:
    if eval_pool is not None:
        classifier.fit(data, labels, eval_set=eval_pool)
    else:
        classifier.fit(data, labels)

def predict_proba(classifier : any, test : pd.DataFrame) -> np.array:
    return classifier.predict_proba(test)

def predict(classifier : any, test : pd.DataFrame) -> np.array:
    return classifier.predict(test)

def print_accuracy(pred : np.array, test : pd.DataFrame, name : str):
    print(name + ' Model accuracy score: {0:0.4f}'.format(roc_auc_score(test, pred[:, 1])))

In [42]:
imp_data = pd.read_csv('train_imputed_f9_split.csv')

X_train_pre = imp_data.drop(['target', 'id'], axis=1)
y_train = imp_data['target']

df_test = pd.read_csv('test_imputed_f9_split.csv')
X_test_pre = df_test.drop('id', axis=1)

train_cat = list(set(X_train_pre.columns).intersection(set(categorical_columns)))

for col in X_train_pre:
    if X_train_pre[col].isnull().sum() > 0:
        X_train_pre[col].fillna(method='bfill', inplace=True)
    
    # if col in train_cat:
        # X_train_pre[col] = X_train_pre[col].astype('category')
        
for col in X_test_pre:
    if X_test_pre[col].isnull().sum() > 0:
        X_test_pre[col].fillna(method='bfill', inplace=True)
    
    # if col in train_cat:
        # X_train_pre[col] = X_train_pre[col].astype('category')
        
remove_duplicate_columns(X_train_pre)
remove_duplicate_columns(X_test_pre)

scaler = StandardScaler()

cols_to_transform = hex_columns # list(set(X_train_pre.columns) - set(train_cat))

X_train_pre[cols_to_transform] = scaler.fit_transform(X_train_pre[cols_to_transform], y_train)
X_test_pre[cols_to_transform] = scaler.transform(X_test_pre[cols_to_transform])

X_train_split, X_test_split, y_train_split, y_test_split = split(X_train_pre, y_train, size=.01)
        
# X_train_pre.head()

In [50]:
from catboost import CatBoostClassifier, Pool

train_cat = list(set(X_train_split.columns).intersection(set(categorical_columns)))
test_cat = list(set(X_test_split.columns).intersection(set(categorical_columns)))
cats = []
for col in train_cat:
    cats.append(X_train_pre.columns.get_loc(col))
    
print(cats)
print(train_cat)
print(test_cat)

train_dataset = Pool(X_train_split ,y_train_split, cat_features=train_cat)
test_dataset = Pool(X_test_split, y_test_split, cat_features=test_cat)

# model = CatBoostClassifier(loss_function='Logloss', eval_metric='AUC')

eval_pool = Pool(X_test_split, y_test_split)

# clf = CatBoostClassifier(loss_function='Logloss',cat_features=cats,eval_metric= 'AUC',depth= 1,learning_rate= 1,l2_leaf_reg= 5,iterations= 2000)

clf = CatBoostClassifier(depth=1, learning_rate=1, iterations=2000, l2_leaf_reg = 1e-20, leaf_estimation_iterations=10, loss_function= 'CrossEntropy')

fit(X_train_pre, y_train, clf, eval_pool)

[7, 5, 1, 21]
['f7', 'f5', 'f1', 'f24']
['f7', 'f5', 'f1', 'f24']
0:	learn: 0.4671843	test: 0.4765326	best: 0.4765326 (0)	total: 4.84ms	remaining: 9.68s
1:	learn: 0.4615264	test: 0.4698634	best: 0.4698634 (1)	total: 9.58ms	remaining: 9.57s
2:	learn: 0.4566704	test: 0.4694074	best: 0.4694074 (2)	total: 13.9ms	remaining: 9.27s
3:	learn: 0.4533589	test: 0.4643557	best: 0.4643557 (3)	total: 18.7ms	remaining: 9.32s
4:	learn: 0.4488103	test: 0.4533725	best: 0.4533725 (4)	total: 23.2ms	remaining: 9.25s
5:	learn: 0.4457836	test: 0.4473172	best: 0.4473172 (5)	total: 27.6ms	remaining: 9.18s
6:	learn: 0.4430572	test: 0.4452849	best: 0.4452849 (6)	total: 32.4ms	remaining: 9.22s
7:	learn: 0.4408195	test: 0.4465044	best: 0.4452849 (6)	total: 37ms	remaining: 9.21s
8:	learn: 0.4386527	test: 0.4386323	best: 0.4386323 (8)	total: 41.6ms	remaining: 9.2s
9:	learn: 0.4373696	test: 0.4390536	best: 0.4386323 (8)	total: 46ms	remaining: 9.15s
10:	learn: 0.4361180	test: 0.4361678	best: 0.4361678 (10)	total: 50.5

111:	learn: 0.4136418	test: 0.3993942	best: 0.3991331 (94)	total: 511ms	remaining: 8.62s
112:	learn: 0.4135998	test: 0.4000829	best: 0.3991331 (94)	total: 516ms	remaining: 8.61s
113:	learn: 0.4135636	test: 0.3998795	best: 0.3991331 (94)	total: 520ms	remaining: 8.61s
114:	learn: 0.4135201	test: 0.3994968	best: 0.3991331 (94)	total: 526ms	remaining: 8.61s
115:	learn: 0.4134737	test: 0.3999910	best: 0.3991331 (94)	total: 531ms	remaining: 8.62s
116:	learn: 0.4134377	test: 0.3995758	best: 0.3991331 (94)	total: 535ms	remaining: 8.61s
117:	learn: 0.4134052	test: 0.3993023	best: 0.3991331 (94)	total: 540ms	remaining: 8.61s
118:	learn: 0.4133652	test: 0.3983792	best: 0.3983792 (118)	total: 545ms	remaining: 8.62s
119:	learn: 0.4133266	test: 0.3982502	best: 0.3982502 (119)	total: 550ms	remaining: 8.62s
120:	learn: 0.4132855	test: 0.3993797	best: 0.3982502 (119)	total: 556ms	remaining: 8.63s
121:	learn: 0.4132238	test: 0.3983976	best: 0.3982502 (119)	total: 561ms	remaining: 8.63s
122:	learn: 0.413

219:	learn: 0.4101867	test: 0.3954340	best: 0.3952665 (217)	total: 1.06s	remaining: 8.56s
220:	learn: 0.4101647	test: 0.3951333	best: 0.3951333 (220)	total: 1.06s	remaining: 8.55s
221:	learn: 0.4101455	test: 0.3951510	best: 0.3951333 (220)	total: 1.07s	remaining: 8.55s
222:	learn: 0.4101297	test: 0.3952528	best: 0.3951333 (220)	total: 1.07s	remaining: 8.55s
223:	learn: 0.4101144	test: 0.3956094	best: 0.3951333 (220)	total: 1.08s	remaining: 8.55s
224:	learn: 0.4100876	test: 0.3951683	best: 0.3951333 (220)	total: 1.08s	remaining: 8.55s
225:	learn: 0.4100717	test: 0.3946292	best: 0.3946292 (225)	total: 1.09s	remaining: 8.55s
226:	learn: 0.4100535	test: 0.3948411	best: 0.3946292 (225)	total: 1.09s	remaining: 8.55s
227:	learn: 0.4100381	test: 0.3948051	best: 0.3946292 (225)	total: 1.1s	remaining: 8.55s
228:	learn: 0.4100080	test: 0.3948835	best: 0.3946292 (225)	total: 1.1s	remaining: 8.55s
229:	learn: 0.4099912	test: 0.3947469	best: 0.3946292 (225)	total: 1.11s	remaining: 8.55s
230:	learn: 

324:	learn: 0.4082615	test: 0.3954414	best: 0.3937056 (280)	total: 1.6s	remaining: 8.25s
325:	learn: 0.4082504	test: 0.3954001	best: 0.3937056 (280)	total: 1.61s	remaining: 8.25s
326:	learn: 0.4082316	test: 0.3953849	best: 0.3937056 (280)	total: 1.61s	remaining: 8.24s
327:	learn: 0.4082176	test: 0.3950122	best: 0.3937056 (280)	total: 1.62s	remaining: 8.24s
328:	learn: 0.4082030	test: 0.3950775	best: 0.3937056 (280)	total: 1.62s	remaining: 8.23s
329:	learn: 0.4081902	test: 0.3950778	best: 0.3937056 (280)	total: 1.63s	remaining: 8.23s
330:	learn: 0.4081719	test: 0.3950799	best: 0.3937056 (280)	total: 1.63s	remaining: 8.22s
331:	learn: 0.4081567	test: 0.3954313	best: 0.3937056 (280)	total: 1.64s	remaining: 8.21s
332:	learn: 0.4081417	test: 0.3952925	best: 0.3937056 (280)	total: 1.64s	remaining: 8.21s
333:	learn: 0.4081294	test: 0.3955363	best: 0.3937056 (280)	total: 1.64s	remaining: 8.2s
334:	learn: 0.4081206	test: 0.3955550	best: 0.3937056 (280)	total: 1.65s	remaining: 8.2s
335:	learn: 0

435:	learn: 0.4069315	test: 0.3943566	best: 0.3937056 (280)	total: 2.15s	remaining: 7.71s
436:	learn: 0.4069237	test: 0.3945807	best: 0.3937056 (280)	total: 2.15s	remaining: 7.7s
437:	learn: 0.4069173	test: 0.3946579	best: 0.3937056 (280)	total: 2.16s	remaining: 7.7s
438:	learn: 0.4069080	test: 0.3948256	best: 0.3937056 (280)	total: 2.16s	remaining: 7.7s
439:	learn: 0.4068998	test: 0.3945363	best: 0.3937056 (280)	total: 2.17s	remaining: 7.69s
440:	learn: 0.4068889	test: 0.3942087	best: 0.3937056 (280)	total: 2.17s	remaining: 7.69s
441:	learn: 0.4068798	test: 0.3941548	best: 0.3937056 (280)	total: 2.18s	remaining: 7.68s
442:	learn: 0.4068693	test: 0.3944658	best: 0.3937056 (280)	total: 2.18s	remaining: 7.68s
443:	learn: 0.4068579	test: 0.3943981	best: 0.3937056 (280)	total: 2.19s	remaining: 7.67s
444:	learn: 0.4068475	test: 0.3939635	best: 0.3937056 (280)	total: 2.19s	remaining: 7.67s
445:	learn: 0.4068396	test: 0.3939341	best: 0.3937056 (280)	total: 2.2s	remaining: 7.67s
446:	learn: 0.

545:	learn: 0.4059147	test: 0.3935346	best: 0.3929903 (535)	total: 2.69s	remaining: 7.18s
546:	learn: 0.4059080	test: 0.3933850	best: 0.3929903 (535)	total: 2.7s	remaining: 7.17s
547:	learn: 0.4059002	test: 0.3934875	best: 0.3929903 (535)	total: 2.71s	remaining: 7.17s
548:	learn: 0.4058912	test: 0.3935124	best: 0.3929903 (535)	total: 2.71s	remaining: 7.17s
549:	learn: 0.4058844	test: 0.3933132	best: 0.3929903 (535)	total: 2.72s	remaining: 7.16s
550:	learn: 0.4058761	test: 0.3931267	best: 0.3929903 (535)	total: 2.72s	remaining: 7.16s
551:	learn: 0.4058693	test: 0.3931577	best: 0.3929903 (535)	total: 2.73s	remaining: 7.15s
552:	learn: 0.4058590	test: 0.3930794	best: 0.3929903 (535)	total: 2.73s	remaining: 7.15s
553:	learn: 0.4058488	test: 0.3927280	best: 0.3927280 (553)	total: 2.74s	remaining: 7.15s
554:	learn: 0.4058342	test: 0.3930995	best: 0.3927280 (553)	total: 2.74s	remaining: 7.14s
555:	learn: 0.4058261	test: 0.3932798	best: 0.3927280 (553)	total: 2.75s	remaining: 7.14s
556:	learn:

656:	learn: 0.4050695	test: 0.3930798	best: 0.3926669 (600)	total: 3.24s	remaining: 6.63s
657:	learn: 0.4050633	test: 0.3930725	best: 0.3926669 (600)	total: 3.25s	remaining: 6.62s
658:	learn: 0.4050578	test: 0.3932129	best: 0.3926669 (600)	total: 3.25s	remaining: 6.62s
659:	learn: 0.4050508	test: 0.3932412	best: 0.3926669 (600)	total: 3.26s	remaining: 6.61s
660:	learn: 0.4050434	test: 0.3929458	best: 0.3926669 (600)	total: 3.26s	remaining: 6.61s
661:	learn: 0.4050375	test: 0.3928418	best: 0.3926669 (600)	total: 3.27s	remaining: 6.6s
662:	learn: 0.4050307	test: 0.3927770	best: 0.3926669 (600)	total: 3.27s	remaining: 6.6s
663:	learn: 0.4050230	test: 0.3927647	best: 0.3926669 (600)	total: 3.28s	remaining: 6.59s
664:	learn: 0.4050148	test: 0.3927740	best: 0.3926669 (600)	total: 3.28s	remaining: 6.59s
665:	learn: 0.4050067	test: 0.3928766	best: 0.3926669 (600)	total: 3.29s	remaining: 6.58s
666:	learn: 0.4049981	test: 0.3927527	best: 0.3926669 (600)	total: 3.29s	remaining: 6.58s
667:	learn: 

763:	learn: 0.4043846	test: 0.3919151	best: 0.3914543 (757)	total: 3.79s	remaining: 6.12s
764:	learn: 0.4043769	test: 0.3920308	best: 0.3914543 (757)	total: 3.79s	remaining: 6.12s
765:	learn: 0.4043729	test: 0.3921210	best: 0.3914543 (757)	total: 3.79s	remaining: 6.11s
766:	learn: 0.4043668	test: 0.3919936	best: 0.3914543 (757)	total: 3.8s	remaining: 6.11s
767:	learn: 0.4043600	test: 0.3919240	best: 0.3914543 (757)	total: 3.8s	remaining: 6.1s
768:	learn: 0.4043553	test: 0.3918944	best: 0.3914543 (757)	total: 3.81s	remaining: 6.1s
769:	learn: 0.4043497	test: 0.3919406	best: 0.3914543 (757)	total: 3.81s	remaining: 6.09s
770:	learn: 0.4043431	test: 0.3919636	best: 0.3914543 (757)	total: 3.82s	remaining: 6.09s
771:	learn: 0.4043375	test: 0.3920709	best: 0.3914543 (757)	total: 3.82s	remaining: 6.08s
772:	learn: 0.4043332	test: 0.3921493	best: 0.3914543 (757)	total: 3.83s	remaining: 6.08s
773:	learn: 0.4043269	test: 0.3919340	best: 0.3914543 (757)	total: 3.83s	remaining: 6.07s
774:	learn: 0.

879:	learn: 0.4037616	test: 0.3913984	best: 0.3911250 (806)	total: 4.34s	remaining: 5.52s
880:	learn: 0.4037573	test: 0.3911631	best: 0.3911250 (806)	total: 4.34s	remaining: 5.51s
881:	learn: 0.4037519	test: 0.3912751	best: 0.3911250 (806)	total: 4.35s	remaining: 5.51s
882:	learn: 0.4037461	test: 0.3910312	best: 0.3910312 (882)	total: 4.35s	remaining: 5.5s
883:	learn: 0.4037436	test: 0.3909937	best: 0.3909937 (883)	total: 4.36s	remaining: 5.5s
884:	learn: 0.4037407	test: 0.3909863	best: 0.3909863 (884)	total: 4.36s	remaining: 5.49s
885:	learn: 0.4037348	test: 0.3912216	best: 0.3909863 (884)	total: 4.37s	remaining: 5.49s
886:	learn: 0.4037289	test: 0.3911531	best: 0.3909863 (884)	total: 4.37s	remaining: 5.49s
887:	learn: 0.4037229	test: 0.3912587	best: 0.3909863 (884)	total: 4.38s	remaining: 5.48s
888:	learn: 0.4037156	test: 0.3910185	best: 0.3909863 (884)	total: 4.38s	remaining: 5.47s
889:	learn: 0.4037097	test: 0.3911733	best: 0.3909863 (884)	total: 4.38s	remaining: 5.47s
890:	learn: 

990:	learn: 0.4032322	test: 0.3913687	best: 0.3909863 (884)	total: 4.88s	remaining: 4.97s
991:	learn: 0.4032274	test: 0.3914541	best: 0.3909863 (884)	total: 4.89s	remaining: 4.97s
992:	learn: 0.4032221	test: 0.3913098	best: 0.3909863 (884)	total: 4.89s	remaining: 4.96s
993:	learn: 0.4032171	test: 0.3915253	best: 0.3909863 (884)	total: 4.9s	remaining: 4.96s
994:	learn: 0.4032119	test: 0.3914253	best: 0.3909863 (884)	total: 4.9s	remaining: 4.95s
995:	learn: 0.4032071	test: 0.3913596	best: 0.3909863 (884)	total: 4.91s	remaining: 4.95s
996:	learn: 0.4032028	test: 0.3914998	best: 0.3909863 (884)	total: 4.91s	remaining: 4.94s
997:	learn: 0.4031993	test: 0.3914655	best: 0.3909863 (884)	total: 4.92s	remaining: 4.94s
998:	learn: 0.4031947	test: 0.3914710	best: 0.3909863 (884)	total: 4.92s	remaining: 4.93s
999:	learn: 0.4031897	test: 0.3913748	best: 0.3909863 (884)	total: 4.93s	remaining: 4.93s
1000:	learn: 0.4031854	test: 0.3914390	best: 0.3909863 (884)	total: 4.93s	remaining: 4.92s
1001:	learn

1105:	learn: 0.4027173	test: 0.3919801	best: 0.3909200 (1023)	total: 5.43s	remaining: 4.39s
1106:	learn: 0.4027119	test: 0.3920182	best: 0.3909200 (1023)	total: 5.43s	remaining: 4.38s
1107:	learn: 0.4027072	test: 0.3918263	best: 0.3909200 (1023)	total: 5.44s	remaining: 4.38s
1108:	learn: 0.4027024	test: 0.3918128	best: 0.3909200 (1023)	total: 5.44s	remaining: 4.37s
1109:	learn: 0.4026976	test: 0.3917901	best: 0.3909200 (1023)	total: 5.45s	remaining: 4.37s
1110:	learn: 0.4026931	test: 0.3918246	best: 0.3909200 (1023)	total: 5.45s	remaining: 4.36s
1111:	learn: 0.4026892	test: 0.3917123	best: 0.3909200 (1023)	total: 5.46s	remaining: 4.36s
1112:	learn: 0.4026849	test: 0.3919157	best: 0.3909200 (1023)	total: 5.46s	remaining: 4.35s
1113:	learn: 0.4026811	test: 0.3916960	best: 0.3909200 (1023)	total: 5.47s	remaining: 4.35s
1114:	learn: 0.4026780	test: 0.3915679	best: 0.3909200 (1023)	total: 5.47s	remaining: 4.34s
1115:	learn: 0.4026734	test: 0.3916295	best: 0.3909200 (1023)	total: 5.48s	remai

1214:	learn: 0.4022717	test: 0.3914053	best: 0.3909200 (1023)	total: 5.98s	remaining: 3.86s
1215:	learn: 0.4022685	test: 0.3915619	best: 0.3909200 (1023)	total: 5.98s	remaining: 3.86s
1216:	learn: 0.4022639	test: 0.3914340	best: 0.3909200 (1023)	total: 5.99s	remaining: 3.85s
1217:	learn: 0.4022597	test: 0.3912727	best: 0.3909200 (1023)	total: 5.99s	remaining: 3.85s
1218:	learn: 0.4022570	test: 0.3913234	best: 0.3909200 (1023)	total: 6s	remaining: 3.84s
1219:	learn: 0.4022545	test: 0.3912934	best: 0.3909200 (1023)	total: 6s	remaining: 3.84s
1220:	learn: 0.4022513	test: 0.3914527	best: 0.3909200 (1023)	total: 6.01s	remaining: 3.83s
1221:	learn: 0.4022470	test: 0.3915929	best: 0.3909200 (1023)	total: 6.01s	remaining: 3.83s
1222:	learn: 0.4022441	test: 0.3914783	best: 0.3909200 (1023)	total: 6.02s	remaining: 3.82s
1223:	learn: 0.4022398	test: 0.3913567	best: 0.3909200 (1023)	total: 6.02s	remaining: 3.82s
1224:	learn: 0.4022362	test: 0.3914369	best: 0.3909200 (1023)	total: 6.03s	remaining: 

1330:	learn: 0.4018598	test: 0.3902005	best: 0.3902005 (1330)	total: 6.53s	remaining: 3.28s
1331:	learn: 0.4018579	test: 0.3902033	best: 0.3902005 (1330)	total: 6.53s	remaining: 3.27s
1332:	learn: 0.4018543	test: 0.3901703	best: 0.3901703 (1332)	total: 6.54s	remaining: 3.27s
1333:	learn: 0.4018491	test: 0.3902358	best: 0.3901703 (1332)	total: 6.54s	remaining: 3.27s
1334:	learn: 0.4018455	test: 0.3902357	best: 0.3901703 (1332)	total: 6.55s	remaining: 3.26s
1335:	learn: 0.4018429	test: 0.3904006	best: 0.3901703 (1332)	total: 6.55s	remaining: 3.26s
1336:	learn: 0.4018405	test: 0.3902275	best: 0.3901703 (1332)	total: 6.56s	remaining: 3.25s
1337:	learn: 0.4018371	test: 0.3902072	best: 0.3901703 (1332)	total: 6.56s	remaining: 3.25s
1338:	learn: 0.4018351	test: 0.3901535	best: 0.3901535 (1338)	total: 6.57s	remaining: 3.24s
1339:	learn: 0.4018309	test: 0.3901570	best: 0.3901535 (1338)	total: 6.57s	remaining: 3.24s
1340:	learn: 0.4018259	test: 0.3902471	best: 0.3901535 (1338)	total: 6.58s	remai

1436:	learn: 0.4015132	test: 0.3897266	best: 0.3895755 (1388)	total: 7.07s	remaining: 2.77s
1437:	learn: 0.4015104	test: 0.3896586	best: 0.3895755 (1388)	total: 7.08s	remaining: 2.77s
1438:	learn: 0.4015064	test: 0.3895930	best: 0.3895755 (1388)	total: 7.08s	remaining: 2.76s
1439:	learn: 0.4015038	test: 0.3895467	best: 0.3895467 (1439)	total: 7.09s	remaining: 2.75s
1440:	learn: 0.4015012	test: 0.3895874	best: 0.3895467 (1439)	total: 7.09s	remaining: 2.75s
1441:	learn: 0.4014992	test: 0.3894467	best: 0.3894467 (1441)	total: 7.1s	remaining: 2.75s
1442:	learn: 0.4014954	test: 0.3893943	best: 0.3893943 (1442)	total: 7.1s	remaining: 2.74s
1443:	learn: 0.4014921	test: 0.3893244	best: 0.3893244 (1443)	total: 7.11s	remaining: 2.74s
1444:	learn: 0.4014878	test: 0.3894340	best: 0.3893244 (1443)	total: 7.11s	remaining: 2.73s
1445:	learn: 0.4014853	test: 0.3892782	best: 0.3892782 (1445)	total: 7.12s	remaining: 2.73s
1446:	learn: 0.4014814	test: 0.3896810	best: 0.3892782 (1445)	total: 7.12s	remaini

1546:	learn: 0.4011499	test: 0.3893812	best: 0.3890908 (1475)	total: 7.62s	remaining: 2.23s
1547:	learn: 0.4011454	test: 0.3895478	best: 0.3890908 (1475)	total: 7.62s	remaining: 2.23s
1548:	learn: 0.4011422	test: 0.3894787	best: 0.3890908 (1475)	total: 7.63s	remaining: 2.22s
1549:	learn: 0.4011387	test: 0.3892789	best: 0.3890908 (1475)	total: 7.63s	remaining: 2.22s
1550:	learn: 0.4011377	test: 0.3891307	best: 0.3890908 (1475)	total: 7.64s	remaining: 2.21s
1551:	learn: 0.4011348	test: 0.3890861	best: 0.3890861 (1551)	total: 7.64s	remaining: 2.21s
1552:	learn: 0.4011321	test: 0.3891114	best: 0.3890861 (1551)	total: 7.65s	remaining: 2.2s
1553:	learn: 0.4011289	test: 0.3889970	best: 0.3889970 (1553)	total: 7.65s	remaining: 2.2s
1554:	learn: 0.4011252	test: 0.3891974	best: 0.3889970 (1553)	total: 7.66s	remaining: 2.19s
1555:	learn: 0.4011235	test: 0.3892134	best: 0.3889970 (1553)	total: 7.66s	remaining: 2.19s
1556:	learn: 0.4011210	test: 0.3891351	best: 0.3889970 (1553)	total: 7.67s	remaini

1652:	learn: 0.4008313	test: 0.3883992	best: 0.3883992 (1652)	total: 8.17s	remaining: 1.71s
1653:	learn: 0.4008286	test: 0.3884833	best: 0.3883992 (1652)	total: 8.17s	remaining: 1.71s
1654:	learn: 0.4008263	test: 0.3884933	best: 0.3883992 (1652)	total: 8.18s	remaining: 1.7s
1655:	learn: 0.4008241	test: 0.3884859	best: 0.3883992 (1652)	total: 8.18s	remaining: 1.7s
1656:	learn: 0.4008204	test: 0.3883991	best: 0.3883991 (1656)	total: 8.19s	remaining: 1.69s
1657:	learn: 0.4008163	test: 0.3884926	best: 0.3883991 (1656)	total: 8.19s	remaining: 1.69s
1658:	learn: 0.4008139	test: 0.3885276	best: 0.3883991 (1656)	total: 8.2s	remaining: 1.68s
1659:	learn: 0.4008111	test: 0.3884138	best: 0.3883991 (1656)	total: 8.2s	remaining: 1.68s
1660:	learn: 0.4008078	test: 0.3883907	best: 0.3883907 (1660)	total: 8.21s	remaining: 1.67s
1661:	learn: 0.4008046	test: 0.3884018	best: 0.3883907 (1660)	total: 8.21s	remaining: 1.67s
1662:	learn: 0.4008009	test: 0.3883788	best: 0.3883788 (1662)	total: 8.21s	remaining

1764:	learn: 0.4005137	test: 0.3889335	best: 0.3883788 (1662)	total: 8.71s	remaining: 1.16s
1765:	learn: 0.4005109	test: 0.3888873	best: 0.3883788 (1662)	total: 8.71s	remaining: 1.15s
1766:	learn: 0.4005079	test: 0.3887796	best: 0.3883788 (1662)	total: 8.72s	remaining: 1.15s
1767:	learn: 0.4005039	test: 0.3888858	best: 0.3883788 (1662)	total: 8.73s	remaining: 1.15s
1768:	learn: 0.4005005	test: 0.3889248	best: 0.3883788 (1662)	total: 8.73s	remaining: 1.14s
1769:	learn: 0.4004972	test: 0.3888785	best: 0.3883788 (1662)	total: 8.74s	remaining: 1.14s
1770:	learn: 0.4004948	test: 0.3889618	best: 0.3883788 (1662)	total: 8.74s	remaining: 1.13s
1771:	learn: 0.4004912	test: 0.3888991	best: 0.3883788 (1662)	total: 8.75s	remaining: 1.13s
1772:	learn: 0.4004886	test: 0.3890507	best: 0.3883788 (1662)	total: 8.75s	remaining: 1.12s
1773:	learn: 0.4004861	test: 0.3890243	best: 0.3883788 (1662)	total: 8.76s	remaining: 1.11s
1774:	learn: 0.4004824	test: 0.3891856	best: 0.3883788 (1662)	total: 8.76s	remai

1873:	learn: 0.4002370	test: 0.3889072	best: 0.3883788 (1662)	total: 9.26s	remaining: 622ms
1874:	learn: 0.4002343	test: 0.3889930	best: 0.3883788 (1662)	total: 9.26s	remaining: 618ms
1875:	learn: 0.4002315	test: 0.3889914	best: 0.3883788 (1662)	total: 9.27s	remaining: 613ms
1876:	learn: 0.4002287	test: 0.3888379	best: 0.3883788 (1662)	total: 9.27s	remaining: 608ms
1877:	learn: 0.4002265	test: 0.3888480	best: 0.3883788 (1662)	total: 9.28s	remaining: 603ms
1878:	learn: 0.4002251	test: 0.3888605	best: 0.3883788 (1662)	total: 9.28s	remaining: 598ms
1879:	learn: 0.4002233	test: 0.3889373	best: 0.3883788 (1662)	total: 9.29s	remaining: 593ms
1880:	learn: 0.4002212	test: 0.3890032	best: 0.3883788 (1662)	total: 9.29s	remaining: 588ms
1881:	learn: 0.4002201	test: 0.3889905	best: 0.3883788 (1662)	total: 9.3s	remaining: 583ms
1882:	learn: 0.4002184	test: 0.3890001	best: 0.3883788 (1662)	total: 9.3s	remaining: 578ms
1883:	learn: 0.4002164	test: 0.3891387	best: 0.3883788 (1662)	total: 9.31s	remaini

1984:	learn: 0.3999470	test: 0.3885070	best: 0.3883788 (1662)	total: 9.8s	remaining: 74.1ms
1985:	learn: 0.3999446	test: 0.3884882	best: 0.3883788 (1662)	total: 9.81s	remaining: 69.1ms
1986:	learn: 0.3999410	test: 0.3885460	best: 0.3883788 (1662)	total: 9.81s	remaining: 64.2ms
1987:	learn: 0.3999380	test: 0.3884530	best: 0.3883788 (1662)	total: 9.82s	remaining: 59.3ms
1988:	learn: 0.3999344	test: 0.3884733	best: 0.3883788 (1662)	total: 9.82s	remaining: 54.3ms
1989:	learn: 0.3999318	test: 0.3883885	best: 0.3883788 (1662)	total: 9.83s	remaining: 49.4ms
1990:	learn: 0.3999291	test: 0.3884661	best: 0.3883788 (1662)	total: 9.83s	remaining: 44.5ms
1991:	learn: 0.3999269	test: 0.3884297	best: 0.3883788 (1662)	total: 9.84s	remaining: 39.5ms
1992:	learn: 0.3999253	test: 0.3882972	best: 0.3882972 (1992)	total: 9.84s	remaining: 34.6ms
1993:	learn: 0.3999231	test: 0.3883306	best: 0.3882972 (1992)	total: 9.85s	remaining: 29.6ms
1994:	learn: 0.3999211	test: 0.3884300	best: 0.3882972 (1992)	total: 9.

In [None]:
grid = {'learning_rate': [0.03, 0.1, 1],
        'depth': [1, 2, 4, 6, 10],
        'l2_leaf_reg': [1, 3, 5],
        'iterations': [50, 100, 150, 500, 1000, 2000]}

model.grid_search(grid,train_dataset, verbose=False)

In [None]:
model.get_params()

In [31]:
from xgboost import XGBClassifier

xgb_clf_params = {'colsample_bytree': 0.6,
                  'gamma': 0.5,
                  'learning_rate': 0.1,
                  'max_depth': 3,
                  'min_child_weight': 5,
                  'n_estimators': 1000,
                  'subsample': 1.0,
                'tree_method':'gpu_hist',
                  'gpu_id':0, 
                  'verbosity': 0, 
                  'objective':'binary:logistic', 
                  'silent':True}

clf = XGBClassifier(**xgb_clf_params)

fit(X_train_split, y_train_split, clf)



In [None]:
from lightgbm import LGBMClassifier

clf = LGBMClassifier()
fit(X_train_split, y_train_split, clf)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier

clf = AdaBoostClassifier(n_estimators=1000)
fit(X_train_split, y_train_split, clf)

In [None]:
X_train_pre.loc[~(X_train_pre['f18'] == X_train_pre['f26'])]

In [None]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

from catboost import CatBoostClassifier
models['Catboost'] = CatBoostClassifier(iterations=2000, depth=1, learning_rate=1, verbose=False)

from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

from lightgbm import LGBMClassifier
models['Light GBM'] = LGBMClassifier()

from sklearn.ensemble import GradientBoostingClassifier
models['GBM'] = GradientBoostingClassifier()

from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy, precision, recall, auc = {}, {}, {}, {}

for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train_split, y_train_split)
    
    # Prediction 
    predictions = models[key].predict_proba(X_test_split )
    
    # Calculate AUC
    auc[key] = roc_auc_score(y_test_split, predictions[:, 1])
    

df_model = pd.DataFrame(index=models.keys(), columns=['Auc'])
df_model['Auc'] = auc.values()

df_model

In [7]:
%%time

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

models = {}
# models['rf'] = RandomForestClassifier(random_state=42)
# models['Catboost'] = CatBoostClassifier(boosting_type='Plain', gpu_cat_features_storage = 'CpuPinnedMemory', max_ctr_complexity=1, iterations=1000, depth=1, learning_rate=1, verbose=False, random_state=42, task_type="GPU", devices='0:1')
models['lgbm'] = LGBMClassifier(random_state=42, device='gpu')
# models['gbm'] = GradientBoostingClassifier(min_samples_split=500,min_samples_leaf=50,max_depth=8, subsample=0.8, random_state=42)
# models['xgb'] = XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity = 0)
clf = StackingClassifier(estimators = list(models.items()), final_estimator=LogisticRegression(), cv=10)

params = {# 'rf__n_estimators': [5, 10, 100], 
          'lgbm__max_depth': [6,7], 
          'lgbm__num_leaves': [70, 80], 
          # 'gbm__n_estimators': range(20,81,10), 
          # 'gbm__learning_rate': [1, .1, .01]
 #            'xgb__max_depth': [5, 10, 20],
 #            'xgb__n_estimators': [10, 100, 1000],
 #            'xgb__learning_rate': [1, .1, .01]
        }

# grid = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, scoring='roc_auc', refit=True, verbose=100)
grid = RandomizedSearchCV(estimator=clf, param_distributions=params, n_iter=10, cv=5, n_jobs=-1, scoring='roc_auc', refit=True, verbose=10)
grid.fit(X_train_pre, y_train)



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Wall time: 3min 35s


RandomizedSearchCV(cv=5,
                   estimator=StackingClassifier(cv=10,
                                                estimators=[('lgbm',
                                                             LGBMClassifier(device='gpu',
                                                                            random_state=42))],
                                                final_estimator=LogisticRegression()),
                   n_jobs=-1,
                   param_distributions={'lgbm__max_depth': [6, 7],
                                        'lgbm__num_leaves': [70, 80]},
                   scoring='roc_auc', verbose=10)

In [7]:
%%time


from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


models = {}
models['rf'] = RandomForestClassifier(random_state=42)
models['Catboost'] = CatBoostClassifier(boosting_type='Plain', gpu_cat_features_storage = 'CpuPinnedMemory', max_ctr_complexity=1, iterations=1000, depth=1, learning_rate=1, verbose=False, random_state=42, task_type="GPU", devices='0:1')
models['lgbm'] = LGBMClassifier(random_state=42, device='gpu')
models['gbm'] = GradientBoostingClassifier(min_samples_split=500,min_samples_leaf=50,max_depth=8, subsample=0.8, random_state=42)
models['xgb'] = XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity = 0, objective='binary:logistic', silent=True)

# clf = StackingClassifier(estimators = list(models.items()), final_estimator=LogisticRegression(), cv=10)




Wall time: 161 ms


In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def grid(estimator, params, n_jobs=-1):
    grid_cv = GridSearchCV(estimator=estimator, param_grid=params, cv=5, n_jobs=n_jobs, scoring='roc_auc', refit=True, verbose=10)
    grid_cv.fit(X_train_pre, y_train)
    return grid_cv

In [11]:
%%time
lgbm_params = {
        'learning_rate': [1],
        'n_estimators': [24, 32, 52],
        'num_leaves': [16], # large num_leaves helps improve accuracy but might lead to over-fitting
        'boosting_type' : ['dart'], # for better accuracy -> try dart
        'objective' : ['binary'],
        'max_bin':[255], # large max_bin helps improve accuracy but might slow down training progress
        'colsample_bytree' : [0.64],
        'subsample' : [0.7],
}

lgbm_grid = grid(models['lgbm'], lgbm_params)
print(f'Best params lgbm: {lgbm_grid.best_params_}')
print(f'Best score lgbm: {lgbm_grid.best_score_}')

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best params lgbm: {'boosting_type': 'dart', 'colsample_bytree': 0.64, 'learning_rate': 1, 'max_bin': 255, 'n_estimators': 32, 'num_leaves': 16, 'objective': 'binary', 'subsample': 0.7}
Best score lgbm: 0.7322889897844772
Wall time: 3.15 s


In [19]:
%%time
cat_params = {'iterations': [500, 1000, 2000],
              'depth': [1, 4, 5, 6],
              'loss_function': ['Logloss', 'CrossEntropy'],
              'l2_leaf_reg': np.logspace(-20, -19, 3),
              'leaf_estimation_iterations': [10],
}

cat_grid = grid(models['Catboost'], cat_params, 2)
print(f'Best params lgbm: {cat_grid.best_params_}')
print(f'Best score lgbm: {cat_grid.best_score_}')

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params lgbm: {'depth': 1, 'iterations': 2000, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'loss_function': 'CrossEntropy'}
Best score lgbm: 0.7598584828689188
Wall time: 1h 19min 39s


In [None]:
%%time
xgb_params = {'min_child_weight': [1, 5, 10],
                'gamma': [0.5, 1, 1.5, 2, 5],
                'subsample': [0.6, 0.8, 1.0],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [3, 4, 5],
              'n_estimators': [10, 100, 500, 1000],
              'learning_rate': [1, .1, .01],
}

xgb_grid = grid(models['xgb'], xgb_params)
print(f'Best params xgb: {xgb_grid.best_params_}')
print(f'Best score xgb: {xgb_grid.best_score_}')

Fitting 5 folds for each of 4860 candidates, totalling 24300 fits


In [48]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

cat_clf_params = {
    'depth': 1, 
    'iterations': 2000, 
    'l2_leaf_reg': 1e-20, 
    'leaf_estimation_iterations': 10, 
    'loss_function': 'CrossEntropy',
    'boosting_type': 'Plain', 
    'gpu_cat_features_storage': 'CpuPinnedMemory',
    'max_ctr_complexity':1,
    'learning_rate':1, 
    'verbose':False, 
    'random_state':42, 
#     'task_type':"GPU", 
#     'devices':'0:1'
}

lgbm_clf_params = {
    'boosting_type': 'dart', 
    'colsample_bytree': 0.64, 
    'learning_rate': 1, 
    'max_bin': 255, 
    'n_estimators': 32, 
    'num_leaves': 16, 
    'objective': 'binary', 
    'subsample': 0.7,
    'random_state':42, 
    'device':'gpu'
}

xgb_clf_params = {
    'colsample_bytree': 0.6,
    'gamma': 0.5,
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_child_weight': 5,
    'n_estimators': 1000,
    'subsample': 1.0,
    'tree_method':'gpu_hist',
    'gpu_id':0, 
    'verbosity': 0, 
    'objective':'binary:logistic', 
    'silent':True
}

ada_clf_params = {
    'learning_rate': 1, 
    'n_estimators': 2000
}

rf_clf_params = {
    'criterion': 'entropy', 
    'max_depth': 8, 
    'max_features': 'log2', 
    'n_estimators': 500
}

stacking_models = {}

stacking_models['cat'] = CatBoostClassifier(**cat_clf_params)
stacking_models['lgbm'] = LGBMClassifier(**lgbm_clf_params)
stacking_models['xgb'] = XGBClassifier(**xgb_clf_params)
stacking_models['ada'] = AdaBoostClassifier(**ada_clf_params)
stacking_models['rf'] = RandomForestClassifier(**rf_clf_params)

clf = StackingClassifier(estimators = list(stacking_models.items()), final_estimator=LogisticRegression(), cv=5, n_jobs=-1, stack_method="predict_proba")
fit(X_train_pre, y_train, clf)

In [39]:
X_test_pre.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f9_1,f9_2
0,23,-1,44081267026,67744947326,7.0036,0,0.0,2,5,1,...,3,1,4,0,104,12951280000.0,14,1,18,49
1,2,1,65210554704,24545576187,6.952,0,0.0,2,4,1,...,1,0,4,0,20,12951280000.0,7,0,1,11
2,16,1,56770977603,29071567224,6.8457,0,0.0,2,1,0,...,2,0,5,0,28,8959571000.0,10,0,19,25
3,7,-1,67501891241,29509479615,6.9464,1,0.0,2,4,1,...,2,1,4,1,64,7502526000.0,0,1,0,6
4,2,-1,34496322090,34981763029,7.24,0,1.0,1,1,0,...,1,1,0,0,76,35832720000.0,3,1,17,49


In [40]:
X_test_split.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f9_1,f9_2
44119,23,0,43418571960,7477172309,7.3299,0,0.0,2,5,0,...,1,1,3,1,104,61755250000.0,2,1,3,8
49941,8,4,6950627633,27282898129,7.0915,0,0.0,2,4,0,...,1,1,4,0,5,37743910000.0,4,1,31,10
17540,1,-1,17903699520,4489187497,7.1645,0,0.0,2,5,0,...,2,0,5,0,48,4642168000.0,10,1,19,25
32253,20,0,56030960402,34887773206,6.9906,0,0.0,2,6,1,...,1,1,5,0,70,406334200.0,3,1,3,17
40134,2,3,22572516651,46871571312,6.8799,0,0.0,2,6,1,...,1,1,1,1,80,39347470000.0,1,1,20,18


In [41]:
X_train_pre.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f9_1,f9_2
0,8,0,5182688952,51071943155,6.8928,1,0.0,2,4,1,...,3,0,1,0,51,7833781000.0,12,1,15,18
1,13,-1,9436733442,15751265451,6.8098,0,1.0,1,6,10,...,3,1,5,0,10,60241470000.0,10,1,3,19
2,15,4,27081192841,38672968061,6.7761,0,0.0,2,5,0,...,3,0,2,1,16,24992580000.0,2,0,46,41
3,10,3,40278799087,59138528875,6.9302,0,0.0,2,5,1,...,2,0,3,0,63,47424520000.0,8,1,1,47
4,7,0,35290190025,64588882015,7.0238,0,1.0,1,5,0,...,2,1,0,1,26,1984312000.0,7,1,35,1


In [51]:
prediction = predict_proba(clf, X_test_split)

print_accuracy(prediction, y_test_split, 'stack')


real_pred = predict_proba(clf, X_test_pre)

f = open("./pred.csv", "w")
f.write("id,target\n")
id_nr = 50000
for v in real_pred[:, 1]:
    f.write(f"{id_nr},{v}\n")
    id_nr += 1
f.close()

stack Model accuracy score: 0.8149


In [None]:
from catboost import Pool, cv

cv_dataset = Pool(data=X_train_pre,
                  label=y_train,
                  cat_features=list(set(X_train_pre.columns).intersection(set(categorical_columns))))

params = {"iterations": 2000,
          "depth": 2,
          "learning_rate": 1,
          "loss_function": "Logloss",
          "verbose": False,
          "roc_file": "roc-file"}

scores = cv(cv_dataset,
            params,
            fold_count=2, 
            plot="True")

In [None]:
scores