# BLENDING

In [1]:
# Root folders used by the loader/writer helpers defined in the next cell.
# The helpers insert their own "/" between the root and the file name.
input_path = "../input/"   # datasets are read from below this folder
output_path = "./"         # generated CSV files are written here

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn import model_selection

def load_raw_data(folder_name,train_or_test="train"):
    """Read one CSV from a ``tbr_apr_2022_<folder_name>`` folder under ``input_path``.

    Parameters
    ----------
    folder_name : str
        Suffix of the folder name (the ``tbr_apr_2022_`` prefix is added here).
    train_or_test : str, default "train"
        Base name of the CSV file inside that folder (``<train_or_test>.csv``).

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as returned by ``pd.read_csv``.
    """
    csv_path = f'{input_path}/tbr_apr_2022_{folder_name}/{train_or_test}.csv'
    return pd.read_csv(csv_path)

def load_label(train_or_test='train'):
    """Return the ``state`` column of a competition CSV as a NumPy array.

    ``train_labels.csv`` is read when ``train_or_test == 'train'``; any other
    value reads ``sample_submission.csv``. Both files are looked up in the
    ``tabular-playground-series-apr-2022`` folder under ``input_path``.
    """
    if train_or_test == 'train':
        csv_name = 'train_labels.csv'
    else:
        csv_name = 'sample_submission.csv'
    frame = pd.read_csv(input_path + "/tabular-playground-series-apr-2022/" + csv_name)
    return frame['state'].values

def competition_metric(y_true, y_score):
    """Score predictions with ROC AUC (thin wrapper around ``roc_auc_score``).

    ``y_true`` holds the reference labels and ``y_score`` the predicted
    scores; the value returned by ``roc_auc_score`` is passed through unchanged.
    """
    auc = roc_auc_score(y_true, y_score)
    return auc

def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)`` with ``competition_metric``.

    The score passed to the metric is column 1 of ``model.predict_proba(X)``.
    """
    second_column_scores = model.predict_proba(X)[:, 1]
    return competition_metric(y, second_column_scores)


def submit(arr):
    """Write ``arr`` as the ``state`` column of a submission file.

    The competition's ``sample_submission.csv`` (under ``input_path``) is used
    as the template, its ``state`` column is replaced by ``arr``, and the
    result is saved to ``<output_path>/submission.csv`` without the index.
    """
    template_path = f'{input_path}/tabular-playground-series-apr-2022/sample_submission.csv'
    submission = pd.read_csv(template_path)
    submission['state'] = arr
    target_path = f'{output_path}/submission.csv'
    submission.to_csv(target_path, index=False)


def to_csv(arr,train_or_test='train',name=None):
    """Save ``arr`` as a CSV file under ``output_path``.

    Parameters
    ----------
    arr : array-like
        Data handed to ``pd.DataFrame``; written without the index.
    train_or_test : str, default 'train'
        Base name of the file.
    name : str or None, default None
        Optional prefix. When it is a string the file is
        ``<name>_<train_or_test>.csv``; otherwise ``<train_or_test>.csv``.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    prefix = f'{name}_' if isinstance(name, str) else ''
    pd.DataFrame(arr).to_csv(f'{output_path}/{prefix}{train_or_test}.csv', index=False)



In [3]:
def short_test(x,y,n):
    """Cut ``x`` and ``y`` down to a small subset for quick experiments.

    Returns a tuple ``(x_subset, y_subset)``: the rows of ``x`` whose
    ``sequence`` value is below ``n``, and the first ``n`` entries of ``y``.
    """
    keep_rows = x.sequence < n
    return x.loc[keep_rows], y[:n]

In [4]:
# Load the NN model's train-set predictions as a plain NumPy matrix.
nn_train_all = pd.read_csv("../input/tbr-apr-22-nn-pure/train.csv").to_numpy()
print(nn_train_all.shape)

# Keep only the first half of the rows.
# NOTE(review): presumably the file holds predictions for a doubled training
# set and only the first copy lines up with the labels - confirm.
half_rows = nn_train_all.shape[0] // 2
df_NN_pure = nn_train_all[:half_rows]
print(df_NN_pure.shape)


(51936, 12)
(25968, 12)


In [5]:
# Folder holding the DART prediction files ``train_v0..4.csv`` / ``test_v0..4.csv``.
DART_DIR = "../input/tbr-apr-2022-dart-non-nn/tbr_apr_2022_dart_non_nn"
# Number of ``_v<i>`` files per split; also the divisor used for averaging.
N_DART_VERSIONS = 5


def _load_scaled_dart_versions(split):
    """Read ``<split>_v0.csv`` .. ``<split>_v4.csv``, each divided by the file count."""
    return [
        pd.read_csv(f"{DART_DIR}/{split}_v{version}.csv").to_numpy() / N_DART_VERSIONS
        for version in range(N_DART_VERSIONS)
    ]


def _sum_arrays(arrays):
    """Add equally-shaped arrays element-wise, in list order, into a new float array."""
    total = np.zeros(arrays[0].shape)
    for part in arrays:
        total += part
    return total


# Train side: element-wise mean of the five DART prediction files
# (each file is pre-divided by 5, then the pieces are summed).
L_df_F_dart_v = _load_scaled_dart_versions("train")
df_F_dart_v = _sum_arrays(L_df_F_dart_v)

# NN predictions for the test set (used in full; no row trimming here).
df_test_NN_pure = pd.read_csv("../input/tbr-apr-22-nn-pure/test.csv").to_numpy()

# Test side: same averaging as for the train files.
L_df_test_F_dart_v = _load_scaled_dart_versions("test")
df_test_F_dart_v = _sum_arrays(L_df_test_F_dart_v)

In [6]:

# Training targets (`state` column of train_labels.csv).
y = load_label("train")
# `state` column of sample_submission.csv; the cells shown here only use its length.
y_test = load_label('test')
# Training targets repeated twice back-to-back (not referenced by the other cells shown).
yy = np.concatenate((y, y))

In [7]:
# Train-side feature matrix for the blender: the NN prediction columns first,
# followed by the averaged DART prediction columns.
valids = np.concatenate((df_NN_pure, df_F_dart_v), axis=1)

In [8]:
# Drop the last stacked column. The matrix is rebuilt from its source arrays
# instead of slicing `valids` in place, so re-running this cell can never strip
# one more column each time it is executed.
# NOTE(review): why the last column is excluded is not recorded - confirm.
valids = np.concatenate([df_NN_pure, df_F_dart_v], axis=1)[:, :-1]

In [9]:
# Test-side feature matrix, built with the same column order as `valids`:
# NN prediction columns first, then the averaged DART prediction columns.
tests = np.concatenate((df_test_NN_pure, df_test_F_dart_v), axis=1)

In [10]:
# Drop the last stacked column so the test matrix matches the train matrix.
# Rebuilt from the source arrays instead of slicing `tests` in place, so
# re-running this cell can never strip one more column each time.
tests = np.concatenate([df_test_NN_pure, df_test_F_dart_v], axis=1)[:, :-1]

In [11]:
from lightgbm import LGBMClassifier


In [12]:
import sklearn.metrics as metrics

In [13]:
# ROC AUC of every stacked prediction column on its own, in column order.
for column_scores in valids.T:
    print(metrics.roc_auc_score(y, column_scores))


0.9861972317124829
0.9838347564987376
0.9835368089362274
0.9799339473346349
0.9836090106071917
0.9845959289124604
0.9783735352854169
0.9784392831757365
0.9814823890443846
0.9756179548353516
0.9777146268223537
0.9793530922471373
0.9933647245698443
0.9909257174830474
0.9893312421660414
0.9846474227265947


In [14]:
# Number of test rows; `len(y_test)` sizes the test-prediction buffer in the CV cell below.
y_test.shape

(12218,)

In [15]:
# Blender: 5-fold cross-validated LightGBM classifier trained on the stacked
# base-model predictions (`valids`) and applied to the test matrix (`tests`).
n_splits = 5
K_fold = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=97)
test_preds_array = np.zeros(len(y_test))   # test predictions, averaged over the folds
valid_preds_array = np.zeros(len(y))       # out-of-fold predictions for the training rows
scores_valid = []
scores_train = []

for fold, (train_idx, valid_idx) in enumerate(K_fold.split(valids)):
    X_train, y_train = valids[train_idx, :], y[train_idx]
    X_valid, y_valid = valids[valid_idx, :], y[valid_idx]

    # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0.
    # None is set here, so row subsampling is presumably inactive - confirm, and
    # add subsample_freq=1 if bagging is intended.
    # NOTE(review): every fold trains all `n_estimators` rounds (see below); the
    # recorded log shows validation AUC flattening within the first few thousand
    # rounds, so a smaller value may be both faster and less overfit.
    clf = LGBMClassifier(num_leaves=25,
                        objective="binary",
                        metric='auc',
                        subsample=0.7,
                        learning_rate=0.03,
                        n_estimators=10000,
                        n_jobs=-1,
                        boosting_type="dart")
    # `early_stopping_rounds` was dropped: LightGBM does not support early
    # stopping with boosting_type="dart" (it warns and disables the callback).
    # The recorded log confirms it - training ran past round 3300 long after the
    # validation AUC had stopped improving. Removing the argument leaves the
    # fitted model unchanged and makes the actual behaviour explicit.
    clf.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=100)

    # The last predict_proba column is used as the score for ROC AUC.
    valid_preds = clf.predict_proba(X_valid)[:, -1]
    train_preds = clf.predict_proba(X_train)[:, -1]
    test_preds = clf.predict_proba(tests)[:, -1]
    test_preds_array += test_preds / n_splits
    valid_preds_array[valid_idx] = valid_preds

    score_valid = metrics.roc_auc_score(y_valid, valid_preds)
    score_train = metrics.roc_auc_score(y_train, train_preds)

    scores_valid.append(score_valid)
    scores_train.append(score_train)
    print(f"fold {fold}: train AUC = {score_train:.6f}, valid AUC = {score_valid:.6f}")

# Report the scores that were being collected but never shown.
print(f"mean train AUC:  {np.mean(scores_train):.6f}")
print(f"mean valid AUC:  {np.mean(scores_valid):.6f}")
print(f"out-of-fold AUC: {metrics.roc_auc_score(y, valid_preds_array):.6f}")



[100]	valid_0's auc: 0.995464
[200]	valid_0's auc: 0.996727
[300]	valid_0's auc: 0.9973
[400]	valid_0's auc: 0.997616
[500]	valid_0's auc: 0.99778
[600]	valid_0's auc: 0.997949
[700]	valid_0's auc: 0.998088
[800]	valid_0's auc: 0.998184
[900]	valid_0's auc: 0.998217
[1000]	valid_0's auc: 0.998262
[1100]	valid_0's auc: 0.998293
[1200]	valid_0's auc: 0.998289
[1300]	valid_0's auc: 0.998308
[1400]	valid_0's auc: 0.998305
[1500]	valid_0's auc: 0.998302
[1600]	valid_0's auc: 0.998301
[1700]	valid_0's auc: 0.998264
[1800]	valid_0's auc: 0.998244
[1900]	valid_0's auc: 0.998238
[2000]	valid_0's auc: 0.998228
[2100]	valid_0's auc: 0.998202
[2200]	valid_0's auc: 0.998171
[2300]	valid_0's auc: 0.998171
[2400]	valid_0's auc: 0.998142
[2500]	valid_0's auc: 0.998142
[2600]	valid_0's auc: 0.99811
[2700]	valid_0's auc: 0.998082
[2800]	valid_0's auc: 0.998064
[2900]	valid_0's auc: 0.998058
[3000]	valid_0's auc: 0.998053
[3100]	valid_0's auc: 0.998056
[3200]	valid_0's auc: 0.99804
[3300]	valid_0's auc: 



[100]	valid_0's auc: 0.996191
[200]	valid_0's auc: 0.996419
[300]	valid_0's auc: 0.99754
[400]	valid_0's auc: 0.997795
[500]	valid_0's auc: 0.998144
[600]	valid_0's auc: 0.998209
[700]	valid_0's auc: 0.99822
[800]	valid_0's auc: 0.998281
[900]	valid_0's auc: 0.998285
[1000]	valid_0's auc: 0.998263
[1100]	valid_0's auc: 0.998265
[1200]	valid_0's auc: 0.998279
[1300]	valid_0's auc: 0.99829
[1400]	valid_0's auc: 0.99829
[1500]	valid_0's auc: 0.998301
[1600]	valid_0's auc: 0.998305
[1700]	valid_0's auc: 0.998305
[1800]	valid_0's auc: 0.998306
[1900]	valid_0's auc: 0.998303
[2000]	valid_0's auc: 0.998305
[2100]	valid_0's auc: 0.998325
[2200]	valid_0's auc: 0.998349
[2300]	valid_0's auc: 0.998347
[2400]	valid_0's auc: 0.998339
[2500]	valid_0's auc: 0.998327
[2600]	valid_0's auc: 0.99833
[2700]	valid_0's auc: 0.998345
[2800]	valid_0's auc: 0.998353
[2900]	valid_0's auc: 0.998345
[3000]	valid_0's auc: 0.998353
[3100]	valid_0's auc: 0.998349
[3200]	valid_0's auc: 0.998341
[3300]	valid_0's auc: 



[100]	valid_0's auc: 0.996163
[200]	valid_0's auc: 0.997179
[300]	valid_0's auc: 0.997503
[400]	valid_0's auc: 0.998477
[500]	valid_0's auc: 0.998466
[600]	valid_0's auc: 0.998521
[700]	valid_0's auc: 0.998564
[800]	valid_0's auc: 0.998594
[900]	valid_0's auc: 0.998612
[1000]	valid_0's auc: 0.998618
[1100]	valid_0's auc: 0.998615
[1200]	valid_0's auc: 0.998616
[1300]	valid_0's auc: 0.998605
[1400]	valid_0's auc: 0.998578
[1500]	valid_0's auc: 0.998562
[1600]	valid_0's auc: 0.99856
[1700]	valid_0's auc: 0.99855
[1800]	valid_0's auc: 0.998531
[1900]	valid_0's auc: 0.998525
[2000]	valid_0's auc: 0.998517
[2100]	valid_0's auc: 0.998508
[2200]	valid_0's auc: 0.99851
[2300]	valid_0's auc: 0.998489
[2400]	valid_0's auc: 0.998472
[2500]	valid_0's auc: 0.998472
[2600]	valid_0's auc: 0.998482
[2700]	valid_0's auc: 0.99848
[2800]	valid_0's auc: 0.998476
[2900]	valid_0's auc: 0.998459
[3000]	valid_0's auc: 0.998442
[3100]	valid_0's auc: 0.998426
[3200]	valid_0's auc: 0.998416
[3300]	valid_0's auc:



[100]	valid_0's auc: 0.997479
[200]	valid_0's auc: 0.997778
[300]	valid_0's auc: 0.998667
[400]	valid_0's auc: 0.998875
[500]	valid_0's auc: 0.998991
[600]	valid_0's auc: 0.999017
[700]	valid_0's auc: 0.999033
[800]	valid_0's auc: 0.999025
[900]	valid_0's auc: 0.999026
[1000]	valid_0's auc: 0.999032
[1100]	valid_0's auc: 0.999034
[1200]	valid_0's auc: 0.999046
[1300]	valid_0's auc: 0.999044
[1400]	valid_0's auc: 0.999047
[1500]	valid_0's auc: 0.999041
[1600]	valid_0's auc: 0.999034
[1700]	valid_0's auc: 0.999034
[1800]	valid_0's auc: 0.99902
[1900]	valid_0's auc: 0.999004
[2000]	valid_0's auc: 0.999003
[2100]	valid_0's auc: 0.998995
[2200]	valid_0's auc: 0.998989
[2300]	valid_0's auc: 0.998981
[2400]	valid_0's auc: 0.998982
[2500]	valid_0's auc: 0.998979
[2600]	valid_0's auc: 0.99898
[2700]	valid_0's auc: 0.998988
[2800]	valid_0's auc: 0.99898
[2900]	valid_0's auc: 0.998978
[3000]	valid_0's auc: 0.99898
[3100]	valid_0's auc: 0.998975
[3200]	valid_0's auc: 0.998973
[3300]	valid_0's auc:



[100]	valid_0's auc: 0.997865
[200]	valid_0's auc: 0.998009
[300]	valid_0's auc: 0.99831
[400]	valid_0's auc: 0.9984
[500]	valid_0's auc: 0.998585
[600]	valid_0's auc: 0.99876
[700]	valid_0's auc: 0.998804
[800]	valid_0's auc: 0.998831
[900]	valid_0's auc: 0.99885
[1000]	valid_0's auc: 0.998849
[1100]	valid_0's auc: 0.998841
[1200]	valid_0's auc: 0.998839
[1300]	valid_0's auc: 0.99884
[1400]	valid_0's auc: 0.998843
[1500]	valid_0's auc: 0.99885
[1600]	valid_0's auc: 0.998854
[1700]	valid_0's auc: 0.998853
[1800]	valid_0's auc: 0.99884
[1900]	valid_0's auc: 0.998839
[2000]	valid_0's auc: 0.998843
[2100]	valid_0's auc: 0.998853
[2200]	valid_0's auc: 0.998858
[2300]	valid_0's auc: 0.998852
[2400]	valid_0's auc: 0.998857
[2500]	valid_0's auc: 0.998866
[2600]	valid_0's auc: 0.998866
[2700]	valid_0's auc: 0.998853
[2800]	valid_0's auc: 0.998858
[2900]	valid_0's auc: 0.998853
[3000]	valid_0's auc: 0.998846
[3100]	valid_0's auc: 0.998845
[3200]	valid_0's auc: 0.998848
[3300]	valid_0's auc: 0.9

In [16]:
# Write the fold-averaged test predictions to <output_path>/submission.csv.
submit(test_preds_array)
