#BLENDING

In [1]:
input_path = '../input/'
output_path = './'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn import model_selection

def load_raw_data(folder_name,train_or_test="train"):
    file_name = f'{input_path}/tbr_apr_2022_{folder_name}/{train_or_test}.csv'
    df = pd.read_csv(file_name)
    return df

def load_label(train_or_test='train'):
    file_name = input_path +"/tabular-playground-series-apr-2022/"+ ('train_labels.csv' if train_or_test=='train' else 'sample_submission.csv')
    df = pd.read_csv(file_name)
    return df['state'].values

def competition_metric(y_true, y_score):
    return roc_auc_score(y_true, y_score)

def evaluate(model, X, y):
    return competition_metric(y, model.predict_proba(X)[:, 1])


def submit(arr):
    df = pd.read_csv(f'{input_path}/tabular-playground-series-apr-2022/sample_submission.csv')
    df['state'] = arr
    df.to_csv(f'{output_path}/submission.csv', index=False)


def to_csv(arr,train_or_test='train',name=None):
    df = pd.DataFrame(arr)
    if type(name)==str:
        df.to_csv(f'{output_path}/{name}_{train_or_test}.csv', index = False )
    else:
        df.to_csv(f'{output_path}/{train_or_test}.csv', index = False )



In [3]:
def short_test(x,y,n):
    return x.loc[x.sequence<n] ,y[:n]

In [4]:
df_NN_pure = pd.read_csv("../input/tbr-apr-22-nn-pure/train.csv").to_numpy()
            


In [5]:
L_df_F_dart_v = []
for i in range(5):
    L_df_F_dart_v.append(pd.read_csv(f"../input/tbr-apr-2022-dart-non-nn/tbr_apr_2022_dart_non_nn/train_v{i}.csv").to_numpy()/5)
df_F_dart_v = np.zeros(L_df_F_dart_v[0].shape)
for j in L_df_F_dart_v:
    df_F_dart_v+=j

df_test_NN_pure = pd.read_csv("../input/tbr-apr-22-nn-pure/test.csv").to_numpy()

L_df_test_F_dart_v = []
for i in range(5):
    L_df_test_F_dart_v.append(pd.read_csv(f"../input/tbr-apr-2022-dart-non-nn/tbr_apr_2022_dart_non_nn/test_v{i}.csv").to_numpy()/5)
df_test_F_dart_v = np.zeros(L_df_test_F_dart_v[0].shape)
for j in L_df_test_F_dart_v:
    df_test_F_dart_v+=j

In [6]:

y = load_label("train")
y_test = load_label('test')
yy = np.concatenate([y,y], axis=0)

In [7]:
valids = np.concatenate([
                          df_NN_pure,
                          df_F_dart_v
                            ], axis=1)

In [8]:
valids =valids[:,:-1]

In [9]:
tests= np.concatenate([
                          df_test_NN_pure,
                          df_test_F_dart_v
                            ], axis=1)

In [10]:
tests=tests[:,:-1]

In [11]:
from lightgbm import LGBMClassifier


In [12]:
import sklearn.metrics as metrics

In [13]:
for i in range(len(valids[0])):
    print(metrics.roc_auc_score(y,valids[:,i]))


0.9777898181122935
0.980460950723985
0.9613977609984228
0.9790575589205852
0.9791202875330113
0.9786032020859757
0.977540991650445
0.9669728457653909
0.9763716947241221
0.964018921298494
0.9660085601807571
0.9806624445179511
0.9933647245698443
0.9909257174830474
0.9893312421660414
0.9846474227265947


In [14]:
y_test.shape

(12218,)

In [15]:
n_splits = 5
K_fold = model_selection.KFold(n_splits=n_splits,shuffle =True, random_state=97)
test_preds_array = np.zeros(len(y_test))
valid_preds_array = np.zeros(len(y))
scores_valid = []
scores_train = []

for fold, (train_idx, valid_idx) in enumerate(K_fold.split(valids)):
    X_train , y_train = valids[train_idx, :], y[train_idx]
    X_valid, y_valid = valids[valid_idx, :], y[valid_idx]


    clf = LGBMClassifier(num_leaves=25,
                        objective="binary",
                        metric='auc',
                        subsample=0.7,
                        learning_rate=0.03,
                        n_estimators=10000,
                        n_jobs=-1)
    clf.fit(X_train,y_train,
            eval_set=[(X_valid,y_valid)],
            verbose = 100,
            early_stopping_rounds=100)

    valid_preds = clf.predict_proba(X_valid)[:, -1]
    train_preds = clf.predict_proba(X_train)[:, -1]
    test_preds = clf.predict_proba(tests)[:, -1]
    test_preds_array += test_preds / n_splits
    valid_preds_array[valid_idx] = valid_preds
    
    score_valid = metrics.roc_auc_score(y_valid, valid_preds)
    score_train = metrics.roc_auc_score(y_train, train_preds)
    
    scores_valid.append(score_valid)
    scores_train.append(score_train)



[100]	valid_0's auc: 0.99752
[200]	valid_0's auc: 0.998095
[300]	valid_0's auc: 0.998148
[400]	valid_0's auc: 0.998187
[500]	valid_0's auc: 0.998191
[600]	valid_0's auc: 0.998202




[100]	valid_0's auc: 0.997123
[200]	valid_0's auc: 0.998207
[300]	valid_0's auc: 0.998329
[400]	valid_0's auc: 0.998376




[100]	valid_0's auc: 0.997439
[200]	valid_0's auc: 0.998654




[100]	valid_0's auc: 0.99854
[200]	valid_0's auc: 0.99901
[300]	valid_0's auc: 0.999012




[100]	valid_0's auc: 0.998267
[200]	valid_0's auc: 0.998763
[300]	valid_0's auc: 0.998905
[400]	valid_0's auc: 0.998976
[500]	valid_0's auc: 0.999012
[600]	valid_0's auc: 0.99903
[700]	valid_0's auc: 0.999024


In [16]:
submit(test_preds_array)
