# BLENDING

In [1]:
# Root directory that every input CSV path in this notebook is built from.
input_path = '../input/'
# Directory that submission.csv and any exported CSVs are written to.
output_path = './'

In [2]:
import numpy as np
import pandas as pd
from lightgbm import early_stopping, log_evaluation
from sklearn import model_selection
from sklearn.metrics import roc_auc_score

def load_raw_data(folder_name, train_or_test="train"):
    """Read ``<input_path>/<folder_name>/<train_or_test>.csv`` into a DataFrame.

    Parameters
    ----------
    folder_name : str
        Sub-directory of ``input_path`` that contains the CSV file.
    train_or_test : str, default "train"
        Base name of the CSV file, without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = f'{input_path}/{folder_name}/{train_or_test}.csv'
    return pd.read_csv(csv_path)

def load_label(train_or_test='train'):
    """Return the ``state`` column of a competition CSV as an array.

    Parameters
    ----------
    train_or_test : str, default 'train'
        ``'train'`` selects ``train_labels.csv``; any other value selects
        ``sample_submission.csv``.

    Returns
    -------
    numpy.ndarray
        The values of the ``state`` column of the selected file.
    """
    if train_or_test == 'train':
        csv_name = 'train_labels.csv'
    else:
        csv_name = 'sample_submission.csv'
    labels = pd.read_csv(input_path + "/tabular-playground-series-apr-2022/" + csv_name)
    return labels['state'].values

def competition_metric(y_true, y_score):
    """Score predictions with ROC AUC.

    Thin wrapper that forwards both arguments, unchanged and in the same
    order, to ``roc_auc_score`` and returns its result.
    """
    auc = roc_auc_score(y_true, y_score)
    return auc

def evaluate(model, X, y):
    """Score a fitted classifier on ``(X, y)`` with ``competition_metric``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict_proba(X)`` that returns a 2-D array.
    X : array-like
        Inputs passed to ``model.predict_proba``.
    y : array-like
        Reference labels passed to ``competition_metric`` as ``y_true``.

    Returns
    -------
    The value returned by ``competition_metric``.
    """
    # Column 1 of predict_proba is used as the score
    # (presumably the positive-class probability -- confirm for the model used).
    scores = model.predict_proba(X)[:, 1]
    return competition_metric(y, scores)


def submit(arr):
    """Write ``arr`` as the ``state`` column of the submission file.

    The competition's ``sample_submission.csv`` is loaded as a template, its
    ``state`` column is overwritten with ``arr``, and the result is saved to
    ``<output_path>/submission.csv`` without the index column.

    Parameters
    ----------
    arr : array-like
        Values assigned to the ``state`` column of the template.
    """
    template_file = f'{input_path}/tabular-playground-series-apr-2022/sample_submission.csv'
    submission = pd.read_csv(template_file)
    submission['state'] = arr
    submission.to_csv(f'{output_path}/submission.csv', index=False)


def to_csv(arr,train_or_test='train',name=None):
    """Save ``arr`` as a CSV file under ``output_path``.

    Parameters
    ----------
    arr : array-like
        Data wrapped in a ``pandas.DataFrame`` before writing.
    train_or_test : str, default 'train'
        Suffix (or, without ``name``, the whole base name) of the file.
    name : str or None, default None
        Optional prefix. When it is a string the file is called
        ``<name>_<train_or_test>.csv``; otherwise ``<train_or_test>.csv``.

    The file is written without the index column.
    """
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which previously lost their prefix silently.
    if isinstance(name, str):
        file_name = f'{name}_{train_or_test}.csv'
    else:
        file_name = f'{train_or_test}.csv'
    pd.DataFrame(arr).to_csv(f'{output_path}/{file_name}', index=False)



In [3]:
def short_test(x, y, n):
    """Cut a small subset out of ``(x, y)`` for quick experiments.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``sequence`` column.
    y : sequence
        Anything sliceable with ``y[:n]``.
    n : int
        Rows of ``x`` with ``sequence < n`` are kept, together with the first
        ``n`` entries of ``y``.

    Returns
    -------
    tuple
        ``(filtered_x, y[:n])``.
    """
    row_mask = x.sequence < n
    subset = x.loc[row_mask]
    return subset, y[:n]

In [4]:
# Training-side tables from three input folders, converted to NumPy arrays
# (presumably each base model's predictions for the training rows -- confirm).
df_NN_pure = load_raw_data("tbr-apr-22-nn-pure").to_numpy()
df_F_pure_wc_val_0 = load_raw_data("feature-wc-val-0").to_numpy()
df_F_double_woc = load_raw_data("tbr-apr-2022-feature-double-woc").to_numpy()
# Loaded through load_raw_data so the file is resolved under input_path like
# the others (previously a hard-coded '../input/...' path); reshaped to one column.
df_NN_pure_soft = load_raw_data("tbr-apr-22-nn-pure", "soft_train").to_numpy().reshape(-1,1)


In [5]:
# Test-side counterparts of the training tables: test.csv from the same three
# input folders, converted to NumPy arrays.
df_test_NN_pure = load_raw_data("tbr-apr-22-nn-pure","test").to_numpy()
df_test_F_pure_wc_val_0 = load_raw_data("feature-wc-val-0","test").to_numpy()
df_test_F_double_woc = load_raw_data("tbr-apr-2022-feature-double-woc","test").to_numpy()
# Loaded through load_raw_data so the file is resolved under input_path like
# the others (previously a hard-coded '../input/...' path); reshaped to one column.
df_test_NN_pure_soft = load_raw_data("tbr-apr-22-nn-pure", "soft_test").to_numpy().reshape(-1,1)


In [6]:

# Training targets: the 'state' column of train_labels.csv.
y = load_label(train_or_test="train")
# Not real labels: for any value other than 'train', load_label reads the
# 'state' column of sample_submission.csv.
y_test = load_label(train_or_test='test')
# The training targets repeated twice, end to end.
yy = np.concatenate((y, y))

In [7]:
# Column-wise stack of the training-side tables; this is the feature matrix
# the blending model is fitted on.
valids = np.concatenate(
    [
        df_NN_pure,
        df_F_pure_wc_val_0,
        # Keep only the first len(y) rows so this table lines up with the
        # labels (replaces the hard-coded row count 25968).
        df_F_double_woc[:len(y)],
        df_NN_pure_soft,
    ],
    axis=1,
)

In [8]:
# Column-wise stack of the test-side tables, in the same column order as the
# training-side matrix, so the fitted blender can be applied to it.
tests = np.concatenate(
    [
        df_test_NN_pure,
        df_test_F_pure_wc_val_0,
        # Bound by the number of test rows; the previous bound (25968) was the
        # training row count copied over from the training-side cell.
        df_test_F_double_woc[:len(y_test)],
        df_test_NN_pure_soft,
    ],
    axis=1,
)

In [9]:
from lightgbm import LGBMClassifier


In [10]:
import sklearn.metrics as metrics

In [11]:
# Print the ROC AUC of every column of df_F_double_woc against the training
# labels, using only the first len(y) rows (replaces the hard-coded 25968).
for i in range(df_F_double_woc.shape[1]):
    print(metrics.roc_auc_score(y, df_F_double_woc[:len(y), i]))


0.9996699614877759
0.9996699614877759
0.9997152862468819
0.9833434505835795
0.9996705250072254


In [12]:
# Shape of the test-side array returned by load_label('test'), i.e. the 'state'
# column of sample_submission.csv; its length is the number of test rows.
y_test.shape

(12218,)

In [13]:
# Blend the stacked inputs with LightGBM under shuffled 5-fold cross-validation.
# Each fold fits a fresh classifier, stores its predictions for the held-out
# rows, and adds 1/n_splits of its test predictions to the running average.
n_splits = 5
K_fold = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=97)
test_preds_array = np.zeros(len(y_test))   # fold-averaged test predictions
valid_preds_array = np.zeros(len(y))       # held-out predictions for every training row
scores_valid = []                          # per-fold AUC on the held-out part
scores_train = []                          # per-fold AUC on the fitted part

for fold, (train_idx, valid_idx) in enumerate(K_fold.split(valids)):
    X_train, y_train = valids[train_idx, :], y[train_idx]
    X_valid, y_valid = valids[valid_idx, :], y[valid_idx]

    # NOTE(review): per the LightGBM parameter docs, subsample (bagging_fraction)
    # only takes effect when subsample_freq > 0; it is left at its default here,
    # so subsample=0.7 presumably has no effect as written -- confirm.
    clf = LGBMClassifier(num_leaves=25,
                         objective="binary",
                         metric='auc',
                         subsample=0.7,
                         learning_rate=0.03,
                         n_estimators=10000,
                         n_jobs=-1)
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() are
    # deprecated in LightGBM 3.3 and removed in 4.0.
    # NOTE(review): the fold monitored for early stopping is the same fold that
    # is scored below, so the validation AUCs are optimistic.
    clf.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            callbacks=[early_stopping(stopping_rounds=100),
                       log_evaluation(period=100)])

    # The last predict_proba column is used as the score throughout.
    valid_preds = clf.predict_proba(X_valid)[:, -1]
    train_preds = clf.predict_proba(X_train)[:, -1]
    test_preds = clf.predict_proba(tests)[:, -1]
    test_preds_array += test_preds / n_splits
    valid_preds_array[valid_idx] = valid_preds

    score_valid = metrics.roc_auc_score(y_valid, valid_preds)
    score_train = metrics.roc_auc_score(y_train, train_preds)

    scores_valid.append(score_valid)
    scores_train.append(score_train)



[100]	valid_0's auc: 0.999037




[100]	valid_0's auc: 1




[100]	valid_0's auc: 0.999416




[100]	valid_0's auc: 1




[100]	valid_0's auc: 0.999599


In [14]:
# Write the fold-averaged test predictions to <output_path>/submission.csv.
submit(test_preds_array)
