In [None]:
import gc
import numpy as np
import pandas as pd
import datatable as dt
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
%%time
# Read the competition train/test CSVs with datatable and hand them to pandas;
# the two files are processed in order by the same fread -> to_pandas step.
df_train, df_test = (
    dt.fread(csv_path).to_pandas()
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
    )
)

# The sample submission is read directly with pandas.
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-oct-2021/sample_submission.csv'
)

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int8/16/32/64 or float16/32/64 are inspected;
    every other column (bool, object, category, ...) is left untouched.
    Integer columns are cast to the smallest signed integer type whose
    ``[min, max]`` range contains the column's observed minimum and maximum.
    Float columns are cast to float16 if their range fits, else float32, else
    float64.

    The range test is inclusive, so a column whose extreme value sits exactly
    on a dtype limit (e.g. a maximum of 127) still gets that dtype.

    Note that the float choice is made on range only: casting to float16 or
    float32 can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.
    verbose : bool, default True
        If true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate target dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max is NaN for an empty column): keep as is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Avoid a NaN percentage when the frame reported zero bytes to begin with.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time
# Downcast numeric columns of both frames (train first, then test) to cut memory.
df_train, df_test = reduce_memory_usage(df_train), reduce_memory_usage(df_test)

# # sampling for prototyping
# np.random.seed(2003)
# df_train = df_train.sample(100000)
# df_test = df_test.sample(100000)

In [None]:
# Build the modelling inputs: features and label from the training frame,
# features only from the test frame. Each result is an independent copy so the
# raw frames can be released afterwards.
non_feature_cols = ['id', 'target']
X = df_train.drop(columns=non_feature_cols).copy()
y = df_train['target'].copy()

test_data = df_test.drop(columns=['id']).copy()

# The raw frames are no longer needed; drop the references and collect.
del df_train, df_test
gc.collect()

In [None]:
from sklearn.metrics import roc_curve, auc

def get_auc(y_true, y_hat):
    """Return the area under the ROC curve of scores ``y_hat`` against labels ``y_true``.

    The curve is built with ``roc_curve`` and integrated with ``auc``; the
    thresholds returned by ``roc_curve`` are not used.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_hat)
    return auc(false_pos_rate, true_pos_rate)

In [None]:
# Names of the boolean-typed feature columns.
disc_feat = [col for col in X.columns if X[col].dtype == 'bool']

# Hyper-parameters handed to LGBMClassifier(**lgbm_params).
#
# No 'categorical_feature' entry is set here on purpose. LightGBM reads that
# parameter as the *indices or names* of the categorical columns, so passing
# the count `len(disc_feat)` would either mark the single column at that index
# as categorical or be overridden by the scikit-learn wrapper. The bool columns
# are two-valued, so they can be consumed as ordinary numeric features. To
# declare them explicitly, pass `categorical_feature=disc_feat` to `fit`.
lgbm_params = {
    'objective': 'binary',
    'device_type': 'gpu',  # requires a GPU-enabled LightGBM build
    'n_estimators': 20000,  # upper bound; training relies on early stopping
    'learning_rate': 0.01,
    'min_child_weight': 256,
    'min_child_samples': 20,
    'reg_alpha': 10,
    'reg_lambda': 0.1,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
}

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

# Seeds for the repeated cross-validation runs, drawn from [0, 1000).
# They come from a generator with a fixed master seed so that a fresh
# "Restart & Run All" reproduces the same splits, and they are sampled without
# replacement: `scores` and `predictions` are keyed by seed, so a repeated
# seed would merge two runs into a single entry.
seeds = np.random.default_rng(2003).choice(1000, size=3, replace=False).tolist()

# Per-seed containers: one validation AUC and one test-prediction vector per fold.
scores = {seed: list() for seed in seeds}
predictions = {seed: list() for seed in seeds}

for seed in seeds:

    # The seed controls how the rows are shuffled into the 5 stratified folds.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = LGBMClassifier(**lgbm_params)

        # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keywords
        # of the LightGBM 3.x scikit-learn API; LightGBM 4 expects the
        # equivalent callbacks instead. Confirm the installed version before
        # upgrading.
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=200,
            verbose=0
        )

        # validation prediction (probability of the positive class)
        valid_pred = model.predict_proba(X_valid)[:, 1]
        score = get_auc(y_valid, valid_pred)
        scores[seed].append(score)

        print(f"Seed: {seed} || Fold: {fold + 1} || Score: {score}")
        print('--'*20)

        # test prediction from this fold's model, kept for later averaging
        y_hat = model.predict_proba(test_data)[:, 1]
        predictions[seed].append(y_hat)
        gc.collect()

    print(f"Overall Validation Score Seed {seed}: {np.mean(scores[seed])}")
    print('::'*20)
    gc.collect()

In [None]:
# Tabulate the validation scores: one column per seed, one row per fold.
df_scores = pd.DataFrame(dict(scores))

df_scores

In [None]:
# For every seed, average that seed's per-fold test predictions row by row.
per_seed_mean = {
    seed_key: np.column_stack(fold_preds).mean(axis=1)
    for seed_key, fold_preds in predictions.items()
}
df_predictions = pd.DataFrame(per_seed_mean)

# Simple blend: unweighted mean across the per-seed columns.
df_predictions = df_predictions.assign(simple_avg=df_predictions.mean(axis=1))

# Write the blended prediction into the submission template and save it.
sample_submission['target'] = df_predictions['simple_avg']
sample_submission.to_csv('./random_seeds_blending_submission.csv', index=False)