In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import lightgbm as lgb
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.cluster import MiniBatchKMeans\



In [None]:
# Load the competition's train and test tables, using the 'id' column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        "/kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/test.csv",
    )
)

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Handling per column dtype:

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the smallest integer type
      of the same signedness whose range contains the column's min and max;
    * float columns are cast to the smallest float type whose range contains
      the column's min and max;
    * every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    A column is never cast to a type that is not strictly smaller than its
    current one, so the function never increases memory usage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    allow_float16 : bool, default True
        When True, float columns whose values fit the float16 range are cast
        to float16. The check is range-based only: float16 keeps roughly three
        significant decimal digits, so values are rounded. Pass False to stop
        at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Dispatching on
        # dtype.kind keeps bool and unsigned columns out of the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        # pandas skips NaN here; an empty or all-NaN column yields NaN, which
        # fails every comparison below and leaves the column unchanged.
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by size: once they stop being smaller
            # than the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Shrink both frames' dtypes (train first, then test) before feature engineering.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# Names of the training columns whose first character is "f".
features = list(filter(lambda name: name[0] == "f", train.columns.values))

In [None]:
# Per-row count of missing feature values, stored as an extra column.
# Both assignments read only the original `features` columns.
train['n_missing'] = train[features].isnull().sum(axis='columns')
test['n_missing'] = test[features].isnull().sum(axis='columns')

In [None]:
# Separate the target column from the predictors; the test frame is used as-is.
y = train["claim"]
X = train.drop(columns=["claim"])
X_test = test

In [None]:
# Drop the original frame names (X, X_test and y still reference the data
# they need) and ask the garbage collector to run.
del train
del test
gc.collect()

In [None]:
# Standardise every column: the scaler is fitted on the training matrix only
# and the same fitted transform is then applied to both train and test.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [None]:
# LightGBM hyperparameters passed to lgb.train in the cross-validation cell.
# 'n_estimators' is a high ceiling; that cell also applies early stopping.
# 'device' is set to 'gpu', so the run expects a GPU-enabled LightGBM.
lgbm_params = dict(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=6,
    max_depth=4,
    learning_rate=0.025,
    n_estimators=40000,
    reg_alpha=25.0,
    reg_lambda=76.7,
    random_state=0,
    bagging_seed=0,
    feature_fraction_seed=0,
    n_jobs=-1,
    subsample=0.98,
    subsample_freq=1,
    colsample_bytree=0.69,
    min_child_samples=54,
    min_child_weight=256,
    device='gpu',
    metric='AUC',
    verbosity=-1,
)

In [None]:
%%time

# Stratified K-fold training. Each fold trains a LightGBM booster with early
# stopping on the held-out part, adds its test prediction (divided by the
# number of folds) to `preds`, and stores its out-of-fold prediction in
# `oof_preds`.
splits = 3
kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

oof_preds = np.zeros(len(X))
preds = np.zeros(len(X_test))
total_mean_auc = 0

# kf.split yields positional indices. `y` is a pandas Series indexed by 'id',
# so `y[train_idx]` would be a label lookup that is only correct when the ids
# happen to be 0..n-1. Converting to a NumPy array makes the indexing
# positional, matching how X is indexed.
y_values = np.asarray(y)

for num, (train_idx, valid_idx) in enumerate(kf.split(X, y_values)):
    X_train, y_train = X[train_idx], y_values[train_idx]
    X_valid, y_valid = X[valid_idx], y_values[valid_idx]

    lgb_train = lgb.Dataset(X_train, y_train)
    # Tie the validation set to the training set so both share one binning.
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
    # accepted by lgb.train in LightGBM 4.
    model = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=[lgb_valid],
                      callbacks=[lgb.early_stopping(stopping_rounds=100),
                                 lgb.log_evaluation(period=1000)])
    preds += model.predict(X_test) / splits

    # Out-of-fold predictions and the per-fold validation score.
    oof_preds[valid_idx] = model.predict(X_valid)
    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {num} ROC AUC: {fold_auc}")
    total_mean_auc += fold_auc / splits
print(f"\nOverall ROC AUC: {total_mean_auc}")

In [None]:
# Fill the sample submission's 'claim' column with the fold-averaged test
# predictions and write it out. The sample file is read from the same absolute
# dataset directory as train.csv/test.csv, so the path does not depend on the
# notebook's working directory.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv', index_col='id')
submission['claim'] = preds
submission.to_csv('submission_1.csv')