In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

In [None]:
# Load the competition train/test CSVs in full.
# For quick experiments the data can be subsampled instead, e.g.
# `.sample(frac=0.25, random_state=42)` on the train frame or `nrows=...` in read_csv.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

In [None]:
# Stack train and test so that every transform below is applied identically to
# both. Note that the imputation means and the RobustScaler statistics are
# therefore computed over train and test rows combined.
train['is_train'] = True
test['is_train'] = False
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same stacked frame with a fresh 0..n-1 index.
X = pd.concat([train, test], ignore_index=True)
del train, test

# Keep the frame's own column order. Building the list from a set made the
# order depend on string hashing, which changes between interpreter runs and
# made the model inputs (and column subsampling) irreproducible.
non_feature_cols = {'claim', 'id', 'is_train'}
features = [col for col in X.columns if col not in non_feature_cols]

# Row-wise statistics taken BEFORE imputation, while the NaNs are still present.
missing_mask = X[features].isna()
X['n_missing'] = missing_mask.sum(axis=1).astype('int')
X['n_missing_std'] = missing_mask.std(axis=1).astype('float')
X['mean_orig'] = X[features].mean(axis=1)

# Impute every feature with its column mean.
X[features] = X[features].fillna(X[features].mean())

# log(1 + x) copies of selected features; negatives are clipped to 0 first so
# the logarithm is always defined.
for col_name in ['f40', 'f70', 'f45', 'f47', 'f1', 'f28']:
    X[col_name + 'log'] = np.log(X[col_name].clip(lower=0) + 1)

# Scale only the original feature columns (the log copies stay unscaled).
scaler = RobustScaler()
X[features] = scaler.fit_transform(X[features])

# Row-wise statistics on the imputed + scaled features.
X['med'] = X[features].median(axis=1)
X['max2'] = X[features].abs().max(axis=1)
X['min'] = X[features].min(axis=1)
X['skew'] = X[features].skew(axis=1)
X['mean2'] = (X[features] ** 2).mean(axis=1)

# Final model inputs: everything except bookkeeping columns, the target and
# 'f85', again in deterministic column order.
excluded_cols = {'claim', 'id', 'is_train', 'f85'}
features = [col for col in X.columns if col not in excluded_cols]

In [None]:
# Undo the stacking: pull the target and the test rows out first, then shrink
# X to the training rows and the selected feature columns.
train_rows = X['is_train']
y = X.loc[train_rows, 'claim']
test = X.loc[~train_rows]
X = X.loc[train_rows, features]

In [None]:
from lightgbm import LGBMClassifier

final_predictions = []  # one array of test-set probabilities per fold
valid_scores = []       # validation AUC per fold
imp = pd.DataFrame(index=X.columns)  # feature importances, one column per fold

# The unscaled test features are the same for every fold, so select them once.
test_features = test[features]

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X=X)):
    # KFold.split yields positional indices, so index by position (.iloc).
    # Label-based .loc only gave the right rows while the index was 0..n-1.
    X_train = X.iloc[train_idx]
    X_valid = X.iloc[valid_idx]
    y_train = y.iloc[train_idx]
    y_valid = y.iloc[valid_idx]

    # Standardise using statistics of the training fold only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)
    X_test = scaler.transform(test_features)

    model = LGBMClassifier(
        max_depth=3,
        num_leaves=7,
        n_estimators=20000,   # upper bound; early stopping decides the real count
        colsample_bytree=0.3,
        # NOTE(review): subsample_freq is not set here; LightGBM's docs suggest
        # bagging stays disabled in that case - confirm subsample has any effect.
        subsample=0.5,
        random_state=42,
        reg_alpha=18,
        reg_lambda=17,
        learning_rate=0.095,
        device='gpu',
        objective='binary',
    )

    # NOTE(review): the `verbose` / `early_stopping_rounds` fit() keywords were
    # removed in LightGBM 4.0; newer versions need the callbacks API instead.
    model.fit(X_train, y_train,
              verbose=False,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              early_stopping_rounds=400)

    preds_valid = model.predict_proba(X_valid)[:, 1]
    preds_test = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_valid, preds_valid)
    final_predictions.append(preds_test)
    valid_scores.append(score)
    print(f'Valid score for Fold {fold} : {score}')

    imp["Fold_" + str(fold)] = model.feature_importances_

# Rank features by their mean importance across folds.
imp["Fold_mean"] = imp.mean(axis=1)
imp = imp.sort_values('Fold_mean', ascending=False)

print('\nAverage valid score: ', np.mean(valid_scores))
print('\nFeature Importance\n')
imp.head(10)

In [None]:
# Build the submission from the sample file: the second column receives the
# row-wise mean of the per-fold test predictions.
sub = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
target_col = sub.columns[1]
fold_matrix = np.column_stack(final_predictions)  # shape: (rows, folds)
preds = fold_matrix.mean(axis=1)
sub[target_col] = preds
sub.to_csv("submission_mdl1.csv", index=False)
sub.describe()

# model 3

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from skimage.filters import threshold_otsu
import lightgbm as lgb
import gc

# Single seed reused by the StratifiedKFold splitter and the LightGBM
# random_state / bagging_seed / feature_fraction_seed parameters.
SEED = 0

In [None]:
# Reload the raw competition data for this model. `id` becomes the index, so it
# never appears among the data columns.
train = pd.read_csv("/kaggle/input/tabular-playground-series-sep-2021/train.csv", index_col='id')
test = pd.read_csv("/kaggle/input/tabular-playground-series-sep-2021/test.csv", index_col='id')

In [None]:
# Raw feature columns are the ones whose name begins with "f"; this leaves out
# the "claim" target.
features = [col for col in train.columns.values if col[0] == "f"]

In [None]:
def _add_row_stats(frame, cols):
    """Add row-wise summary features of ``frame[cols]`` to ``frame`` in place.

    Every statistic is computed from the same snapshot of the raw ``cols``
    values, so the new columns never feed into each other. Using one helper
    for both train and test guarantees the two frames receive identical
    feature definitions (the test 'max' column used to be filled with the
    row minimum by mistake).

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; modified in place.
    cols : list of str
        Columns the statistics are computed over.
    """
    values = frame[cols]
    frame['n_missing'] = values.isna().sum(axis=1)
    frame['abs_sum'] = values.abs().sum(axis=1)
    frame['sem'] = values.sem(axis=1)
    frame['std'] = values.std(axis=1)
    frame['avg'] = values.mean(axis=1)
    frame['max'] = values.max(axis=1)
    frame['min'] = values.min(axis=1)


_add_row_stats(train, features)
_add_row_stats(test, features)

In [None]:
# Target first, then the model inputs: everything in train except the target.
# The test frame is already in input form.
y = train["claim"]
X = train.drop(columns=["claim"])
X_test = test

In [None]:
# Learn per-column medians on the training inputs, then fill the missing values
# of both sets with them. The outputs replace X / X_test.
imputer = SimpleImputer(strategy="median")
imputer.fit(X)
X, X_test = imputer.transform(X), imputer.transform(X_test)

In [None]:
# Fit the robust scaler on the training inputs only and apply it to both sets.
scaler = RobustScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [None]:
# The source frames and the fitted transformers are not referenced again in
# this model's cells (only X, X_test and y are), so drop those names and run a
# garbage-collection pass.
del test, train, scaler, imputer
gc.collect()

In [None]:
# LightGBM hyperparameters shared by both training stages.
# 'learning_rate' is deliberately absent: the CV loop sets it per stage.
lgbm_params = dict(
    objective='binary',
    boosting_type='gbdt',
    # tree shape
    num_leaves=6,
    max_depth=2,
    n_estimators=40000,
    # regularisation
    reg_alpha=25.0,
    reg_lambda=76.7,
    # seeding
    random_state=SEED,
    bagging_seed=SEED,
    feature_fraction_seed=SEED,
    n_jobs=-1,
    # row / column subsampling
    subsample=0.98,
    subsample_freq=1,
    colsample_bytree=0.69,
    # leaf constraints
    min_child_samples=54,
    min_child_weight=256,
    # evaluation / logging
    metric='AUC',
    verbosity=-1,
)

In [None]:
%%time

splits = 5
kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=SEED)

# Running average of the test-set predictions over all folds.
preds = np.zeros(len(X_test))

for train_idx, valid_idx in kf.split(X, y):
    # X is a NumPy array (output of the imputer/scaler) and is indexed by
    # position. y is a Series indexed by `id`, so it must be indexed by
    # position explicitly; plain y[idx] would look the indices up as id labels.
    lgb_train = lgb.Dataset(X[train_idx], y.iloc[train_idx], free_raw_data=False)
    lgb_valid = lgb.Dataset(X[valid_idx], y.iloc[valid_idx], free_raw_data=False)

    # Stage 1: train with the larger learning rate until early stopping.
    lgbm_params['learning_rate'] = 0.2

    model = lgb.train(lgbm_params,
                      lgb_train,
                      verbose_eval=-1,
                      early_stopping_rounds=300,
                      valid_sets=[lgb_valid])

    # Stage 2: continue from the stage-1 model with a smaller learning rate.
    lgbm_params['learning_rate'] = 0.1

    model = lgb.train(lgbm_params,
                      lgb_train,
                      init_model=model,
                      verbose_eval=-1,
                      early_stopping_rounds=300,
                      valid_sets=[lgb_valid])

    # Each fold contributes an equal share to the final prediction.
    preds += model.predict(X_test) / splits

    gc.collect()

In [None]:
# Fill the sample submission (indexed by id) with the fold-averaged predictions
# and save it.
sample_path = '../input/tabular-playground-series-sep-2021/sample_solution.csv'
submission = pd.read_csv(sample_path, index_col='id')
submission['claim'] = preds
submission.to_csv('submission_mdl5.csv')

In [None]:
# %%time
# Read the raw train and test data for the CatBoost model.
df_train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

# Template for the submission file; its 'claim' column is overwritten later.
sample_submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

In [None]:
# Separate the model inputs from the target. The id column is dropped from
# both the train and test inputs.
id_col, target_col = 'id', 'claim'

y = df_train[target_col].copy()
X = df_train.drop(columns=[id_col, target_col]).copy()
test_data = df_test.drop(columns=[id_col]).copy()

In [None]:
# feature Engineering
def get_stats_per_row(data):
    """Append row-wise summary statistics to ``data`` and return it.

    Adds three columns, each computed over the source columns only:

    * ``mv_row``  - number of missing values in the row
    * ``min_row`` - row minimum
    * ``std_row`` - row standard deviation

    The statistics are taken from a snapshot of the source columns. Without the
    snapshot, ``min_row`` would also range over the freshly added ``mv_row``,
    and ``std_row`` over both new columns, which distorts the features. Columns
    left over from a previous call are ignored and overwritten, so calling the
    function again gives the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of numeric columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the three columns added.
    """
    stat_cols = ('mv_row', 'min_row', 'std_row')
    source = data[[col for col in data.columns if col not in stat_cols]]

    data['mv_row'] = source.isna().sum(axis=1)
    data['min_row'] = source.min(axis=1)
    data['std_row'] = source.std(axis=1)
    return data

# Add the per-row statistic columns to both the training inputs and the test set.
X = get_stats_per_row(X)
test_data = get_stats_per_row(test_data)

In [None]:
# create preprocessing pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Mean-impute, then standardise. The pipeline is fitted on the training inputs
# and re-used unchanged for the test set.
mean_imputer = SimpleImputer(strategy='mean')
standardiser = StandardScaler()
pipeline = Pipeline(steps=[
    ('impute', mean_imputer),
    ('scale', standardiser),
])

# The pipeline returns plain arrays; wrap them back into frames that keep the
# original column names.
X = pd.DataFrame(pipeline.fit_transform(X), columns=X.columns)
test_data = pd.DataFrame(pipeline.transform(test_data), columns=test_data.columns)

In [None]:
# CatBoost hyperparameters taken from an earlier Optuna study.
best_params = dict(
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    # hardware / logging
    task_type='GPU',
    devices='0',
    verbose=0,
)

In [None]:
%%time
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from catboost import CatBoostClassifier

kf = KFold(n_splits=5, shuffle=True, random_state=1)

pred_tmp = []  # test-set probabilities, one array per fold
scores = []    # validation AUC, one value per fold

for fold, (idx_train, idx_valid) in enumerate(kf.split(X)):
    # Positional split of the inputs and the target.
    X_train = X.iloc[idx_train]
    y_train = y.iloc[idx_train]
    X_valid = X.iloc[idx_valid]
    y_valid = y.iloc[idx_valid]

    # NOTE(review): fit() receives no eval_set, so the overfitting detector
    # configured through 'od_wait' presumably never triggers and every
    # iteration is run - confirm against the CatBoost documentation.
    model = CatBoostClassifier(**best_params)
    model.fit(X_train, y_train)

    # Validation AUC: area under the ROC curve of the positive-class probability.
    pred_valid = model.predict_proba(X_valid)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_valid, pred_valid)
    score = auc(false_pos_rate, true_pos_rate)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print('::'*20)

    # Keep this fold's test-set probabilities for averaging later.
    pred_tmp.append(model.predict_proba(test_data)[:, 1])

print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Stack the per-fold predictions side by side (one column per fold) and take
# the row-wise mean.
fold_matrix = np.column_stack(pred_tmp)
predictions = fold_matrix.mean(axis=1)

# Store the averaged probabilities in the sample submission and save it.
sample_submission['claim'] = predictions
sample_submission.to_csv('./mdl_1212.csv', index=False)