# TPS September 2021

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

In [None]:
# Load the competition train and test sets.
# For quick experiments, subsample the training frame (e.g. append
# `.sample(frac=0.25, random_state=42)`) or pass `nrows=500000` / `nrows=10000`
# to `read_csv` for train / test respectively.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')


# Feature Engineering and Missing Values
Building on [TPS Sep 2021 single LGBM](https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm/notebook) by [@hiro5299834](https://www.kaggle.com/hiro5299834)


In [None]:
# Stack train and test so that imputation, scaling and the row-wise statistics
# below are computed the same way for both; `is_train` lets us split them again.
train['is_train'] = True
test['is_train'] = False
# `DataFrame.append` was removed in pandas 2.0. `pd.concat(..., ignore_index=True)`
# builds the same stacked frame with a fresh 0..n-1 index.
X = pd.concat([train, test], ignore_index=True)
del train, test

# Preserve the frame's column order. Building the list from a `set` gives an
# order that can change between kernel restarts (string hashing is randomized),
# which makes anything order-dependent downstream -- e.g. seeded column
# subsampling in the model -- non-reproducible.
non_feature_cols = {'claim', 'id', 'is_train'}
raw_features = [col for col in X.columns if col not in non_feature_cols]

# Missing-value indicators, computed BEFORE imputation wipes the NaNs out.
missing_mask = X[raw_features].isna()
X['n_missing'] = missing_mask.sum(axis=1).astype('int')
X['n_missing_std'] = missing_mask.std(axis=1).astype('float')
# Row mean of the raw values (NaNs are skipped by pandas' default skipna=True).
X['mean_orig'] = X[raw_features].mean(axis=1)

# Impute each column with its mean over the combined train+test frame.
X[raw_features] = X[raw_features].fillna(X[raw_features].mean())

# log(1 + x) of selected columns; negatives are clipped to 0 first so the log
# is always defined. These columns are created after imputation and are NOT
# part of `raw_features`, so they are neither robust-scaled nor included in the
# row statistics below.
for el in ['f40', 'f70', 'f45', 'f47', 'f1', 'f28', 'f13', 'f42', 'f65']:
    X[el + 'log'] = np.log(X[el].clip(lower=0) + 1)

# Robust-scale the raw columns so the row-wise statistics compare columns on a
# common scale.
scaler = RobustScaler()
X[raw_features] = scaler.fit_transform(X[raw_features])

# Row-wise summary statistics over the scaled raw columns.
X['med'] = X[raw_features].median(axis=1)
X['max2'] = X[raw_features].abs().max(axis=1)
X['min'] = X[raw_features].min(axis=1)
X['skew'] = X[raw_features].skew(axis=1)
X['mean2'] = (X[raw_features] ** 2).mean(axis=1)

# Final model inputs: every column except identifiers/target and 'f85'
# (the reason for excluding 'f85' is not recorded in this notebook).
excluded_cols = {'claim', 'id', 'is_train', 'f85'}
features = [col for col in X.columns if col not in excluded_cols]

In [None]:
# Summary statistics of the combined train+test frame after feature engineering.
X.describe()

# Training

LGBM classifier with starter parameters from https://www.kaggle.com/hsuchialun/tps-lightgbm-kfold.


In [None]:
# Undo the train/test stacking using the boolean `is_train` marker column.
train_rows = X['is_train']

# Target values exist only for the training rows.
y = X.loc[train_rows, 'claim']
# Test rows keep every column; the model inputs are selected later.
test = X.loc[~train_rows]
# From here on `X` holds only the training rows and the model input columns.
X = X.loc[train_rows, features]

In [None]:
from lightgbm import LGBMClassifier, early_stopping

final_predictions = []  # one vector of test-set probabilities per fold
valid_scores = []       # validation AUC per fold
imp = pd.DataFrame(index=X.columns)  # feature importances, one column per fold

# Column selection does not depend on the fold, so do it once. No defensive copy
# is needed because `scaler.transform` returns a new array.
test_features = test[features]

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X=X)):
    # KFold yields POSITIONAL indices, so select with `.iloc`. (`.loc` only
    # worked by coincidence while the index happened to be 0..n-1.)
    X_train = X.iloc[train_idx]
    X_valid = X.iloc[valid_idx]
    y_train = y.iloc[train_idx]
    y_valid = y.iloc[valid_idx]

    # Fit the scaler on the training fold only, then apply it to the
    # validation fold and the test set.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)
    X_test = scaler.transform(test_features)

    model = LGBMClassifier(
        max_depth=3,
        num_leaves=7,
        n_estimators=20000,  # upper bound; early stopping picks the real count
        colsample_bytree=0.3,
        # NOTE(review): LightGBM documents that row subsampling is only enabled
        # when `subsample_freq` > 0 (default 0), so this value is presumably
        # inactive -- confirm before tuning it.
        subsample=0.5,
        random_state=42,
        reg_alpha=18,
        reg_lambda=17,
        learning_rate=0.095,
        device='gpu',
        objective='binary',
    )

    # `early_stopping_rounds=` and `verbose=` were removed from `fit()` in
    # LightGBM 4.0; the callback is the supported way to request early stopping.
    # `verbose=False` silences the callback's own messages.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="auc",
        callbacks=[early_stopping(stopping_rounds=400, verbose=False)],
    )

    preds_valid = model.predict_proba(X_valid)[:, 1]
    preds_test = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_valid, preds_valid)
    final_predictions.append(preds_test)
    valid_scores.append(score)
    print(f'Valid score for Fold {fold} : {score}')

    # The scaler returns plain arrays, so importances line up with `X.columns`
    # by position.
    imp["Fold_" + str(fold)] = model.feature_importances_

imp["Fold_mean"] = imp.mean(axis=1)
imp = imp.sort_values('Fold_mean', ascending=False)

print('\nAverage valid score: ', np.mean(valid_scores))
print('\nFeature Importance\n')
imp.head(10)

# Submission 

In [None]:
# Start from the sample submission so ids and column names match the expected format.
sub = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# One column per fold; the final prediction is the row-wise mean across folds.
fold_matrix = np.column_stack(final_predictions)
preds = np.mean(fold_matrix, axis=1)

# The second column of the sample file is the prediction column to overwrite.
target_col = sub.columns[1]
sub[target_col] = preds
sub.to_csv("submission.csv", index=False)
sub.describe()

In [None]:
# Mean predicted probability over the test set (fold-averaged predictions).
preds.mean()

In [None]:
# Display the final submission frame for a visual check.
sub