# How to get a good score?  TPS September 2021 (competition)
### Follow the steps below

In [None]:
import numpy as np 
import pandas as pd 

# Load the competition files; the raw frames are left unmodified.
raw_train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
raw_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

# Report the dimensions of the data as loaded (before any column is dropped).
print("train shape :", raw_train.shape)
print("test shape :", raw_test.shape)

# Separate the target column from the training data.
target = raw_train['claim']

# Working copies without the `id` column (and without the target for train).
# `drop` returns a new frame, so the raw data is not altered.
train = raw_train.drop(columns=['id', 'claim'])
test = raw_test.drop(columns=['id'])

### Feature Engineering and Cleaning

In [None]:
from sklearn.impute import SimpleImputer

# Column labels captured before the engineered columns are added, so the
# row statistics and the imputer below only ever see these columns.
features = train.columns

# Row-wise summaries for the training data, computed before imputation
# while the missing values are still present.
train['n_missing'] = train[features].isnull().sum(axis='columns')
train['std'] = train[features].std(axis='columns')

# Same row-wise summaries for the test data.
test['n_missing'] = test[features].isnull().sum(axis='columns')
test['std'] = test[features].std(axis='columns')

# Mean imputation: the column means are learned from the training data only
# and then applied to both the training and the test data.
si = SimpleImputer(strategy='mean')
si.fit(train[features])
train[features] = si.transform(train[features])
test[features] = si.transform(test[features])

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# LightGBM hyper-parameters shared by every fold model.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 6,
    'max_depth': 2,
    'learning_rate': 0.1,
    # Upper bound on boosting rounds; early stopping ends training sooner.
    'n_estimators': 40000,
    'reg_alpha': 25.0,
    'reg_lambda': 76.7,
    'bagging_seed': 42,
    'feature_fraction_seed': 42,
    'n_jobs': 4,
    'subsample': 0.98,
    'subsample_freq': 1,
    'colsample_bytree': 0.69,
    'min_child_samples': 54,
    'min_child_weight': 256,
}

preds_valid_f = {}   # out-of-fold predictions, keyed by the row's index label
preds_test = []      # one array of test-set probabilities per fold
total_auc = []       # validation AUC of each fold

kf = KFold(n_splits=5, random_state=0, shuffle=True)

for fold, (train_index, valid_index) in enumerate(kf.split(train, target)):

    # KFold.split yields *positional* indices, so rows must be selected with
    # .iloc; .loc only gives the same rows when the index is a default
    # RangeIndex.
    X_train, X_valid = train.iloc[train_index], train.iloc[valid_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]

    # Index labels of the validation rows, used as keys for the OOF predictions
    index_valid = X_valid.index.tolist()
    #--------------------------------------------------------
    # model
    model = lgb.LGBMClassifier(**params, random_state=0)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="auc",
        # Early stopping and log silencing are passed as callbacks: the
        # `early_stopping_rounds` and `verbose` arguments of fit() were
        # deprecated in LightGBM 3.3 and removed in 4.0.
        callbacks=[
            lgb.early_stopping(stopping_rounds=300, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )
    # oof
    preds_valid = model.predict_proba(X_valid)[:, 1]
    #--------------------------------------------------------
    preds_test.append(model.predict_proba(test)[:, 1])
    #--------------------------------------------------------
    preds_valid_f.update(zip(index_valid, preds_valid))

    # Getting score for a fold model
    fold_auc = roc_auc_score(y_valid, preds_valid)
    print(f"Fold {fold} roc_auc_score: {fold_auc}")
    # Collect the per-fold AUC for the summary below
    total_auc.append(fold_auc)

print(f"mean roc_auc_score: {np.mean(total_auc)}, std: {np.std(total_auc)}")

In [None]:
# Build the submission: the predicted `claim` value for each test id is the
# element-wise mean of the per-fold test predictions.
output = pd.DataFrame(
    {
        'id': raw_test['id'],
        'claim': np.asarray(preds_test).mean(axis=0),
    }
)
output.to_csv('preds.csv', index=False)
# Preview the first rows of the submission
output.head()