In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
    )
)

In [None]:
# Prepare the modelling inputs:
#   X         - every training column except the row id and the label
#   y         - the label column
#   test_data - every test column except the row id
# DataFrame.drop() already returns a new frame, so no extra .copy() is needed;
# the source frames `train` and `test` are left untouched.
X = train.drop(columns=['id', 'target'])
y = train['target']

test_data = test.drop(columns=['id'])

In [None]:
import random
import os

SEED = 12345


def seed_everything(seed=64):
    """Seed Python's `random` module and NumPy's global RNG with `seed`.

    Also exports the seed as the PYTHONHASHSEED environment variable.
    NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    presumably only influences processes launched afterwards - confirm if
    hash determinism of the current kernel matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(SEED)

# feature selection
def select_features(X_train, y_train, X_test, k='all'):
    """Score features with `f_classif` and keep the `k` best ones.

    The selector is fitted on the training split only and the same fitted
    selector is then applied to both splits.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the selector.
    X_test : features transformed with the already-fitted selector.
    k : int or 'all', default 'all'
        Number of top-scoring features to keep. The default keeps every
        feature (scores are still computed), matching the previous behaviour.

    Returns
    -------
    (X_train_fs, X_test_fs, fs) : the transformed training features, the
        transformed test features, and the fitted `SelectKBest` instance
        (per-feature scores are available as `fs.scores_`).
    """
    fs = SelectKBest(score_func=f_classif, k=k)
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # transform train input data
    X_train_fs = fs.transform(X_train)
    # transform test input data
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
# Hold out 33% of the rows; the selector is fitted on the remaining rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED
)
# Fit the selector and transform both splits.
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# Print one score per feature, identified by its positional index.
for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))
# Bar chart of the scores, one bar per feature position.
plt.bar(np.arange(len(fs.scores_)), fs.scores_)
plt.show()

In [None]:
# Hard-coded list of features kept after setting a threshold of 100
# (presumably on the feature scores printed earlier - confirm).
selected_features = [
    'f1', 'f2', 'f3', 'f4', 'f6', 'f7', 'f8',
    'f12', 'f13', 'f14', 'f17', 'f18', 'f19',
    'f22', 'f42', 'f43', 'f44', 'f48',
    'f52', 'f53', 'f56', 'f58',
    'f63', 'f69', 'f72', 'f73', 'f74', 'f75', 'f77', 'f78',
    'f90', 'f92', 'f95', 'f96', 'f98', 'f99',
    'f103', 'f112', 'f119', 'f125', 'f127',
    'f130', 'f134', 'f136', 'f138', 'f139',
    'f141', 'f143', 'f150', 'f152', 'f154', 'f156',
    'f162', 'f169', 'f173', 'f179', 'f192', 'f195',
    'f200', 'f201', 'f211', 'f214', 'f227',
    'f241', 'f243', 'f245', 'f247', 'f252', 'f258', 'f266',
]

# Restrict the training features to the selected columns, in this order.
X = X.loc[:, selected_features]

In [None]:
# Give the test matrix exactly the columns (and column order) of the training
# matrix X instead of maintaining a second hand-copied list that could drift.
# Relies on X having already been reduced to the selected features.
# Note: this rebinds `test` from the raw CSV frame to the model-ready features.
test = test_data[list(X.columns)]

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, auc
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Keyword arguments unpacked into LGBMClassifier(...) by the training loop.
params = dict(
    max_depth=3,
    colsample_bytree=0.3,
    subsample=0.5,
    reg_alpha=18,
    reg_lambda=17,
    num_leaves=7,
    objective='binary',
    importance_type='gain',
)

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

# Two 7-fold shuffled splitters sharing the same seed: plain and stratified.
# In the code shown here only `kf` is consumed by the training loop.
kf, skf = (
    splitter(n_splits=7, shuffle=True, random_state=SEED)
    for splitter in (KFold, StratifiedKFold)
)

In [None]:
# Cross-validated LightGBM training.
#   oof_lgb         - out-of-fold probability for every training row
#   predictions_lgb - running SUM of the per-fold test-set probabilities
#                     (it is averaged later, when the submission is built)
oof_lgb = np.zeros(len(X))
predictions_lgb = np.zeros(len(test))

# Convert to ndarrays once instead of calling `.values` again on every fold,
# and predict on the same kind of input (ndarray) the model is fitted on.
X_arr, y_arr = X.values, y.values
test_arr = test.values

for i, (train_ix, test_ix) in enumerate(kf.split(X_arr), start=1):

    print("Out of fold predictions generating for fold  {} \n".format(i))

    train_X, train_y = X_arr[train_ix], y_arr[train_ix]
    test_X, test_y = X_arr[test_ix], y_arr[test_ix]

    # n_estimators is only an upper bound; early stopping below picks the
    # actual number of boosting rounds. (Add device='gpu' to train on a GPU.)
    model_lgb = LGBMClassifier(
        random_state = SEED,
        n_estimators = 20000,
        learning_rate = 0.095,
        **params
    )

    # The held-out fold doubles as the early-stopping set, so the fold AUC
    # reported below is measured on data that influenced the stopping point.
    # NOTE(review): `early_stopping_rounds` and `verbose` were removed from
    # fit() in LightGBM 4.0; on newer versions pass
    # callbacks=[lightgbm.early_stopping(300), lightgbm.log_evaluation(100)].
    model_lgb.fit(
        train_X,
        train_y,
        eval_set = [(test_X, test_y)],
        eval_metric = "auc",
        early_stopping_rounds = 300,
        verbose = 100,
    )

    # Positive-class probability for the held-out rows; each row belongs to
    # exactly one validation fold, so a plain assignment is sufficient.
    fold_pred = model_lgb.predict_proba(test_X)[:, 1]
    oof_lgb[test_ix] = fold_pred
    predictions_lgb += model_lgb.predict_proba(test_arr)[:, 1]

    print("AUC for fold {} \t\t {} \n".format(i, round(roc_auc_score(test_y, fold_pred), 7)))

# AUC of the out-of-fold predictions over all training rows.
print("AUC for Training Set: \t\t {} \n".format(round(roc_auc_score(y, oof_lgb), 7)))

In [None]:
submission = pd.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv")
# predictions_lgb is the sum of the per-fold test probabilities; divide by the
# splitter's own fold count (rather than a hard-coded 7) so the average stays
# correct if n_splits is changed.
submission['target'] = predictions_lgb / kf.get_n_splits()
submission.to_csv("submission.csv", index = False)
submission.head(10)