In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's
      observed minimum and maximum (bounds are inclusive).
    * Float columns are cast to float16 or float32 when their observed
      range fits. float16 has only an 11-bit significand, so values are
      rounded when a column is narrowed to it.
    * Object and pandas string columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, category, nullable
      extension types) is left untouched, which also makes the function
      safe to call on its own output.

    A column is never widened: only candidates strictly narrower than the
    current dtype are considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    narrow_floats = (np.float16, np.float32)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype
        is_numpy_dtype = isinstance(col_type, np.dtype)

        # Text-like columns become categoricals.
        if isinstance(col_type, pd.StringDtype) or (is_numpy_dtype and col_type.kind == 'O'):
            df[col] = series.astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Anything else
        # (bool, datetime, category, ...) has no meaningful numeric range here.
        if not is_numpy_dtype or col_type.kind not in ('i', 'u', 'f'):
            continue
        if series.empty:
            continue

        if col_type.kind == 'f':
            candidates, limits = narrow_floats, np.finfo
            # NaN bounds (all-missing column) fail every comparison below,
            # so such a column simply keeps its dtype.
            c_min, c_max = series.min(), series.max()
        else:
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            limits = np.iinfo
            # Plain Python ints avoid any fixed-width overflow when comparing.
            c_min, c_max = int(series.min()), int(series.max())

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break  # nothing narrower left to try
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = series.astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and downcast its columns.

    Parameters
    ----------
    file : str, path-like or file-like
        Handed to ``pandas.read_csv`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``reduce_mem_usage`` for the parsed data.
    """
    # ``keep_date_col`` is intentionally not passed: it only matters when
    # ``parse_dates`` merges several columns into one, which is not done here,
    # and the keyword is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Load the train and test CSVs (in that order) through import_data.
df_train_ogi, df_test_ogi = (
    import_data(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
    )
)

In [None]:
# Read the submission template with plain read_csv rather than import_data:
# reduce_mem_usage would downcast its float column to float16, and the
# predictions written into that column later are float64.
sample = pd.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv")

In [None]:
# Work on copies so the frames exactly as loaded are not modified below.
df_train, df_test = df_train_ogi.copy(), df_test_ogi.copy()

In [None]:
# Feature columns: every training column whose name contains the letter 'f'.
features = [column for column in df_train.columns if 'f' in column]

In [None]:
# Per-row summary statistics over the feature columns.
#
# axis=1 is essential: with the default axis=0 these reductions return one
# value per *column*, indexed by feature name. Assigning that Series to a new
# column aligns on the row index, finds no matching labels, and fills the
# column with NaN.
#
# The features are cast to float32 first because reduce_mem_usage may have
# narrowed them to float16, which is too coarse to accumulate sums over
# hundreds of columns.
for _frame in (df_train, df_test):
    _row_values = _frame[features].astype(np.float32)
    _frame['mean'] = _row_values.mean(axis=1)
    _frame['var'] = _row_values.var(axis=1)
    _frame['std'] = _row_values.std(axis=1)

# Release the temporary float32 copy instead of leaving it in the kernel.
del _frame, _row_values

In [None]:
# Label column and model inputs (everything except the id and the label).
y = df_train['target']
X = df_train.drop(columns=['id', 'target'])

# The test frame keeps every column except the row identifier.
df_test = df_test.drop(columns='id')

In [None]:
# xtrain ,xtest ,  ytrain , ytest = train_test_split(X , y , random_state = 0, stratify = y)

In [None]:
# def objective(trial,data=xtrain,target=ytrain):
    
#     param = {

#         'lambda': trial.suggest_uniform('lambda',0.001,0.1),
#         'alpha': trial.suggest_uniform('alpha',0.1,0.5),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1,1.0),
#         'subsample': trial.suggest_uniform('subsample', 0.5,0.9),
#         'learning_rate': trial.suggest_uniform('learning_rate', 0.05,0.10),
#         'n_estimators': trial.suggest_int('n_estimators', 10000,30000),
#         'max_depth': trial.suggest_int('max_depth', 3,8),
#         'min_child_weight': trial.suggest_int('min_child_weight', 10,100),        
#         'objective': trial.suggest_categorical('objective',['binary:logistic']), 
#         'tree_method': trial.suggest_categorical('tree_method',['gpu_hist']),  # 'gpu_hist','hist'
#         'eval_metric' : 'logloss'
#     }
#     model = xgb.XGBClassifier(**param)      
#     model.fit(xtrain,ytrain,eval_set=[(xtest,ytest)],early_stopping_rounds=100,verbose=False)
#     preds = model.predict(xtest)
#     auc = roc_auc_score(ytest, preds)
    
#     return auc

In [None]:
# import optuna
# from optuna.samplers import TPESampler
# import sklearn
# sampler = TPESampler()
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=20)
# params = study.best_params #getting best params from study
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# Drop the names of the frames as loaded; only the working copies are used from here on.
del df_train_ogi, df_test_ogi

In [None]:
# from sklearn import model_selection
# from sklearn import metrics 
# params = {
#         "grow_policy": "lossguide",
#         'lambda': 0.022724869921522506, 
#         'alpha': 0.3985407522030936, 
#         'colsample_bytree': 0.5511604708167909, 
#         'subsample': 0.8133327192612618, 
#         'learning_rate': 0.05035773098806651, 
#         'n_estimators': 19522, 
#         'max_depth': 4, 
#         'min_child_weight': 69, 
#         'objective': 'binary:logistic', 
#         'tree_method': 'gpu_hist'
#         }


# # KFold
# n_splits=3
# skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
# scores_train = []
# scores_valid = []
# preds_valid_array = np.zeros((X.shape[0], ))
# print(preds_valid_array)

# preds_test_array = np.zeros((df_test.shape[0], ))
# print(preds_test_array)

# for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

#     print(f"Fold {fold+1} -------------->")
#     x_train, y_train = X[train_idx], y[train_idx]
#     x_valid, y_valid = X[valid_idx], y[valid_idx]

#     y_train_log = y_train
#     y_valid_log = y_valid

#     model= xgb.XGBClassifier(
#                                **params,
#                                eval_metric='auc'
#                                 )
#     model.fit(
#             x_train, y=y_train,
#             eval_set=[(x_valid, y_valid)],
#             early_stopping_rounds=50,
#             verbose=100
#             )

#     preds_train = model.predict_proba(x_train)[:, 1]
#     preds_valid = model.predict_proba(x_valid)[:, 1]
#     preds_test = model.predict_proba(df_test)[:, 1]
    
#     preds_valid_array[valid_idx] += preds_valid
#     preds_test_array += preds_test / n_splits
    
#     score_train = metrics.roc_auc_score(y_train, preds_train)
#     score_valid = metrics.roc_auc_score(y_valid, preds_valid)
#     print(score_valid)
#     scores_train.append(score_train)
#     scores_valid.append(score_valid)
        
# print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
# print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# #pd.DataFrame({'target': preds_valid_array}).to_csv('xgb_valid.csv', index=False)
# sample.iloc[:, 1] = preds_test_array
# sample.to_csv('submission.csv', index=False)

In [None]:
# import gc
# gc.collect()

In [None]:
# params = {
#         "objective": "binary",
#         "metric": "auc",
#         "verbosity": -1,
#         "boosting_type": "gbdt",
#         "device" : "gpu"
#     }

In [None]:
# traindata = lgb.Dataset(xtrain , label = ytrain)
# valdata = lgb.Dataset(xtest , label = ytest)

In [None]:
# import optuna
# import optuna.integration.lightgbm as lgb

In [None]:
# model = lgb.train(params, train_set = traindata , valid_sets = [traindata , valdata] , verbose_eval=100, early_stopping_rounds=100)
# best_params = model.params
# print("Best params:", best_params)

In [None]:
# sample['target'] = model.predict_proba(df_test_ogi)

In [None]:
# print("  Params: ")
# for key, value in best_params.items():
#     print("    {}: {}".format(key, value))

In [None]:
# LightGBM hyperparameters used for every cross-validation fold.
lgbm_params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'device': 'gpu',
    # A real boolean: the string 'False' is truthy in Python and only worked
    # because LightGBM parses the text of the value.
    'feature_pre_filter': False,
    'lambda_l1': 7.630344575773596,
    'lambda_l2': 0.20806251221905683,
    'num_leaves': 10,
    'feature_fraction': 0.48000000000000004,
    'bagging_fraction': 0.8196480879728116,
    'bagging_freq': 3,
    'min_child_samples': 10,
    'num_iterations': 1000,
    'early_stopping_round': 100,
}

In [None]:
import lightgbm as lgb

# Stratified K-fold training of LightGBM: collects out-of-fold predictions,
# the fold-averaged test prediction and the fold-averaged feature importance,
# then writes the averaged test prediction to submission.csv.
splits = 5
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Early stopping is configured in exactly one place (the callback below).
# The patience comes from lgbm_params when present, otherwise it falls back
# to the value that used to be passed to fit(). It is removed from the
# estimator kwargs so it is not specified twice.
model_params = dict(lgbm_params)
stopping_rounds = model_params.pop('early_stopping_round', 300)

# Array of zeros for storing "out of fold" predictions
oof_preds = np.zeros((X.shape[0],))
model_fi = 0
total_mean_auc = 0
preds_test_array = np.zeros((df_test.shape[0],))

# Generate the folds, then train and predict once per fold
for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # skf.split yields positional indices, so rows are selected by position.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = lgb.LGBMClassifier(**model_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="auc",
        # Callbacks replace the `early_stopping_rounds` / `verbose` fit
        # keywords, which LightGBM 4 no longer accepts
        # (lgb.log_evaluation needs LightGBM >= 3.3).
        callbacks=[
            lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    # Mean test prediction: each fold contributes exactly 1 / splits.
    # (Previously a running cumulative sum was re-added every fold, which
    # weighted early folds more heavily and shrank the final values.)
    preds_test_array += model.predict_proba(df_test)[:, 1] / splits

    # Mean feature importances (i.e. divided by number of splits)
    model_fi += model.feature_importances_ / splits

    # Validation predictions: each fold model predicts rows it was not
    # trained on, so after the last fold the array is completely filled.
    # It is used to evaluate hyperparameter performance only.
    oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]
    # Score of this fold's model
    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {num} ROC AUC: {fold_auc}")

    # Mean score of all fold models (i.e. divided by number of splits)
    total_mean_auc += fold_auc / splits

# Keep the averaged test prediction available under its former name as well.
preds = preds_test_array

print(f"\nOverall ROC AUC: {total_mean_auc}")
# Replace the whole prediction column rather than writing into it with iloc,
# so the column's existing (possibly downcast) dtype cannot interfere.
sample[sample.columns[1]] = preds_test_array
sample.to_csv('submission.csv', index=False)

In [None]:
# Read the written submission back from disk and preview its first rows.
sampledf = pd.read_csv("submission.csv")
sampledf.head(n=5)