In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold, StratifiedKFold

# Importing modelling packages
from lightgbm import LGBMClassifier
import optuna
import gc

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition CSVs in a fixed order: train, test, sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
        "../input/tabular-playground-series-oct-2021/sample_submission.csv",
    )
)

In [None]:
# (rows, columns) of the train frame, then of the test frame, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    limits contain the column's observed min and max (limits inclusive).
    Float columns are cast to float16 or float32 on the same criterion and
    fall back to float64 otherwise. Columns of any other dtype are untouched.

    Note that the float16/float32 choice is made on value *range* only; the
    cast can round values that need more precision than the target dtype has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing matched (e.g. min/max is NaN for an empty column),
            # the column is left as it was.
        else:
            target = np.float64  # fallback when no narrower float fits (or min/max is NaN)
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# Downcast the numeric columns of both frames (train first, then test).
train, test = (
    reduce_memory_usage(frame, verbose=True) for frame in (train, test)
)
#print(cpu_stats())
print('Memory reduced')

In [None]:
# Every column except the row id and the label is a model feature.
features = [col for col in train.columns if col not in ['id', 'target']]

# Split features by their exact dtype: int8 columns are treated as
# categorical, float16 columns as numerical. A feature with any other dtype
# ends up in neither list.
categorical = [col for col in features if train[col].dtypes == 'int8']
numerical = [col for col in features if train[col].dtypes == 'float16']

print("Size of train dataframe",train.shape)
print("Total number of categorical features is ", len(categorical))
print("Total number of numerical features is", len(numerical))

In [None]:
# Show the first five rows of the training frame as the cell output.
train.head(n=5)

In [None]:
# Keep the test ids (needed for the submission file) and the training label
# before removing those columns from the feature frames.
test_id = test['id']
y = train['target']

# From here on, train and test hold feature columns only.
train = train.drop(['target', 'id'], axis=1)
test = test.drop('id', axis=1)

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training numerical columns only, then apply the
# same fitted transform to both train and test.
scaler = RobustScaler().fit(train[numerical])
train[numerical] = scaler.transform(train[numerical])
test[numerical] = scaler.transform(test[numerical])

In [None]:
# Total number of null cells in each frame (summed over columns, then overall).
missing_train, missing_test = (
    frame.isnull().sum().sum() for frame in (train, test)
)
print('Total missing value in train dataset is:', missing_train)
print('Total missing value in test dataset is:', missing_test)

In [None]:
# How many feature columns there are of each dtype.
dtype_counts = train.dtypes.value_counts()
print(dtype_counts)

In [None]:
# Shapes of the train and test frames at this point, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Fixed-seed 70/30 hold-out split of the training data.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.3,
    random_state=50,
)

In [None]:
# Keyword arguments for a single hold-out fit, evaluated on both splits.
# Not referenced by any active (uncommented) code in the visible notebook.
fit_params = {
    "early_stopping_rounds": 200,
    "eval_set": [(X_train, y_train), (X_test, y_test)],
    "eval_metric": "auc",
    "verbose": 200,
}

# Candidate value lists for a hyper-parameter search.
# Not referenced by any active code in the visible notebook.
rs_params = {
    "learning_rate": [0.05],
    "reg_lambda": [0, 20],
    "n_estimators": [5000],
    "max_depth": [7, 10],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.8, 0.9],
    "reg_alpha": [20, 40],
}

# An alternative LightGBM configuration; not referenced by any active code in
# the visible notebook. Previously tried and disabled entries:
#   "reg_lambda": 20, "colsample_bytree": 0.9, "reg_alpha": 40
param = dict(
    objective="binary",
    metric="auc",
    verbosity=-1,
    boosting_type="gbdt",
    learning_rate=0.05,
    n_estimators=1500,
    max_depth=10,
    lambda_l1=27.02439337450957,
    lambda_l2=0.5371336378863284,
    num_leaves=53,
    feature_fraction=0.8661114747237886,
    bagging_fraction=0.6010795190699094,
    bagging_freq=7,
    min_child_samples=54,
)

# The configuration passed to LGBMClassifier in the cross-validation loop.
params = dict(
    reg_alpha=8.158768860412389,
    reg_lambda=8.793022151019823,
    colsample_bytree=0.2,
    subsample=0.4,
    learning_rate=0.02,
    max_depth=100,
    num_leaves=12,
    min_child_samples=68,
    cat_smooth=91,
    objective="binary",
    random_state=48,
    n_estimators=20000,
    n_jobs=-1,
)

In [None]:
#lgb = LGBMClassifier('reg_alpha': 8.158768860412389, 'reg_lambda': 8.793022151019823, 'colsample_bytree': 0.2, 'subsample': 0.4, 'learning_rate': 0.02,
 #      'max_depth': 100, 'num_leaves': 12, 'min_child_samples': 68, 'cat_smooth': 91,'objective': 'binary',  
  #          'random_state': 48,'n_estimators': 20000,'n_jobs': -1)

In [None]:
#lgb.fit(X_train, y_train, early_stopping_rounds = 200,
 #              eval_set = [(X_test, y_test)], 
  #             eval_metric = 'auc', 
   #            verbose = 200)

In [None]:
# Stratified 5-fold cross-validation.
# For each fold: fit a fresh LGBMClassifier with early stopping on the held-out
# fold, add that model's test-set probabilities (divided by the number of
# folds) to `preds`, and record the held-out AUC.
preds = np.zeros(test.shape[0])

kf = StratifiedKFold(n_splits = 5, random_state=20210, shuffle=True)

auc = []

# enumerate(start=1) replaces the hand-maintained fold counter; `n` is the
# 1-based fold number.
for n, (train_idx, test_idx) in enumerate(kf.split(train, y), start=1):
    X_train, X_val = train.iloc[train_idx], train.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

    model = LGBMClassifier(**params)
    # verbose must be the boolean False: the string "False" is a non-empty
    # (truthy) value, so any boolean check on it would enable logging.
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments depend
    # on the installed LightGBM version — confirm against the pinned version.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric="auc",
        verbose=False,
    )

    # Each fold contributes an equal share to the averaged test prediction.
    preds += model.predict_proba(test)[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    gc.collect()
    print(f"fold: {n}, auc: {auc[-1]}")

In [None]:
#pred_lgbm = lgb.predict_proba(X_test)[:, -1]

In [None]:
# Generate ROC curve values: fpr, tpr, thresholds
#fpr, tpr, thresholds = roc_curve(y_test, pred_lgbm)
# Plot ROC curve
#plt.plot([0, 1], [0, 1], 'k--')
#plt.plot(fpr, tpr)
#plt.xlabel('False Positive Rate')
#plt.ylabel('True Positive Rate')
#plt.title('ROC Curve')
#plt.show()

In [None]:
#test_score = roc_auc_score(y_test,pred_lgbm)
#print('AUC score for test data: {:.2f} %'.format(test_score*100))

In [None]:
#y_pred = lgb.predict_proba(test)[:, -1]
#y_pred

In [None]:
# Build the submission frame: the test ids alongside the fold-averaged
# predictions, then write it to CSV without the index column.
submission = pd.DataFrame({'id': test_id})
submission['target'] = preds
submission.to_csv('submission.csv', index=False)
print("predictions successfully submitted")

In [None]:
#import lightgbm as lgb
#import sklearn.metrics

In [None]:
#def objective(trial):
   # train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    #train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size = 0.3, random_state=50)
    #dtrain = lgb.Dataset(train_x, label=train_y)

   # param = {
    #    "objective": "binary",
     #   "metric": "auc",
      #  "verbosity": -1,
       # "boosting_type": "gbdt",
       # "learning_rate": 0.05,
        #"reg_lambda": 20,
       # "n_estimators": 1500,
        #"max_depth": 10,
        #"colsample_bytree": 0.9,
        #"reg_alpha": 40,
        #"lambda_l1": trial.suggest_float("lambda_l1", 0.5, 50.0, log=True),
        #"lambda_l2": trial.suggest_float("lambda_l2", 0.005, 50.0, log=True),
        #"num_leaves": trial.suggest_int("num_leaves", 2, 256),
        #"feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        #"bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        #"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        #"min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
   # }

    #gbm = lgb.train(param, dtrain)
    #preds = gbm.predict_proba(valid_x)[:, -1]  #gbm.predict(valid_x)
    #pred_labels = np.rint(preds)
    #lgb = LGBMClassifier(**param)
   # lgb.fit(train_x, train_y, early_stopping_rounds = 200,
    #            eval_set = [(valid_x, valid_y)], 
     #           eval_metric = 'auc', 
      #          verbose = 200)
    #preds = lgb.predict_proba(valid_x)[:, -1]
    #accuracy = roc_auc_score(valid_y, preds)# sklearn.metrics.accuracy_score(valid_y, pred_labels) 
    #return accuracy

In [None]:
#study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=10)

#print("Number of finished trials: {}".format(len(study.trials)))

#print("Best trial:")
#trial = study.best_trial
#print("  Value: {}".format(trial.value))

#print("  Params: ")
#for key, value in trial.params.items():
 #   print("    {}: {}".format(key, value))