In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)


import plotly as py
from statistics import mean
import plotly.graph_objects as go
import plotly.express as px 
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from umap import UMAP

#Models
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold ,KFold
from sklearn.ensemble import VotingClassifier


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from skimage.filters import threshold_otsu
import lightgbm as lgb
import gc
from tqdm import tqdm



import optuna
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
SEED = 1

In [None]:
# Read the competition's train and test tables (train first, then test).
train, test = map(
    pd.read_csv,
    (
        "/kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/test.csv",
    ),
)

In [None]:
# Auxiliary tables from the "sep30data" dataset. The d* frames are later joined
# onto train and the dt* frames onto test (both on 'id').
# NOTE(review): the file names suggest predictions from earlier models — confirm.
d1, dt1, d2, dt2, d3, dt3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/sep30data/traincat.csv",
        "../input/sep30data/newcatboostC.csv",
        "../input/sep30data/trainSLight.csv",
        "../input/sep30data/simple LightGBM.csv",
        "../input/sep30data/tcat2.csv",
        "../input/sep30data/testcat2.csv",
    )
)

In [None]:
# Join the three auxiliary train tables onto train, one after another, on 'id'.
train = (
    train
    .merge(d1, on='id')
    .merge(d2, on='id')
    .merge(d3, on='id')
)

In [None]:
# Rename the 'catp' column to 'cat' (returns a new frame; nothing is changed in place).
train = train.rename(columns={'catp': 'cat'})

In [None]:
# Preview the first rows of train after the merges and the column rename.
train.head()

In [None]:
# Join the three auxiliary test tables onto test, one after another, on 'id'.
test = (
    test
    .merge(dt1, on='id')
    .merge(dt2, on='id')
    .merge(dt3, on='id')
)

In [None]:
# Remove the 'claim' column from the merged test frame.
test = test.drop(columns=['claim'])

In [None]:
# Preview the first rows of test after the merges and the 'claim' drop.
test.head()

In [None]:
# Move 'id' out of the columns and into the index for both frames.
train, test = (frame.set_index('id') for frame in (train, test))

In [None]:
# Feature columns are every train column whose name starts with "f".
features = [x for x in train.columns.values if x[0]=="f"]


def _add_row_stats(frame, feature_cols):
    """Add per-row summary statistics of ``feature_cols`` to ``frame`` in place.

    Adds the columns 'n_missing', 'abs_sum', 'sem', 'std', 'avg', 'max' and
    'min', each computed across ``feature_cols`` for every row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; it is modified in place.
    feature_cols : list of str
        Columns of ``frame`` the statistics are computed over.
    """
    # Take the feature slice once, so the statistics never see the columns
    # being added below.
    values = frame[feature_cols]
    frame['n_missing'] = values.isna().sum(axis=1)
    frame['abs_sum'] = values.abs().sum(axis=1)
    frame['sem'] = values.sem(axis=1)
    frame['std'] = values.std(axis=1)
    frame['avg'] = values.mean(axis=1)
    frame['max'] = values.max(axis=1)
    frame['min'] = values.min(axis=1)


# Both frames go through the same helper so their engineered columns cannot
# diverge. Previously test['max'] was filled with the row *minimum*, so the
# 'max' feature meant different things in train and test.
# This cell runs before the imputation cell, so 'n_missing' counts the
# original missing values.
_add_row_stats(train, features)
_add_row_stats(test, features)

In [None]:
# Imputation strategy per feature column: maps a column name ('f1'..'f118') to
# one of 'Mean', 'Median' or 'Mode'. The loop that follows this dict reads it to
# decide which train-set statistic replaces missing values in that column.
# How each strategy was chosen is not recorded in this notebook.
fill_value_dict = {
    'f1': 'Mean', 
    'f2': 'Median', 
    'f3': 'Median', 
    'f4': 'Median', 
    'f5': 'Mode', 
    'f6': 'Mean', 
    'f7': 'Median', 
    'f8': 'Median', 
    'f9': 'Median', 
    'f10': 'Median', 
    'f11': 'Mean', 
    'f12': 'Median', 
    'f13': 'Mean', 
    'f14': 'Median', 
    'f15': 'Mean', 
    'f16': 'Median', 
    'f17': 'Median', 
    'f18': 'Median', 
    'f19': 'Median', 
    'f20': 'Median', 
    'f21': 'Median', 
    'f22': 'Mean', 
    'f23': 'Mode', 
    'f24': 'Median', 
    'f25': 'Median', 
    'f26': 'Median', 
    'f27': 'Median', 
    'f28': 'Median', 
    'f29': 'Mode', 
    'f30': 'Median', 
    'f31': 'Median', 
    'f32': 'Median', 
    'f33': 'Median', 
    'f34': 'Mean', 
    'f35': 'Median', 
    'f36': 'Mean', 
    'f37': 'Median', 
    'f38': 'Median', 
    'f39': 'Median', 
    'f40': 'Mode', 
    'f41': 'Median', 
    'f42': 'Mode', 
    'f43': 'Mean', 
    'f44': 'Median', 
    'f45': 'Median', 
    'f46': 'Mean', 
    'f47': 'Mode', 
    'f48': 'Mean', 
    'f49': 'Mode', 
    'f50': 'Mode', 
    'f51': 'Median', 
    'f52': 'Median', 
    'f53': 'Median', 
    'f54': 'Mean', 
    'f55': 'Mean', 
    'f56': 'Mode', 
    'f57': 'Mean', 
    'f58': 'Median', 
    'f59': 'Median', 
    'f60': 'Median', 
    'f61': 'Median', 
    'f62': 'Median', 
    'f63': 'Median', 
    'f64': 'Median', 
    'f65': 'Mode', 
    'f66': 'Median', 
    'f67': 'Median', 
    'f68': 'Median', 
    'f69': 'Mean', 
    'f70': 'Mode', 
    'f71': 'Median', 
    'f72': 'Median', 
    'f73': 'Median', 
    'f74': 'Mode', 
    'f75': 'Mode', 
    'f76': 'Mean', 
    'f77': 'Mode', 
    'f78': 'Median', 
    'f79': 'Mean', 
    'f80': 'Median', 
    'f81': 'Mode', 
    'f82': 'Median', 
    'f83': 'Mode', 
    'f84': 'Median', 
    'f85': 'Median', 
    'f86': 'Median', 
    'f87': 'Median', 
    'f88': 'Median', 
    'f89': 'Median', 
    'f90': 'Mean', 
    'f91': 'Mode', 
    'f92': 'Median', 
    'f93': 'Median', 
    'f94': 'Median', 
    'f95': 'Median', 
    'f96': 'Median', 
    'f97': 'Mean', 
    'f98': 'Median', 
    'f99': 'Median', 
    'f100': 'Mode', 
    'f101': 'Median', 
    'f102': 'Median', 
    'f103': 'Median', 
    'f104': 'Median', 
    'f105': 'Median', 
    'f106': 'Median', 
    'f107': 'Median', 
    'f108': 'Median', 
    'f109': 'Mode', 
    'f110': 'Median', 
    'f111': 'Median', 
    'f112': 'Median', 
    'f113': 'Mean', 
    'f114': 'Median', 
    'f115': 'Median', 
    'f116': 'Mode', 
    'f117': 'Median', 
    'f118': 'Mean'
}

# Fill missing values in every feature column. The fill value is always
# computed on train and then applied to both train and test, so no test-set
# statistic is used.
for col in tqdm(features):
    strategy = fill_value_dict.get(col)
    if strategy == 'Mean':
        fill_value = train[col].mean()
    elif strategy == 'Median':
        fill_value = train[col].median()
    elif strategy == 'Mode':
        # mode() can return several values; take the first one.
        fill_value = train[col].mode().iloc[0]
    else:
        # Without this branch an unmapped column silently reused the previous
        # column's fill value (or hit a NameError on the first iteration).
        raise KeyError(f"No imputation strategy configured for column {col!r}")

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form operates on a temporary under
    # pandas Copy-on-Write and leaves the frame unchanged.
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

In [None]:
# Target is the 'claim' column; predictors are every other train column.
# X_test is the test frame itself (same object, not a copy).
y = train["claim"]
X = train.drop(columns=["claim"])
X_test = test

In [None]:
# Fit the scaler on the training predictors only, then apply the fitted
# transformation to both sets. X and X_test are rebound to the transformed
# output; the cells below index X with integer position arrays.
scaler = RobustScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [None]:
def objective(trial , data=X , target= y):
    """Optuna objective: mean validation AUC of an XGBoost classifier over 3 folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : array-like of shape (n_samples, n_features)
        Predictors; defaults to the notebook-level ``X``.
    target : array-like of shape (n_samples,)
        Binary labels aligned row-by-row with ``data``; defaults to the
        notebook-level ``y``.

    Returns
    -------
    float
        Mean ROC AUC over the validation folds of a shuffled 3-fold
        StratifiedKFold. The validation fold is also the early-stopping
        eval set, so it both stops training and produces the score.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 500),
        "gamma": trial.suggest_float("gamma", 0.0001, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 0.0001, 10, log=True),
        "lambda": trial.suggest_float("lambda", 0.0001, 10.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.8),
        "subsample": trial.suggest_float("subsample", 0.1, 0.9),
        "tree_method": "gpu_hist",
        "booster": "gbtree",
        "random_state": 228,
        "use_label_encoder": False,
        "eval_metric": "auc",
    }

    # Use the arguments rather than the globals X / y, and convert to plain
    # arrays so the fold indices are applied by *position*. Indexing an
    # id-indexed Series with `y[idx]` is a label lookup and only matches the
    # positional rows when the ids happen to be 0..n-1 in row order.
    data_arr = np.asarray(data)
    target_arr = np.asarray(target)

    model = XGBClassifier(**params)
    validation_scores = []
    K = StratifiedKFold(n_splits=3, random_state=228, shuffle=True)
    for i, (train_idx, val_idx) in enumerate(K.split(data_arr, target_arr)):
        X_train, X_val = data_arr[train_idx], data_arr[val_idx]
        Y_train, Y_val = target_arr[train_idx], target_arr[val_idx]
        # NOTE(review): passing early_stopping_rounds to fit() is deprecated in
        # newer XGBoost releases — confirm against the installed version.
        model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)],
                  early_stopping_rounds=300, verbose=False)

        # Only the validation AUC feeds the objective; the train-fold
        # prediction that used to be computed here was never used.
        validate_prediction = model.predict_proba(X_val)[:, 1]
        validate_score = roc_auc_score(Y_val, validate_prediction)
        validation_scores.append(validate_score)

        print(f"Fold {i+1} | AUC : {validate_score} ")

    return float(np.mean(validation_scores))

In [None]:
%%time

# Run a 10-trial search that maximises the objective's mean validation AUC.
# The sampler is seeded with the notebook-level SEED so the sequence of proposed
# hyper-parameters is repeatable; TPESampler is what Optuna uses by default
# for a single-objective study, so only the seed changes.
# NOTE(review): the GPU training inside the objective may still introduce
# run-to-run variation in the scores — confirm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective ,n_trials= 10)
print("Numbers of finished trials : " , len(study.trials))
print("Best Trials : ", study.best_trial.params)
print("Best Values : " , study.best_value)

In [None]:
# Keep the best trial's sampled hyper-parameters for the final model; the bare
# name on the last line displays them as the cell output.
xgb_params = study.best_trial.params
xgb_params

In [None]:
# Final model: refit XGBoost with the tuned hyper-parameters on each of 5
# stratified folds and average the five test-set probability vectors.
folds = StratifiedKFold(n_splits=5,random_state=228,shuffle=True)
predictions = np.zeros(len(X_test))

# Plain array view of the labels so fold indices select rows by *position*.
# `y[idx]` on an id-indexed Series is a label lookup and is only correct when
# the ids happen to be 0..n-1 in row order.
y_values = np.asarray(y)

for train_idx, validate_idx in folds.split(X, y_values):
    X_train, X_validate = X[train_idx], X[validate_idx]
    Y_train, Y_validate = y_values[train_idx], y_values[validate_idx]
    xgb_model = XGBClassifier(**xgb_params ,tree_method= "gpu_hist",booster = "gbtree" ,random_state = 228,use_label_encoder = False ,eval_metric = "auc")
    # The held-out fold is used only as the early-stopping eval set.
    # NOTE(review): passing early_stopping_rounds to fit() is deprecated in
    # newer XGBoost releases — confirm against the installed version.
    xgb_model.fit(X_train,Y_train,eval_set = [(X_validate,Y_validate)],verbose =False,early_stopping_rounds =300)
    # Each fold contributes 1/n_splits of the final averaged probability.
    predictions += xgb_model.predict_proba(X_test)[:,1] /folds.n_splits

In [None]:
# Load the sample submission and overwrite its 'claim' column with the
# fold-averaged test probabilities.
# NOTE(review): predictions are assigned by row position, so this assumes the
# test rows (after the merges above) are in the same order as the sample file
# — TODO confirm.
sample  = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
sample['claim']= predictions
sample.head()

In [None]:
# Write the submission file to the working directory, without the row index.
sample.to_csv("Finalsubm1.csv",index=False)