In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, so the available datasets are visible in the output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
import random
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score ,mean_absolute_error ,mean_squared_error ,auc
from sklearn.model_selection import train_test_split ,cross_val_score,cross_validate,KFold,StratifiedKFold
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder ,StandardScaler
from pandas_profiling import ProfileReport as pp
from warnings import filterwarnings
%matplotlib inline

In [None]:
# Suppress all warnings for the rest of the session. Note that this also hides
# library warnings about unknown or ignored parameters raised further down.
filterwarnings("ignore")

In [None]:
# Load the training CSV and preview its first rows.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/train.csv')
train_df.head()

In [None]:
# Load the test CSV and preview its first rows.
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-sep-2021/test.csv")
test_df.head()

In [None]:
# Target vector: the `claim` column of the training frame.
Y = train_df["claim"]
Y.head()

In [None]:
# Training features: everything except the row identifier and the target.
X_train = train_df.drop(columns=["id", "claim"])
X_train.head()

In [None]:
# Test features: everything except the row identifier.
X_test = test_df.drop(columns="id")
X_test.head()

In [None]:
# Display the (rows, columns) shape of the train and test feature frames side by side.
X_train.shape , X_test.shape

In [None]:
# Summary statistics of the training features, shown with one row per feature.
train_description = X_train.describe()
train_description.T

In [None]:
# Number of missing values in each training feature column.
X_train.isnull().sum()

In [None]:
# Number of missing values in each test feature column.
X_test.isnull().sum()

In [None]:
# Build one record per training feature holding its missing-value count and
# the matching percentage (rounded to two decimals, rendered as text with a
# trailing "%"), then order the table from most to least missing.
train_null_values = [
    {
        "feature": name,
        "feature_null_count(train)": n_missing,
        "null percentage(train)": "{}%".format(
            np.round(n_missing / len(X_train[name]) * 100, 2)
        ),
    }
    for name, n_missing in zip(X_train.columns, X_train.isna().sum().to_numpy())
]
train_null_df = pd.DataFrame(train_null_values, index=None).sort_values(
    by="feature_null_count(train)", ascending=False
)
train_null_df.head()

In [None]:
# Per-feature missing-value summary for the test features: absolute count and
# percentage (rounded to two decimals, rendered as text with a trailing "%"),
# ordered from most to least missing.
#
# The percentage column is named "null percentage(test)" to mirror the
# "null percentage(train)" column of the training summary; the previous header
# was misspelled ("null_percernt(test)").
test_null_counts = X_test.isna().sum()
# The row count is identical for every column, so compute the ratio once for
# all columns instead of once per loop iteration.
test_null_percents = (test_null_counts / len(X_test) * 100).round(2)
test_null_df = pd.DataFrame(
    {
        "feature": test_null_counts.index,
        "feature_null_count(test)": test_null_counts.to_numpy(),
        "null percentage(test)": ["{}%".format(pct) for pct in test_null_percents],
    }
).sort_values(by="feature_null_count(test)", ascending=False)
test_null_df.head()

In [None]:
# Place the train and test missing-value tables side by side; the column-wise
# concat aligns rows on the index the two tables share.
total_null = pd.concat([train_null_df, test_null_df], axis="columns")
total_null.head()

In [None]:
# Histogram of the target values. `palette` only applies when a `hue`
# variable is mapped, so it was a no-op here (recent seaborn warns about it);
# the bars are coloured through `color` alone.
sns.histplot(data=Y, bins=10, color="red")

In [None]:
# Share of each target class: class counts divided by the total number of rows,
# displayed as a single-row table.
claim_percentage = Y.value_counts().div(len(Y)).to_frame()
claim_percentage.transpose()

In [None]:
# Bar chart of the number of rows per target class. The series must be passed
# as `x=`: in current seaborn the only positional argument of countplot is
# `data`, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=train_df.claim, palette="Set3")

In [None]:
# Overlay the train (red) and test (blue) distribution of each feature, one
# feature per panel, for the first block of features that fits the 10 x 6 grid.
#
# Changes from the previous version of this cell:
#   * the walk starts at positional column 0 (it used to start at 1, which
#     skipped the first feature);
#   * the test histogram is labelled "Test" (it was labelled "Train");
#   * a legend is drawn so the labels are actually visible;
#   * panels without a feature are hidden instead of raising IndexError when
#     the frame has fewer columns than the grid has panels.
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(30, 36))
first_grid_features = X_train.columns[:axes.size]
for ax, feature in zip(axes.flat, first_grid_features):
    sns.histplot(data=X_train, x=feature, ax=ax, color="red", label="Train")
    sns.histplot(data=X_test, x=feature, ax=ax, color="blue", label="Test")
    ax.legend()
for unused_ax in axes.flat[len(first_grid_features):]:
    unused_ax.set_visible(False)
        


In [None]:
# Overlay the train (red) and test (blue) distribution of each feature, one
# feature per panel, for the features from positional column 60 onward
# (the block that follows a full 10 x 6 grid of 60 panels).
#
# Changes from the previous version of this cell:
#   * the walk starts at column 60 instead of a hard-coded 58, which re-plotted
#     columns and raised IndexError whenever fewer than 60 columns remained;
#   * the test histogram is labelled "Test" (it was labelled "Train");
#   * a legend is drawn so the labels are actually visible;
#   * panels without a feature are hidden.
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(30, 36))
second_grid_start = 60
second_grid_features = X_train.columns[second_grid_start:second_grid_start + axes.size]
for ax, feature in zip(axes.flat, second_grid_features):
    sns.histplot(data=X_train, x=feature, ax=ax, color="red", label="Train")
    sns.histplot(data=X_test, x=feature, ax=ax, color="blue", label="Test")
    ax.legend()
for unused_ax in axes.flat[len(second_grid_features):]:
    unused_ax.set_visible(False)
  

In [None]:
# Column labels of the training features, reused by the preprocessing cells.
col_list = X_train.columns
col_list

In [None]:
# Replace missing values with a constant. No `fill_value` is given, so the
# imputer's default constant is used (presumably 0 for numeric data — confirm
# against the scikit-learn docs).
# The imputer is fitted on the training features only and then applied to both
# frames. Both frames are overwritten in place, so the missing-value checks in
# earlier cells no longer describe X_train / X_test after this cell has run.
simple_imputer = SimpleImputer(strategy="constant")
X_train[col_list] = simple_imputer.fit_transform(X_train[col_list])
X_test[col_list] = simple_imputer.transform(X_test[col_list])


In [None]:
# NOTE(review): MinMaxScaler is imported but not used in this cell; the scaler
# actually applied is StandardScaler.
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then apply the same
# transformation to the test features. Both frames are overwritten in place.
standard_scaler = StandardScaler()
X_train[col_list] = standard_scaler.fit_transform(X_train[col_list])
X_test[col_list] = standard_scaler.transform(X_test[col_list])

In [None]:
# Preview the training features after imputation and scaling.
X_train.head()

In [None]:
# Preview the test features after imputation and scaling.
X_test.head()

In [None]:
# Preview the target vector (it is not modified by the preprocessing cells).
Y.head()

In [None]:
def score_check (X,Y,model,cv):
    """Cross-validate `model` on accuracy and summarise the per-fold results.

    Parameters
    ----------
    X : feature matrix passed straight to `cross_validate`.
    Y : target vector passed straight to `cross_validate`.
    model : estimator to evaluate.
    cv : cross-validation specification forwarded as `cv=`.

    Returns
    -------
    pandas.DataFrame
        One row per entry returned by `cross_validate` (train scores are
        requested as well), one column per fold, plus a `mean` and a `std`
        column computed across the folds.
    """
    scoring_method = ["accuracy"]
    cv_results = cross_validate(
        model, X, Y, scoring=scoring_method, cv=cv, return_train_score=True
    )
    per_fold = pd.DataFrame(cv_results).T
    # Compute both statistics from the fold columns BEFORE attaching them.
    # DataFrame.assign evaluates its keyword arguments in order, so a callable
    # for `std` would see the freshly added `mean` column and treat it as one
    # more fold, understating the spread.
    fold_mean = per_fold.mean(axis=1)
    fold_std = per_fold.std(axis=1)
    return per_fold.assign(mean=fold_mean, std=fold_std)


In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: validation accuracy of an XGBoost booster.

    Holds out 25% of the module-level `X_train` / `Y` for validation, samples
    a set of XGBoost hyper-parameters from `trial`, trains a booster with
    `xgb.train` (default number of boosting rounds) and returns the accuracy
    of the rounded predictions on the held-out part.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Accuracy on the validation split (to be maximised).
    """
    # A fixed seed gives every trial the same validation split, so differences
    # between trial scores come from the hyper-parameters, not from the split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X_train, Y, test_size=0.25, random_state=42
    )
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    params = {
        "objective": "binary:logistic",
        "verbosity": 0,
        "tree_method": "gpu_hist",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # Registered under XGBoost's real parameter name ("lambda", previously
        # misspelled "lamda") so that `trial.params` can be forwarded to an
        # XGBoost estimator without silently losing the L2 setting.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Parameters shared by both tree-based boosters.
    if params["booster"] in ("gbtree", "dart"):
        params["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        params["learning_rate"] = trial.suggest_float("learning_rate", 0.05, 0.2)

    # Dropout-related parameters only exist for the DART booster. They used to
    # be suggested for gbtree as well and then re-suggested here; the repeated
    # `learning_rate` suggestion also used a conflicting range (0.05-2) that
    # Optuna ignores in favour of the first call.
    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    booster = xgb.train(params, dtrain)
    predicted_proba = booster.predict(dvalid)
    # Round the predicted probabilities to hard 0/1 labels for accuracy.
    predicted_labels = np.rint(predicted_proba)
    return accuracy_score(valid_y, predicted_labels)

In [None]:
# Create a study that maximises the value returned by `objective` and run the
# search until either 500 trials have finished or the 2400 timeout (seconds,
# per Optuna's API) is reached.
# NOTE(review): no sampler seed is set, so the sampled trials presumably differ
# between runs — confirm whether reproducibility is required.
study = optuna.create_study(direction = "maximize" ,study_name="Hamid")
study.optimize(objective,n_trials = 500 , timeout= 2400)


In [None]:
# Report how many trials were executed, then fetch and display the best one.
print("Number of trials : ", len(study.trials))
print("Best trial is : ")
trial = study.best_trial
trial

In [None]:
# Print the best trial's objective value followed by each sampled parameter.
print(f"value : {trial.value}")
print("params : ")
for name, setting in trial.params.items():
    print(f"  {name} : {setting}")


In [None]:
tuned_params = trial.params
tuned_params

In [None]:
 xgb_model= XGBClassifier(**tuned_params , tree_method = "gpu_hist")

In [None]:
# 5-fold cross-validated accuracy summary for the tuned XGBoost classifier.
score = score_check(X_train,Y,xgb_model,5)
score

In [None]:
# Refit the XGBoost classifier on the full training set, predict the test set
# and write the submission file.
# NOTE(review): `predict` returns hard class labels; if the competition is
# scored on probabilities (e.g. ROC AUC), `predict_proba(X_test)[:, 1]` would
# be needed instead — confirm against the competition rules.
xgb_model.fit(X_train, Y)
xgb_predict = xgb_model.predict(X_test)
xgb_submission = pd.DataFrame({"id": test_df["id"], "claim": xgb_predict})
xgb_submission.to_csv("/kaggle/working/xgb_submit.csv", index=False)
# The preview must be the last expression of the cell to be rendered; placed
# before `to_csv` (as it was) its result was silently discarded.
xgb_submission.head()

In [None]:
# CatBoostClassifier is already imported in the notebook's import cell; this
# repeated import is redundant but harmless.
from catboost import CatBoostClassifier
# GPU-trained CatBoost classifier: 2000 iterations, learning rate 0.1, depth 4,
# training log silenced.
# NOTE(review): devices='0:1' presumably selects GPU devices 0 and 1 — confirm
# this matches the hardware actually available.
catboost_model = CatBoostClassifier(iterations=2000,task_type="GPU",devices='0:1',learning_rate=0.1,depth=4 ,verbose=0)

In [None]:
# 5-fold cross-validated accuracy summary for the CatBoost classifier.
# (Reuses the name `score`, overwriting the result of the previous model.)
score =score_check(X_train,Y,catboost_model,5)
score

In [None]:
# Refit CatBoost on the full training set, predict the test set and write the
# predictions next to their ids as a submission CSV.
catboost_model.fit(X_train, Y)
catboost_model_predict = catboost_model.predict(X_test)
catboost_model_submission = pd.DataFrame(
    {
        "id": test_df["id"],
        "claim": catboost_model_predict,
    }
)
catboost_model_submission.to_csv('/kaggle/working/catboost_model.csv', index=False)

In [None]:
# LGBMClassifier is already imported in the notebook's import cell; this
# repeated import is redundant but harmless.
from lightgbm import LGBMClassifier
# GPU-trained LightGBM binary classifier: 1000 boosting iterations at a
# learning rate of 0.05. The keyword is `feature_pre_filter`; it was previously
# misspelled `feature_per_filter`, which LightGBM does not recognise, so the
# setting never took effect.
lgbm_model = LGBMClassifier(
    num_iterations=1000,
    objective="binary",
    feature_pre_filter=False,
    learning_rate=0.05,
    device_type="gpu",
)

In [None]:
# 5-fold cross-validated accuracy summary for the LightGBM classifier.
# (Reuses the name `score`, overwriting the result of the previous model.)
score =score_check(X_train,Y,lgbm_model,5)
score

In [None]:
# Refit LightGBM on the full training set, predict the test set and write the
# predictions next to their ids as a submission CSV.
lgbm_model.fit(X_train, Y)
lgbm_model_predict = lgbm_model.predict(X_test)
lgb_submission = pd.DataFrame(
    {
        "id": test_df["id"],
        "claim": lgbm_model_predict,
    }
)
lgb_submission.to_csv('/kaggle/working/lgb.csv', index=False)