In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Utils

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import StratifiedKFold
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_uniform
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise  import cosine_similarity
from sklearn.metrics.pairwise import chi2_kernel

## Load Data

In [None]:
# Read the three competition CSVs in one pass: training data, the
# sample-submission template, and the test data (in that order).
train, sub_sample, test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
    )
)

In [None]:
# Print the column / dtype / non-null summary of the test frame.
test.info()

In [None]:
# Show (rows, columns) for the train, test and sample-submission frames.
print(train.shape, test.shape, sub_sample.shape)

In [None]:
# Preview the first rows of the sample submission to see the expected output format.
sub_sample.head()

In [None]:
# Print the column / dtype / non-null summary of the training frame.
train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Count the missing values in each training column.
train.isna().sum()

In [None]:
# Summary statistics of the training frame as reported by DataFrame.describe().
train.describe()

### Data Preprocessing

In [None]:
# Use PassengerId as the row index of the training frame.
# NOTE: this rebinds `train`, so the cell is meant to be run once per kernel session.
train = train.set_index('PassengerId')

Fill the missing values in Age and Fare with each column's median.

In [None]:
# Median-impute the two numeric columns that contain missing values.
for numeric_col in ('Age', 'Fare'):
    col_median = train[numeric_col].median()
    train[numeric_col] = train[numeric_col].replace(np.nan, col_median)

Fill the missing Embarked values with the mode, 'S'.

In [None]:
# Replace missing Embarked values with the constant 'S'.
# TODO: a more accurate imputation strategy is needed here.
train['Embarked']= train['Embarked'].replace(np.nan, 'S')

Discard Ticket and Cabin for the moment.

In [None]:
# Remove the Ticket and Cabin columns from the training frame.
train = train.drop(columns=['Ticket', 'Cabin'])

Keep only the surname from the "Name" field.

In [None]:
# Keep only the surname, i.e. the text before the first comma in Name.
# `n` is passed by keyword: pandas 2.0 made every `str.split` argument after
# `pat` keyword-only, so the positional form `split(',', 1)` raises TypeError there.
train['Name'] = train['Name'].str.split(',', n=1).str[0]

In [None]:
# Inspect the dtype of each remaining training column.
train.dtypes

In [None]:
# Define the features to be encoded

In [None]:
# Names of the object-dtype columns of train (the ones to be encoded).
l = [col for col in train.columns if train[col].dtype == 'O']
print(l)

In [None]:
# Names of every train column whose dtype is not object.
l2 = [col for col in train.columns if train[col].dtype != 'O']
print(l2)

In [None]:
# The first non-object column is treated as the target (presumably 'Survived' —
# this depends on the column order of train.csv); the rest are numeric features.
target_col, *numeric_cols = l2
df_num = train[numeric_cols]
df_cat = train[l]
df_target = train[target_col]

In [None]:
# Weight-of-Evidence encode the object columns against the target.
import category_encoders as ce
# return_df / drop_invariant / handle_missing are passed straight to WOEEncoder.
woe=ce.woe.WOEEncoder(return_df=True, drop_invariant=True, handle_missing='value')
# NOTE(review): the encoder is fit on every training row together with its label,
# and the encoded frame is later split for tuning and cross-validation, so the
# validation rows' labels have already influenced their own features. Consider
# fitting the encoder inside each fold.
woe_enc=woe.fit_transform(df_cat, df_target)
# Re-attach the index of df_cat so the later column-wise concat aligns on it.
woe_enc= woe_enc.set_index(df_cat.index)

In [None]:
# Place numeric features, WOE-encoded categoricals and the target side by side
# (axis=1) in one frame.
df = pd.concat([df_num, woe_enc, df_target], axis=1)

In [None]:
# Heatmap of the pairwise correlations between the columns of df.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(data=df.corr(), ax=ax)

In [None]:
# Drop 'Survived' from df so it is not used as a model feature; downstream cells
# take the labels from df_target instead.
df = df.drop(columns='Survived')

In [None]:
# Preview the feature frame that will be fed to the model.
df.head()

In [None]:
# Preview the target values.
df_target.head()

# Using Optuna with Lgbm

In [None]:
import optuna

In [None]:
def objective(trial , data = df , target = df_target):
    """Optuna objective: fit one LightGBM classifier and return its hold-out ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.
    data : pandas.DataFrame
        Feature frame; defaults to the module-level ``df``.
    target : pandas.Series
        Labels aligned with ``data``; defaults to the module-level ``df_target``.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the hold-out split.

    Notes
    -----
    The hold-out split is used both for early stopping and for the returned
    score, so the score is optimistic.
    """
    train_x , test_x , train_y , test_y = train_test_split(
        data , target , test_size = 0.028059109276941666 , random_state = 22)

    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # names and ranges are unchanged, so study parameters stay comparable.
    params = {
        'reg_alpha' : trial.suggest_float('reg_alpha' , 1e-5 , 10 , log = True),
        'reg_lambda' : trial.suggest_float('reg_lambda' , 1e-5 , 10 , log = True),
        'num_leaves' : trial.suggest_int('num_leaves' , 11 , 800),
        'learning_rate' : trial.suggest_float('learning_rate' , 0.0000001 , 0.1),
        'max_depth' : trial.suggest_int('max_depth' , 5 , 400),
        'n_estimators' : trial.suggest_int('n_estimators' , 1 , 9999),
        'min_child_samples' : trial.suggest_int('min_child_samples' , 1 , 110),
        'min_child_weight' : trial.suggest_float('min_child_weight' , 1e-5 , 1 , log = True),
        # NOTE(review): per the LightGBM docs bagging is only active when
        # subsample_freq > 0 — confirm whether `subsample` has any effect here.
        'subsample' : trial.suggest_float('subsample' , 1e-5 , 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , 1e-5 , 1 , log = True),
        # NOTE(review): the seed is sampled as if it were a hyperparameter; kept
        # as-is so the study's parameter set does not change.
        'random_state' : trial.suggest_categorical('random_state' , [1,22,2022,1509]),
        'metric' : 'auc',
        'device_type' : 'cpu',
    }
    model = lightgbm.LGBMClassifier(**params)
    # LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` arguments of
    # fit(); the equivalent behaviour is requested through callbacks.
    model.fit(
        train_x , train_y ,
        eval_set = [(test_x , test_y)] ,
        callbacks = [
            lightgbm.early_stopping(stopping_rounds = 1000 , verbose = False),
            lightgbm.log_evaluation(period = 0),
        ],
    )
    preds = model.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y , preds)
    return auc

In [None]:
# Run 100 Optuna trials maximising the objective, then report the outcome.
study = optuna.create_study(study_name = 'lgbm' , direction = 'maximize')
study.optimize(objective , n_trials = 100)
for label , value in (
    ('numbers of the finished trials:' , len(study.trials)),
    ('the best params:' , study.best_trial.params),
    ('the best value:' , study.best_value),
):
    print(label , value)

In [None]:
# Hyperparameters hard-coded from a previous tuning run
# (recorded best value: 0.8600080851524926).
params = {
    'reg_alpha': 3.5207906746166246e-05,
    'reg_lambda': 9.571210867333512,
    'num_leaves': 23,
    'learning_rate': 0.042076158882403084,
    'max_depth': 367,
    'n_estimators': 3606,
    'min_child_samples': 102,
    'min_child_weight': 0.01526093257122045,
    'subsample': 0.9975048106315406,
    'colsample_bytree': 0.19751970469246108,
    'random_state': 2022,
}

## Preprocessing the Test_set

In [None]:
# Apply to the test frame the same cleaning steps that were applied to train.
test = test.set_index('PassengerId')
# NOTE(review): the medians are computed from the test frame itself, not from train.
test['Age'] = test['Age'].replace(np.nan, test['Age'].median())
test['Fare'] = test['Fare'].replace(np.nan, test['Fare'].median())
test['Embarked']= test['Embarked'].replace(np.nan, 'S')
test = test.drop(columns=['Ticket', 'Cabin'])
# Keep only the surname (text before the first comma). `n` is passed by keyword:
# pandas 2.0 made every `str.split` argument after `pat` keyword-only, so the
# positional form `split(',', 1)` raises TypeError there.
test['Name'] = test['Name'].str.split(',', n=1).str[0]

In [None]:
# Names of the object-dtype columns of test (the ones to be encoded).
lt = [col for col in test.columns if test[col].dtype == 'O']
print(lt)

In [None]:
# Names of every test column whose dtype is not object.
lt2 = [col for col in test.columns if test[col].dtype != 'O']
print(lt2)

In [None]:
# Split the test frame into its non-object and object column groups.
dft_num , dft_cat = test[lt2] , test[lt]
# NOTE(review): despite its name this is not a target — lt2[0] is also part of
# dft_num, so it is simply the first non-object feature column of test.
first_non_object_col = lt2[0]
dft_target = test[first_non_object_col]

In [None]:
# Encode the test categoricals with the already-fitted WOE encoder, then
# re-attach the index of dft_cat so the later column-wise concat aligns on it.
woet_enc = woe.transform(dft_cat).set_index(dft_cat.index)

In [None]:
# Join the test set's non-object columns and its WOE-encoded categoricals column-wise.
dft= pd.concat([dft_num,woet_enc], axis=1)

In [None]:
# Preview the training features so their columns can be compared with the test features.
df.head()

In [None]:
# Preview the test features.
dft.head()

In [None]:
# 10-fold stratified cross-validation. For every fold: fit LightGBM with early
# stopping on the validation fold, wrap it in a sigmoid calibrator, add the fold's
# share of the test-set probabilities to `preds`, store the validation-fold
# probabilities in `oof_preds`, and record the fold's ROC AUC in `roc`.
params['metric'] = 'auc'
params['device'] = 'cpu'
preds = np.zeros(dft.shape[0])
oof_preds = np.zeros(df.shape[0])
kf = StratifiedKFold(n_splits = 10 , random_state = 22 , shuffle = True)
roc = []
for n , (trn_idx , val_idx) in enumerate(kf.split(df , df_target) , start = 1):
    train_x , train_y = df.iloc[trn_idx] , df_target.iloc[trn_idx]
    val_x , val_y = df.iloc[val_idx] , df_target.iloc[val_idx]

    model = lightgbm.LGBMClassifier(**params)
    # LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` arguments of
    # fit(); the equivalent behaviour is requested through callbacks.
    model.fit(
        train_x , train_y ,
        eval_set = [(val_x , val_y)] ,
        callbacks = [
            lightgbm.early_stopping(stopping_rounds = 2000 , verbose = False),
            lightgbm.log_evaluation(period = 0),
        ],
    )
    # NOTE(review): the calibrator is fit on the same rows the model was trained
    # on; scikit-learn recommends disjoint data for a prefit estimator.
    clf = CalibratedClassifierCV(model, cv='prefit', method='sigmoid')
    clf.fit(train_x , train_y)
    preds += clf.predict_proba(dft)[:,1]/kf.n_splits

    # Out-of-fold predictions: each training row is scored only by the fold model
    # that did not train on it. Previously the whole of `df` (training rows
    # included) was predicted and averaged, which is not out-of-fold.
    val_pred = clf.predict_proba(val_x)[:,1]
    oof_preds[val_idx] = val_pred
    roc.append(roc_auc_score(val_y , val_pred))
    print(n , roc[-1])

In [None]:
# Preview the submission template before filling in the predictions.
sub_sample.head()

In [None]:
# Store the fold-averaged test probabilities in the submission frame.
# NOTE: assumes sub_sample rows are in the same order as the test rows — TODO confirm.
sub_sample['Survived'] = preds

In [None]:
# Simple threshold: probabilities strictly above 0.5 become class 1, everything else 0.
sub_sample['Survived'] = sub_sample['Survived'].gt(0.5).astype('int64')

In [None]:
# Write the submission to submission.csv in the current directory, without the index column.
sub_sample.to_csv('submission.csv',index=False)