# TPS0421 LGB+LGB.Dataset
This notebook is based on the following notebook:<br/>
https://www.kaggle.com/sishihara/upura-kaggle-tutorial-05-tuning

In [None]:
import numpy as np
import pandas as pd
import optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [None]:
# Read the three competition CSVs: labelled train rows, unlabelled test rows,
# and the sample submission that is later used as the output template.
train, test, gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-apr-2021/train.csv",
        "../input/tabular-playground-series-apr-2021/test.csv",
        "../input/tabular-playground-series-apr-2021/sample_submission.csv",
    )
)

# Stack train on top of test so feature engineering is applied to both at once.
# The original row labels are kept, so index values repeat across the two parts.
data = pd.concat([train, test], sort=False)

In [None]:
# Feature engineering on the combined train+test frame.
#
# Every step assigns the result back to the column instead of calling
# `data[col].method(..., inplace=True)`. The chained in-place form operates on
# a temporary Series and, with pandas Copy-on-Write, leaves `data` unchanged.

# Encode sex as 0/1. `to_numeric` guarantees a numeric dtype even on pandas
# versions where `replace` no longer downcasts object columns.
data['Sex'] = pd.to_numeric(data['Sex'].replace({'male': 0, 'female': 1}))

# Missing ports are filled with 'S' before encoding the three ports as integers.
data['Embarked'] = (
    data['Embarked']
    .fillna('S')
    .map({'S': 0, 'C': 1, 'Q': 2})
    .astype(int)
)

# Impute numeric gaps. NOTE(review): the mean/median are computed over train
# and test rows together, because `data` holds both.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Age'] = data['Age'].fillna(data['Age'].median())

# Family size counts the passenger plus parents/children and siblings/spouses.
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1

# Flag passengers whose family size is exactly one (travelling alone).
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

In [None]:
# Columns that are removed before modelling.
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data.drop(columns=delete_columns, inplace=True)

# Split the combined frame back apart by position: the first len(train) rows
# are the training rows, the remainder are the test rows.
# Both slices are computed before `train` is rebound, so len(train) still
# refers to the originally loaded training frame.
train, test = data.iloc[:len(train)], data.iloc[len(train):]

# Separate the target from the features.
y_train = train['Survived']
X_train = train.drop(columns='Survived')
X_test = test.drop(columns='Survived')

# Integer-coded columns that LightGBM is told to treat as categorical.
categorical_features = ['Embarked', 'Pclass', 'Sex']

In [None]:
# Hold out 30% of the labelled rows for validation, stratified on the target.
# NOTE: X_train / y_train are rebound to the 70% training part, so re-running
# this cell on its own splits the already-reduced training set again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    stratify=y_train,
    random_state=0,
)

# #1 LGBM

In [None]:
# Hand-picked baseline hyperparameters for the first LightGBM model.
params = dict(
    objective='binary',
    max_bin=300,
    learning_rate=0.05,
    num_leaves=40,
)

#### lightgbm.Dataset
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Dataset.html
#### lightgbm.train
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html

In [None]:
# Wrap the train/validation splits as LightGBM datasets. The validation set
# references the training set so both share the same feature binning.
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keywords were removed from
# `lgb.train` in LightGBM 4.0. The callbacks below need LightGBM >= 3.3.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
        lgb.log_evaluation(period=10),           # print metrics every 10 rounds
    ],
)

# Predict on the test features using the best iteration found by early stopping.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Peek at the first ten raw model outputs for the test set.
y_pred[:10]

In [None]:
# Convert the model outputs into hard 0/1 labels with a 0.5 cut-off,
# then show the first ten labels.
y_pred1 = np.greater(y_pred, 0.5).astype(int)
y_pred1[:10]

In [None]:
# Work on a copy of the sample submission. A plain assignment would alias the
# shared template, so any later write to it would silently change `sub1` too.
sub1 = gender_submission.copy()
sub1['Survived'] = y_pred1
sub1.to_csv("submission1.csv", index=False)
sub1.head()

# #2 LGBM+Optuna

In [None]:
def objective(trial):
    """Optuna objective: train LightGBM with sampled settings and score it.

    Samples `max_bin` (255-500) and `num_leaves` (32-128) from `trial`, trains
    a binary LightGBM model on the notebook-level `X_train` / `y_train` with
    early stopping, and evaluates it on `X_valid` / `y_valid`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Log loss on the validation split (lower is better).
    """
    params = {
        'objective': 'binary',
        'max_bin': trial.suggest_int('max_bin', 255, 500),
        'learning_rate': 0.05,
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
    }

    # Datasets are rebuilt for every trial because `max_bin` changes per trial.
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

    # Early stopping and logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keywords were removed from
    # `lgb.train` in LightGBM 4.0. The callbacks need LightGBM >= 3.3.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            lgb.log_evaluation(period=10),
        ],
    )

    # Score the best iteration on the held-out validation split.
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score

In [None]:
# Random search with a fixed sampler seed; eight trials of `objective` are run.
sampler = optuna.samplers.RandomSampler(seed=0)
study = optuna.create_study(sampler=sampler)
study.optimize(objective, n_trials=8)

In [None]:
# Display the hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Rebuild the parameter set using the best values found by the Optuna study.
params = {
    'objective': 'binary',
    'max_bin': study.best_params['max_bin'],
    'learning_rate': 0.05,
    'num_leaves': study.best_params['num_leaves']
}

# Fresh datasets are required because `max_bin` affects how features are binned.
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keywords were removed from
# `lgb.train` in LightGBM 4.0. The callbacks below need LightGBM >= 3.3.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
        lgb.log_evaluation(period=10),           # print metrics every 10 rounds
    ],
)

# Predict on the test features using the best iteration found by early stopping.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Convert the tuned model's outputs into hard 0/1 labels with a 0.5 cut-off,
# then show the first ten labels.
y_pred2 = np.greater(y_pred, 0.5).astype(int)
y_pred2[:10]

In [None]:
# Work on a copy of the sample submission. A plain assignment would alias the
# shared template, so writing here would also overwrite the `Survived` column
# of any other frame bound to the same object.
sub2 = gender_submission.copy()
sub2['Survived'] = y_pred2
sub2.to_csv("submission2.csv", index=False)
sub2.head()