# <span style='color:#A80808'>Objective</span>

This notebook provides a baseline LightGBM model. A [fast comparison](https://www.kaggle.com/sytuannguyen/model-selection) has shown that LightGBM is one of the top two models for this problem.

In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from lightgbm import LGBMClassifier as lgbc
from lightgbm import early_stopping, log_evaluation

# <span style='color:#A80808'>Data</span>

The data used to train the model is prepared in this [notebook](https://www.kaggle.com/sytuannguyen/spaceship-titanic-feature-engineering).

In [None]:
# Read the feature-engineered training table, then detach the label column:
# `pop` removes 'Transported' from `train`, leaving features only.
train = pd.read_csv(
    filepath_or_buffer='../input/spaceship-titanic-feature-engineering/train.csv'
)
train_targets = train.pop('Transported')
train_targets = train_targets.astype('int64')  # labels as 64-bit integers
train.head(n=3)

# <span style='color:#A80808'>LGBMClassifier</span>

In [None]:
# Hyperparameters passed to LGBMClassifier for every cross-validation fold.
#
# Each setting is given exactly once, under its scikit-learn-style name
# (`subsample`, `subsample_freq`, `colsample_bytree`). Supplying the native
# aliases (`bagging_fraction`, `bagging_freq`, `feature_fraction`) alongside
# them is ambiguous and makes LightGBM emit alias-conflict warnings.
# Row subsampling is disabled here (`subsample_freq=0`), and every feature is
# used for every tree (`colsample_bytree=1.0`).
# `silent` is intentionally not set: it is not an accepted constructor
# argument in recent LightGBM releases.
params = dict(
    # model structure
    boosting_type='gbdt',
    num_leaves=8,
    max_depth=-1,
    min_child_samples=11,
    min_child_weight=0.001,
    min_split_gain=0.4,
    # boosting schedule
    learning_rate=0.15,
    n_estimators=180,
    # row / column sampling
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    subsample_for_bin=200000,
    # regularisation
    reg_alpha=0.005,
    reg_lambda=0.2,
    # task and bookkeeping
    objective=None,
    class_weight=None,
    importance_type='split',
    random_state=8842,
    n_jobs=-1,
)

# <span style='color:#A80808'>Cross-validation</span>

In [None]:
# Stratified 10-fold cross-validation.
# One classifier is fitted per fold; every fitted model is kept in `models`
# so the whole set can be reused later for test-time prediction, and the
# per-fold accuracies are collected in `scores`.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores = []
models = []
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train_targets)):
    # `split` yields positional row numbers, so select with .iloc on both the
    # features and the targets (plain [] on a Series is a label lookup).
    X_train, X_val = train.iloc[train_idx], train.iloc[val_idx]
    y_train, y_val = train_targets.iloc[train_idx], train_targets.iloc[val_idx]

    model = lgbc(**params)
    # Early stopping and log silencing are configured through callbacks:
    # the `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # removed in LightGBM 4.0 and raise a TypeError there.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            early_stopping(stopping_rounds=50, verbose=False),
            log_evaluation(period=0),
        ],
    )
    models.append(model)

    y_pred = model.predict(X_val)

    # NOTE: the same validation fold also drives early stopping, so this
    # accuracy is a slightly optimistic estimate.
    score = accuracy_score(y_val, y_pred)  # signature is (y_true, y_pred)

    print(f'Fold {fold}, accuracy score: {score}')
    print('_'*60)
    scores.append(score)

print(f'Average accuracy score: {np.mean(scores)}')

# <span style='color:#A80808'>Prediction</span>

In [None]:
# Feature-engineered test table produced by the same preparation notebook.
test = pd.read_csv(
    filepath_or_buffer='../input/spaceship-titanic-feature-engineering/test.csv'
)

In [None]:
# One array of predicted class labels per fold model, in the order the
# models were stored during cross-validation.
preds = [fold_model.predict(test) for fold_model in models]

# <span style='color:#A80808'>Submission</span>

In [None]:
# Competition-provided template; its 'Transported' column is overwritten below.
submission = pd.read_csv(
    filepath_or_buffer='../input/spaceship-titanic/sample_submission.csv'
)

In [None]:
# Majority vote across the fold models: stack the per-model label arrays and
# take the most frequent label for each test row (mode along the model axis).
# NOTE(review): with an even number of models a tie is possible; confirm the
# tie-breaking behaviour of stats.mode is acceptable.
stacked_votes = np.array(preds)
majority_label = stats.mode(stacked_votes, axis=0)[0]
# Flatten to one label per row and convert the labels to booleans.
submission['Transported'] = majority_label.reshape(-1).astype(bool)
submission.to_csv("submission.csv", index=False)

submission.head()