**<h4>In brief:</h4>**
* <h4>Filled missing values</h4>
* <h4>Feature engineering & encoding</h4>
* <h4>LightGBM with default parameters and cross validation</h4>
* <h4>Submission</h4>

- <h4>For parameter tuning: <a href="https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-optuna">OPTUNA optimizer</a></h4>



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import lightgbm as lgb

In [None]:
def label_encoder(c):
    """Integer-encode the values of ``c`` with a freshly fitted ``LabelEncoder``.

    A new encoder is built on every call and discarded afterwards, so only
    the encoded values are returned, not the fitted mapping.
    """
    return LabelEncoder().fit_transform(c)

In [None]:
# Load the competition files: training data, test data and the submission template.
train_df = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-apr-2021/sample_submission.csv')

# Stack train on top of test so imputation and encoding see both sets at once.
# ignore_index=True gives the combined frame a unique 0..N-1 index; without it
# each half keeps its own labels starting at 0, so labels are duplicated and any
# label-based alignment becomes ambiguous. The first len(train_df) rows are still
# the training rows, so positional re-splitting later on is unaffected.
all_df = pd.concat([train_df, test_df], ignore_index=True)

<h4>Filling missing values and feature engineering</h4>

In [None]:
# Age: replace missing values with the mean age of the passenger's Pclass
class_mean_age = all_df.groupby('Pclass')['Age'].mean()
all_df['Age'] = all_df['Age'].fillna(all_df['Pclass'].map(class_mean_age))

# Cabin: missing -> 'X', then keep only the (stripped) first character
all_df['Cabin'] = all_df['Cabin'].fillna('X').str[0].str.strip()


# Ticket: missing -> 'X'; tickets made of several whitespace-separated tokens
# keep their first token, single-token tickets collapse to 'X'
def _ticket_prefix(ticket):
    """Return the first token of a multi-token ticket string, otherwise 'X'."""
    tokens = str(ticket).split()
    return tokens[0] if len(tokens) > 1 else 'X'


all_df['Ticket'] = all_df['Ticket'].fillna('X').map(_ticket_prefix)

# Fare: replace missing values with the overall mean fare
all_df['Fare'] = all_df['Fare'].fillna(all_df['Fare'].mean())

# Embarked: missing -> 'X'
all_df['Embarked'] = all_df['Embarked'].fillna('X')

# Name: keep only the text before the first comma (the surname)
all_df['Name'] = all_df['Name'].str.split(',').str[0]

<h4>Feature encoding</h4>

In [None]:
# Columns that are integer label-encoded
label_cols = ["Name", "Ticket"]

# Columns that are one-hot encoded
onehot_cols = ["Pclass", "Sex", "Cabin", "Embarked"]

# Columns carried over unchanged; 'Survived' is the target and is split off later
numerical_cols = ["Age", "SibSp", "Parch", "Fare", "Survived"]

In [None]:
# One-hot encode every column listed in onehot_cols. Passing `columns=` matters:
# by default get_dummies only converts object/category columns, so a numeric
# column in the list (presumably Pclass) would otherwise pass through unencoded.
onehot_encoded_df = pd.get_dummies(all_df[onehot_cols], columns=onehot_cols)

# Integer-encode each label column independently (a fresh encoder per column).
label_encoded_df = all_df[label_cols].apply(label_encoder)

# Numeric features and the 'Survived' target are carried over unchanged.
numerical_df = all_df[numerical_cols]

# Reassemble the model matrix column-wise.
# NOTE: this overwrites all_df with the encoded frame, so this cell cannot be
# re-run on its own; re-run the loading and cleaning cells first.
all_df = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df], axis=1)

<h4>LightGBM</h4>

In [None]:
# Split the combined frame back into its training and test parts by position:
# the first len(train_df) rows are the training rows, the remainder the test rows.
n_train = len(train_df)
train_part = all_df.iloc[:n_train]

X = train_part.drop(columns=['Survived'])   # training features
y = train_part['Survived']                  # training target
X_ = all_df.iloc[n_train:].drop(columns=['Survived'])   # test features

In [None]:
# Baseline LightGBM settings; everything not listed stays at the library default.
# Tuned values can be found here: https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-optuna
params = dict(
    metric='auc',
    n_estimators=10000,   # upper bound on boosting rounds
    objective='binary',
)

In [None]:
# 16-fold cross-validation splitter (KFold defaults otherwise)
folds = KFold(n_splits=16)

# Buffer with one slot per training row, sized for out-of-fold predictions
oof = np.zeros(len(X))

# Accumulator for the fold-averaged test-set probabilities
predictions = np.zeros(len(X_))

In [None]:
# Start from clean accumulators so that re-running this cell does not add a
# second set of fold predictions on top of the previous run.
oof = np.zeros(X.shape[0])
predictions = np.zeros(X_.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print("Fold {}".format(fold_))
    # split() yields positional indices, so select rows with .iloc on both X and y.
    X_train = X.iloc[trn_idx]
    y_train = y.iloc[trn_idx]
    # X_test / y_test hold this fold's validation rows (not the competition test set).
    X_test = X.iloc[val_idx]
    y_test = y.iloc[val_idx]

    clf = lgb.LGBMClassifier(**params, random_state=42)
    # Early stopping and periodic logging are supplied as callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # removed in LightGBM 4.0.
    clf.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=500),
            lgb.log_evaluation(period=500),
        ],
    )

    # Keep this fold's validation probabilities for an out-of-fold estimate.
    oof[val_idx] = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1]
    # Each fold contributes 1/n_splits of the final test-set probability.
    predictions += clf.predict_proba(X_, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

# Out-of-fold accuracy at a 0.5 threshold (assumes Survived is coded 0/1).
print("OOF accuracy: {:.5f}".format(((oof >= .5).astype(int) == y.values).mean()))

In [None]:
# Feature importances of the classifier left over from the last CV fold only;
# meant as a rough hint about where feature work might pay off.
lgb.plot_importance(booster=clf)

<h4>Submission</h4>

In [None]:
# Threshold the fold-averaged probabilities at 0.5 with a vectorized comparison
# instead of calling a Python lambda once per element.
prediction_binarized = (predictions >= .5).astype(int)

# Take the submission template without its placeholder 'Survived' column and
# attach the predicted labels. Direct assignment raises if the number of
# predictions does not match the template instead of silently padding with NaN.
submission = sample_submission.drop(columns=['Survived'])
submission['Survived'] = prediction_binarized
submission.columns = ['PassengerId', 'Survived']
submission.to_csv('submission.csv', index=False)