> NOTE: Some ideas for feature engineering have been taken from the brilliant notebooks by [Anisotropic](https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python) and [Sina](https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier).

In [None]:
# load libraries
import os
import gc
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn import metrics
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb

sns.set_palette('deep')

%matplotlib inline

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
path = '/kaggle/input/tabular-playground-series-apr-2021'
train_csv = os.path.join(path, 'train.csv')
test_csv = os.path.join(path, 'test.csv')
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train_data.info()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Class balance of the target: number of rows per 'Survived' value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='Survived', ec='k', ax=ax)
plt.show()

In [None]:
# Survival rate among female passengers.
# A single .loc[rows, column] lookup avoids chained indexing, and the
# vectorised mean of the target column replaces a Python-level sum()/len().
women = train_data.loc[train_data['Sex'] == 'female', 'Survived']
survival_rate_women = women.mean()
print('Survival rate of women: {:.2f}'.format(survival_rate_women*100))

In [None]:
# Survival rate among male passengers.
# A single .loc[rows, column] lookup avoids chained indexing, and the
# vectorised mean of the target column replaces a Python-level sum()/len().
men = train_data.loc[train_data['Sex'] == 'male', 'Survived']
survival_rate_men = men.mean()
print('Survival rate of men: {:.2f}'.format(survival_rate_men*100))

## Visualizing Data

In [None]:
# Histogram (30 bins) of the natural log of Fare in the training set.
fig, ax = plt.subplots(figsize=(8, 6))
log_fare = np.log(train_data['Fare'])
sns.histplot(data=log_fare, bins=30, ax=ax)
plt.show()

In [None]:
# Histogram (30 bins) of Age in the training set.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=train_data['Age'], bins=30, ax=ax)
plt.show()

In [None]:
# Number of training rows per Pclass value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='Pclass', ec='k', ax=ax)
plt.show()

In [None]:
# Number of training rows per Sex, split by the 'Survived' target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='Sex', hue='Survived', ec='k', ax=ax)
plt.show()

## Data Preprocessing and Feature Engineering

In [None]:
# Report the number of missing values per column, first for train then for test.
for frame in (train_data, test_data):
    null_counts = frame.isnull().sum()
    print(null_counts)
    print('-' * 50)

In [None]:
# fill null values in the age
def fillnan_age(df):
    """Impute missing ``Age`` values in place with random integers.

    Every missing entry is replaced by an integer drawn uniformly from
    ``[int(mean - std), int(mean + std))``, where ``mean`` and ``std`` are
    computed from the non-missing ages of ``df`` itself.  Draws come from
    NumPy's global random state, so call ``np.random.seed`` beforehand for
    reproducible results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Age`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    missing = df['Age'].isnull()
    age_null_count = int(missing.sum())
    if age_null_count == 0:
        # Nothing to impute; leave the frame untouched.
        return df

    age_avg = df['Age'].mean()
    age_std = df['Age'].std()
    # randint works on integers; truncate the float bounds explicitly rather
    # than relying on an implicit cast.
    low = int(age_avg - age_std)
    high = int(age_avg + age_std)
    random_age = np.random.randint(low, high, size=age_null_count)

    # One .loc assignment instead of df['Age'][mask] = ...: chained assignment
    # writes to a temporary under pandas copy-on-write and would silently
    # leave the missing values in place.
    df.loc[missing, 'Age'] = random_age
    return df

In [None]:
# Impute missing values and derive new features on train and test in place.
data = [train_data, test_data]
for df in data:
    # Fill the missing values.  Results are assigned back to the column:
    # fillna(inplace=True) on a column selection is chained assignment, which
    # is deprecated in pandas 2.x and has no effect under copy-on-write.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df = fillnan_age(df)
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # count all family members: SibSp + Parch + the passenger
    df['FamilyCount'] = df['SibSp'] + df['Parch'] + 1

    # ticket prefix: first whitespace-separated token when the ticket has
    # more than one token, otherwise 'N/A'
    df['TicketPrefix'] = df['Ticket'].map(
        lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'N/A'
    )

    # last name (text before the first comma) and how often it occurs in
    # this dataset
    df['LastName'] = df['Name'].map(lambda x: str(x).split(',')[0])
    duplicated_lastname = df['LastName'].value_counts().to_dict()
    df['NumLastName'] = df['LastName'].map(duplicated_lastname)

    # check if the passenger had a cabin (1 when Cabin is present, 0 when missing)
    df['HasCabin'] = df['Cabin'].notnull().astype(int)

    # new column if the passenger is alone
    df['IsAlone'] = (df['FamilyCount'] == 1).astype(int)

# Bin edges are learned on the training set only and then applied to both
# datasets, so a given group label covers the same value range in train and
# test.  The outer edges are opened to +/-inf so test values outside the
# training range still land in the first/last bin instead of becoming NaN.
_, log_fare_edges = pd.cut(np.log(train_data['Fare'] + 0.0001), 4, retbins=True)
_, age_edges = pd.cut(train_data['Age'], 5, retbins=True)
for edges in (log_fare_edges, age_edges):
    edges[0], edges[-1] = -np.inf, np.inf

for df in data:
    # logarithm of Fare distributed in 4 bins (small offset avoids log(0))
    df['LogFareGroup'] = pd.cut(np.log(df['Fare'] + 0.0001), bins=log_fare_edges,
                                labels=range(4))

    # Distribute the age in 5 bins
    df['AgeGroup'] = pd.cut(df['Age'], bins=age_edges, labels=range(5))

    # remaining missing values per column after imputation
    print(df.isnull().sum())
    print('-'*50)

## Visualizing new features

In [None]:
# Number of training rows per IsAlone value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='IsAlone', ec='k', ax=ax)
plt.show()

In [None]:
# Number of training rows per Embarked value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='Embarked', ec='k', ax=ax)
plt.show()

In [None]:
# Number of training rows per FamilyCount value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='FamilyCount', ec='k', ax=ax)
plt.show()

In [None]:
# Number of training rows per HasCabin value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='HasCabin', ec='k', ax=ax)
plt.show()

In [None]:
# Number of training rows per LogFareGroup bin, split by the 'Survived' target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='LogFareGroup', hue='Survived', ec='k', ax=ax)
plt.show()

In [None]:
# Number of training rows per AgeGroup bin, split by the 'Survived' target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='AgeGroup', hue='Survived', ec='k', ax=ax)
plt.show()

In [None]:
# One hot encode the categorical features
# Train and test are stacked so both get the same dummy columns and the same
# label-encoder mapping, then split back by position.
n_train = len(train_data)  # remembered before train_data is rebound below
df = pd.concat([train_data, test_data], axis=0)
df = pd.get_dummies(df, columns=['Pclass', 'Embarked', 'LogFareGroup', 'AgeGroup'])
label_encode_cols = ['Sex', 'TicketPrefix']
for col in label_encode_cols:
    le = preprocessing.LabelEncoder()
    df[col] = le.fit_transform(df[col])
# Explicit copy so later edits to train_data never act on a slice of df.
train_data = df.iloc[:n_train, :].copy()
# drop() returns a new frame; dropping in place on an iloc slice triggers
# SettingWithCopyWarning.  The test rows carry no target values.
test_data = df.iloc[n_train:, :].drop('Survived', axis=1)
del df
gc.collect()
train_data.head()

In [None]:
# Model inputs: every column except the identifier, raw text fields, the
# target, and SibSp/Parch (already combined into FamilyCount).
excluded_columns = ['PassengerId', 'Name', 'Survived', 'SibSp', 'Parch',
                    'Cabin', 'Ticket', 'LastName']
features = [col for col in train_data.columns if col not in excluded_columns]
features

In [None]:
# Correlation of every feature (and the target itself) with 'Survived',
# kept as a one-column DataFrame.
corr_matrix = train_data[features + ['Survived']].corr()
correlations = corr_matrix['Survived'].to_frame()
correlations

In [None]:
# Columns whose absolute correlation with the target exceeds 0.08
# (the target itself is still in the list at this point).
strong_mask = correlations['Survived'].abs() > 0.08
pruned_features = correlations.loc[strong_mask].index.tolist()
pruned_features

In [None]:
# class to help training different models
class TrainHelper:
    """Wrap a classifier class behind one fit/predict/predict_proba interface.

    The classifier is instantiated with ``params`` plus ``random_state=seed``.
    When the classifier's class name is ``RidgeClassifier`` or
    ``LogisticRegression``, the columns listed in ``scale_features`` are
    standardised: the scaler is fitted on the training data inside ``fit`` and
    that same fitted scaler is re-used for every later prediction.

    Parameters
    ----------
    clf : callable
        Classifier class (not an instance); must accept ``random_state``.
    seed : int, default 23
        Value passed to the classifier as ``random_state``.
    params : dict or None
        Extra constructor keyword arguments.  The dict is copied, never
        modified.
    scale_features : list or None
        Columns to standardise; required for the linear models named above.

    Raises
    ------
    ValueError
        If a linear model is requested without ``scale_features``.
    """

    def __init__(self, clf, seed=23, params=None, scale_features=None):
        # Copy before injecting the seed so the caller's dict stays untouched.
        clf_params = dict(params) if params is not None else {}
        clf_params['random_state'] = seed
        self.clf = clf(**clf_params)

        self.scale_features = scale_features
        self.normalized_features = False
        self._preprocessor = None  # fitted ColumnTransformer, set by fit()
        if type(self.clf).__name__ in ['RidgeClassifier', 'LogisticRegression']:
            if scale_features is None:
                raise ValueError('Cannot do feature scaling')
            self.normalized_features = True

    def fit(self, X_train, y_train):
        """Fit the scaler (if enabled) and the classifier on the training data."""
        if self.normalized_features:
            self._preprocessor = self.normalize()
            X_train = self._preprocessor.fit_transform(X_train)

        return self.clf.fit(X_train, y_train)

    def _apply_scaling(self, X):
        """Transform ``X`` with the scaler fitted in ``fit``; no-op if scaling is off."""
        if not self.normalized_features:
            return X
        if self._preprocessor is None:
            raise RuntimeError('fit must be called before predicting')
        # transform, not fit_transform: re-fitting on the data being predicted
        # would scale it with different statistics than the training data.
        return self._preprocessor.transform(X)

    def predict(self, X_test):
        """Return the classifier's class predictions for ``X_test``."""
        return self.clf.predict(self._apply_scaling(X_test))

    def normalize(self):
        """Build an unfitted transformer that standardises ``scale_features``
        and passes every other column through unchanged."""
        ct = ColumnTransformer(
                [('scale', preprocessing.StandardScaler(), self.scale_features)],
                remainder='passthrough',
                n_jobs=-1
            )
        return ct

    def predict_proba(self, X_test):
        """Return column 1 of the classifier's ``predict_proba`` output.

        If the wrapped classifier has no ``predict_proba`` method, a message
        string is returned instead of an array; callers must check for it.
        """
        if hasattr(self.clf, 'predict_proba'):
            return self.clf.predict_proba(self._apply_scaling(X_test))[:, 1]
        else:
            return 'The classifier has no method predict_proba'

In [None]:
# applying k-fold cross-validation
def cross_validate(X, y, X_test, clf, n_folds, seed=23, threshold=0.7):
    """Run stratified k-fold CV and collect out-of-fold and test predictions.

    For each fold the model is re-fitted on the training part, scored on the
    validation part (accuracy and, when probabilities are available, ROC-AUC)
    and used to predict ``X_test``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : array-like
        Training labels, aligned with the rows of ``X`` by position.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    clf : object
        Wrapper exposing ``clf`` (the underlying estimator), ``fit``,
        ``predict`` and ``predict_proba``; ``predict_proba`` may return a
        string when probabilities are unavailable.
    n_folds : int
        Number of stratified folds.
    seed : int, default 23
        Random state for the fold shuffling.
    threshold : float, default 0.7
        Cut-off applied to the fold-averaged test predictions (with 0/1
        labels this is the fraction of folds that must predict 1).

    Returns
    -------
    train_oof : numpy.ndarray
        Out-of-fold class predictions, one per training row.
    test_preds : numpy.ndarray
        Integer array: 1 where the averaged test prediction exceeds
        ``threshold``, else 0.
    """
    print(f'Cross-validating for {type(clf.clf).__name__}')
    # Positional indexing below requires a plain array for the labels.
    y = np.asarray(y)
    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    train_oof = np.zeros((len(X), ))
    test_preds = 0
    for f_, (t_, v_) in enumerate(kf.split(X=X, y=y)):
        # kf.split yields row positions, so select with iloc; loc would only
        # work when the index happens to be 0..n-1.
        X_train, y_train = X.iloc[t_], y[t_]
        X_valid, y_valid = X.iloc[v_], y[v_]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_valid)
        y_pred_proba = clf.predict_proba(X_valid)
        accuracy = metrics.accuracy_score(y_valid, y_pred)
        if not isinstance(y_pred_proba, str):
            roc_score = metrics.roc_auc_score(y_valid, y_pred_proba)
        else:
            # No probabilities available for this classifier.
            roc_score = 0.0
        train_oof[v_] = y_pred
        # Accumulate the mean of the per-fold class predictions.
        test_preds += clf.predict(X_test) / n_folds
        print(f'Fold: {f_+1}, accuracy: {accuracy:.6f}, roc-auc score: {roc_score:.6f}')
    test_preds = (test_preds > threshold).astype(int)
    # accuracy_score expects (y_true, y_pred)
    print(f'Overall training accuracy: {metrics.accuracy_score(y, train_oof):.6f}')
    print('-'*60)
    return train_oof, test_preds

In [None]:
# Display the training frame restricted to the model input columns.
train_data[features]

In [None]:
NUM_FOLDS = 10
SEED = 2020

# Shuffle the training rows.  The shuffle is seeded so that repeated runs see
# the same row order and therefore the same folds and scores.
train_data = train_data.sample(frac=1, random_state=SEED).reset_index(drop=True)
# NOTE: pruned_features is cleaned of the target here, but the models below
# are trained on the full `features` list.
pruned_features = [feature for feature in pruned_features if feature != 'Survived']
X = train_data[features]
X_test = test_data[features]
y = train_data['Survived'].values
# columns standardised for the linear model
scale_features = ['Age', 'Fare', 'FamilyCount', 'TicketPrefix', 'NumLastName']

# create models
lgbm_params = dict(
    n_jobs=-1,
    n_estimators=1000,
    learning_rate=0.02,
    num_leaves=39,
    colsample_bytree=0.6993443635848076,
    subsample=0.7146065596315723,
    max_depth=28,
    reg_alpha=13.0124692806962,
    reg_lambda=17.429087848443793,
    cat_smooth=8.61671087256764,
    min_split_gain=0.0222415,
    min_child_weight=39.3259775,
    silent=-1,
    verbose=-1,
)
lgbm = TrainHelper(clf=lgb.LGBMClassifier, params=lgbm_params)

cb_params = {'colsample_bylevel': 0.06780062117211266, 'depth': 13, 
             'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 
             'subsample': 0.10286610214134947, 'custom_loss':['Accuracy'],
             'logging_level':'Silent'}
cb = TrainHelper(clf=CatBoostClassifier, params=cb_params)

ridge_params = {'alpha': 0.01, 'fit_intercept': True}
ridge = TrainHelper(clf=linear_model.RidgeClassifier, params=ridge_params,
                    scale_features=scale_features)

# cross-validate for each model; each call returns the out-of-fold training
# predictions and the thresholded test predictions
lgbm_train_oof, lgbm_test_preds = cross_validate(X, y, X_test, lgbm, n_folds=NUM_FOLDS, 
                                                 seed=SEED, threshold=0.75)
cb_train_oof, cb_test_preds = cross_validate(X, y, X_test, cb, n_folds=NUM_FOLDS, 
                                             seed=SEED, threshold=0.75)
ridge_train_oof, ridge_test_preds = cross_validate(X, y, X_test, ridge, n_folds=NUM_FOLDS, 
                                                   seed=SEED, threshold=0.75)

In [None]:
# Out-of-fold predictions of the base models, one integer column per model.
oof_by_model = {
    'LightGBM': lgbm_train_oof,
    'CatBoost': cb_train_oof,
    'RidgeClf': ridge_train_oof,
}
base_predictions_df = pd.DataFrame(oof_by_model).astype(int)
base_predictions_df

In [None]:
# Heatmap of the pairwise correlation between the base models' predictions.
fig, ax = plt.subplots(figsize=(8, 6))
prediction_corr = base_predictions_df.corr()
sns.heatmap(prediction_corr, square=True,
            cmap=plt.cm.RdBu, annot=True, alpha=0.6, ax=ax)
plt.show()

In [None]:
# Second-level model: an XGBoost classifier trained on the base models'
# out-of-fold predictions and applied to their test predictions.
X_train_2 = base_predictions_df.values
# one column per base model, in the same order as base_predictions_df
X_test_2 = np.column_stack((lgbm_test_preds, cb_test_preds, ridge_test_preds))
xgb_params = dict(
    learning_rate=0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
xg = xgb.XGBClassifier(**xgb_params)
xg.fit(X_train_2, y)
predictions = xg.predict(X_test_2)
predictions = predictions.astype(int)
predictions[:5]

In [None]:
# Pair each test PassengerId with its stacked prediction and write the
# submission file.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)

output.to_csv('stacked_submission.csv', index=False)
print("Submission was successfully saved!")