# Titanic Dataset

In [None]:
# import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
sns.set_style('darkgrid')

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgbm
from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition files from the Kaggle input directory
DATA_DIR = "../input/tabular-playground-series-apr-2021"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Summary statistics for the training data
train.describe()

In [None]:
# Column overview of the training data
train.info()

In [None]:
# Count the missing values in each column of the training data
train.isna().sum()

## Visualization

In [None]:
# Number of rows per value of the target column
sns.countplot(data=train, x="Survived");

In [None]:
# Number of rows per value of Sex
sns.countplot(data=train, x='Sex');

In [None]:
# Line plot of Survived against Age over the whole training set
plt.figure(figsize=(14, 6))
sns.lineplot(data=train, y='Survived', x='Age');

In [None]:
# Split the training rows by Sex so each group can be plotted on its own
is_male = train["Sex"].eq('male')
is_female = train["Sex"].eq('female')
train_male = train.loc[is_male]
train_female = train.loc[is_female]

In [None]:
# Survived against Age, drawn separately for male (left) and female (right) rows
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
sns.lineplot(data=train_male, y='Survived', x='Age', ax=ax[0])
sns.lineplot(data=train_female, y='Survived', x='Age', ax=ax[1]);

* More females survived than males.

In [None]:
# Number of rows per passenger class; the trailing ';' hides the Axes repr like the other plot cells
sns.countplot(data=train, x='Pclass');

In [None]:
# Passenger class counts, split by the Survived label
plt.figure(figsize=(8, 5))
sns.countplot(data=train, x='Pclass', hue='Survived');

* More passengers from Pclass 1 and 2 survived than from Pclass 3.

In [None]:
# Median fare within each passenger class
train.groupby('Pclass')['Fare'].median()

In [None]:
# Parch counts split by the Survived label; the trailing ';' hides the Axes repr like the other plot cells
plt.figure(figsize=(8, 5))
sns.countplot(data=train, x='Parch', hue='Survived');

In [None]:
# Embarked counts split by the Survived label; the trailing ';' hides the Axes repr like the other plot cells
sns.countplot(data=train, x='Embarked', hue='Survived');

In [None]:
# Number of rows per value of SibSp
sns.countplot(data=train, x='SibSp');

In [None]:
# Histograms of Fare (left) and Age (right); the trailing ';' hides the Axes repr like the other plot cells
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
sns.histplot(data=train, x='Fare', ax=ax[0])
sns.histplot(data=train, x='Age', ax=ax[1]);

In [None]:
# Median fare within each passenger class (NOTE(review): identical to an earlier cell in this section)
train.groupby('Pclass').Fare.median()

In [None]:
# Preview the first rows instead of rendering the whole frame
train.head()

## Preprocessing

In [None]:
def fill_nan(df, age_reference=None):
    """Return a copy of ``df`` with its missing values imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``Age``, ``Fare``, ``Cabin``, ``Embarked`` and
        ``Ticket`` columns.
    age_reference : pandas.DataFrame, optional
        Frame whose ``Age`` mean is used to fill missing ages. When omitted,
        the notebook-level ``train`` frame is used, which is what this
        function has always read.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched, so re-running a
        cell that calls this function cannot corrupt its input.
    """
    if age_reference is None:
        # Historical behaviour: ages are always imputed from the training frame
        age_reference = train

    filled = df.copy()

    # Fill Age with the reference mean (Series.mean ignores missing values)
    filled["Age"] = filled["Age"].fillna(age_reference["Age"].mean())

    # Fill Fare with the median of the frame being processed
    filled["Fare"] = filled["Fare"].fillna(filled["Fare"].median())

    # Fill Cabin, Embarked and Ticket with the sentinel value 'X'
    for column in ("Cabin", "Embarked", "Ticket"):
        filled[column] = filled[column].fillna("X")

    return filled

In [None]:
# Impute the missing values of the training data (rebinds `train` to the result)
train = fill_nan(train)

In [None]:
def encode(df):
    """Return an encoded copy of ``df``.

    ``Cabin``, ``Sex`` and ``Ticket`` are replaced by integer label codes and
    one indicator column per distinct ``Embarked`` value is appended. The
    original ``Embarked`` column is kept.

    The frame passed in is not modified; previously its ``Cabin``, ``Sex``
    and ``Ticket`` columns were overwritten as a side effect even though a
    different object was returned.

    NOTE(review): the encoders and indicator columns are derived from the
    frame passed in, so codes and columns produced by separate calls (for
    example one for train and one for test) are not guaranteed to agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``Cabin``, ``Sex``, ``Ticket`` and ``Embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns plus the ``Embarked`` indicators.
    """
    encoded = df.copy()
    le = LabelEncoder()

    # Label-encode Cabin, Sex and Ticket; the encoder is re-fitted per column
    for column in ("Cabin", "Sex", "Ticket"):
        encoded[column] = le.fit_transform(encoded[column])

    # One-hot encode Embarked and append the indicator columns
    embarked_dummies = pd.get_dummies(encoded["Embarked"])
    return pd.concat([encoded, embarked_dummies], axis=1)

In [None]:
# Encode the categorical columns of the imputed training data
train_encoded = encode(train)

In [None]:
# Preview the first rows instead of rendering the whole frame
train_encoded.head()

## Feature Engineering

In [None]:
def add_features(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Columns added:

    * ``family_size`` - ``SibSp + Parch``.
    * ``Is_alone`` - 1 where ``family_size`` is 0, else 0.
    * ``Binned_Age`` / ``Binned_fare`` - integer code of the quartile bin
      that ``Age`` / ``Fare`` falls into, numbered from the lowest bin.

    The bin edges are computed from the frame passed in, so separate calls
    may use different edges. The frame is modified in place (other cells read
    the new columns through the object they passed in) and the same object
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``SibSp``, ``Parch``, ``Age`` and ``Fare`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the four columns added.
    """
    # Add family size
    df['family_size'] = df['SibSp'] + df['Parch']

    # Flag passengers whose family size is zero
    df['Is_alone'] = (df['family_size'] == 0).astype('int64')

    # Quartile-bin Age and Fare. labels=False yields the ordered bin codes
    # directly, so no separate label-encoding step is needed;
    # duplicates='drop' merges bins when repeated values make edges coincide
    # instead of raising.
    df['Binned_Age'] = pd.qcut(df['Age'], q=4, labels=False, duplicates='drop')
    df['Binned_fare'] = pd.qcut(df['Fare'], q=4, labels=False, duplicates='drop')

    return df

In [None]:
# Add the engineered features to the encoded training data (rebinds `train`)
train = add_features(train_encoded)

## Correlation matrix

In [None]:
# Correlate only numeric/boolean columns: `train` still holds text columns
# (e.g. Name, Embarked), which make DataFrame.corr() raise on pandas >= 2.0.
corr = train.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the correlation matrix computed above
sns.heatmap(corr);

## UMAP

* UMAP is a dimensionality reduction technique. It is generally faster than t-SNE.
* UMAP is fast and aims to preserve the global structure, i.e. the distances between data points both within clusters and between clusters.

In [None]:
# Features fed to UMAP: everything except identifiers, the target and the raw Embarked text
umap_excluded = ['Name', 'Survived', 'PassengerId', 'Embarked']
umap_df = train.drop(columns=umap_excluded)

In [None]:
from umap import UMAP

In [None]:
# Project the feature frame onto two components with UMAP.
# NOTE(review): no seed is passed here, so the projection may differ between runs — confirm whether that is intended.
umap_2d = UMAP(n_components=2, init='random')
projections = umap_2d.fit_transform(umap_df)

In [None]:
# Scatter plot of the two UMAP components, coloured by the Survived label of each row
fig = px.scatter(projections, x=0, y=1, color=train.Survived, labels={'color':'Survived', '0':'x_component', '1':'y_component'})
fig.show()

## Baseline Model

In [None]:
# Model inputs: drop identifiers, the target and the raw Embarked text; the target is Survived
non_feature_columns = ["Name", 'Survived', 'PassengerId', 'Embarked']
X = train.drop(columns=non_feature_columns)
y = train["Survived"]

In [None]:
def cross_validate(X, y, model, params, folds=5, display_clf_report=False):
    """Run stratified K-fold cross-validation and print per-fold metrics.

    For every fold a fresh ``model(**params)`` is fitted on the training part,
    with the held-out part passed as ``eval_set`` for early stopping. The
    fold's log loss and accuracy are then printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned with ``X``.
    model : callable
        Estimator class. Its ``fit`` must accept ``eval_set``,
        ``early_stopping_rounds`` and ``verbose``.
    params : dict
        Keyword arguments used to construct the estimator.
    folds : int, default 5
        Number of stratified splits.
    display_clf_report : bool, default False
        Also print the classification report of every fold.

    Returns
    -------
    object
        The estimator fitted on the LAST fold only (not an ensemble and not
        refitted on all data).
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    for fold, (tr_idx, ts_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        x_ts, y_ts = X.iloc[ts_idx], y.iloc[ts_idx]

        clf = model(**params)
        clf.fit(x_tr, y_tr,
                eval_set=[(x_ts, y_ts)],
                early_stopping_rounds=100,
                verbose=False)

        pred = clf.predict(x_ts)
        score = accuracy_score(y_ts, pred)

        # Log loss is defined on predicted probabilities; feeding it hard 0/1
        # labels only measures clipped extremes. Fall back to the labels only
        # for estimators that cannot produce probabilities.
        if hasattr(clf, "predict_proba"):
            loss = log_loss(y_ts, clf.predict_proba(x_ts))
        else:
            loss = log_loss(y_ts, pred)

        print(f" Log loss: {loss}")
        print(f" Accuracy: {score}")
        print()

        if display_clf_report:
            print(classification_report(y_ts, pred))

        print("-"*50)

    return clf

## LGBM Classifier

In [None]:
# Hyper-parameters passed to LGBMClassifier.
# The 'objective' key used to appear twice in this literal; it is now listed once.
lgbm_params = {
    'objective':'binary',
    'boosting':'gbdt',
    'metric': 'binary_logloss',
    'n_estimators': 1000,
    'random_state': 42,
    'learning_rate': 0.002,
    'min_child_samples': 150,
    'reg_alpha': 0.003,
    'reg_lambda': 8.97,
    'num_leaves': 20,
    'max_depth': 45,
    #'colsample_bytree': 0.18,
    #'subsample': 0.013,
    'subsample_freq': 2,
    "bagging_fraction":0.65,
    "feature_fraction":0.65,
    'max_bin': 33
}

In [None]:
# 5-fold CV for LightGBM; the returned model is the one fitted on the last fold
lgbm_model = cross_validate(X, y, LGBMClassifier, lgbm_params, folds=5, display_clf_report=False)

## Catboost

In [None]:
# Hyper-parameters passed to CatBoostClassifier
cb_params = dict(
    loss_function='Logloss',
    eval_metric='Logloss',
    random_seed=42,
    max_depth=8,
    learning_rate=0.01,
    n_estimators=2000,
    max_bin=280,
    min_data_in_leaf=64,
    l2_leaf_reg=0.01,
    subsample=0.8,
)

In [None]:
# 5-fold CV for CatBoost; the returned model is the one fitted on the last fold
cbr_model = cross_validate(X, y, CatBoostClassifier, cb_params, folds=5)

## XGBoost

In [None]:
# Trigger a garbage-collection pass before the next model is trained
import gc
gc.collect()

In [None]:
# Hyper-parameters passed to XGBClassifier
xgb_params = dict(eval_metric='logloss', seed=42, n_estimators=1000)

In [None]:
# 5-fold CV for XGBoost. The helper defined in this notebook is `cross_validate`;
# the previously used name `cross_validate_boosting` is not defined anywhere and raised NameError.
xgb_clf = cross_validate(X, y, XGBClassifier, xgb_params, folds=5)

## XGB Regressor

In [None]:
# Hyper-parameters passed to XGBRegressor
param_xgbr = dict(eval_metric='logloss', n_estimators=1000, seed=42)

In [None]:
# Hold out 20% of the rows; the split is seeded (42, like the models) so re-runs give the same partition
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit an XGBoost regressor on the training split; its raw outputs are used further down as scores for pseudo-labelling
xgbr = XGBRegressor(**param_xgbr)
model_xgbr = xgbr.fit(x_train, y_train)

In [None]:
# Regressor outputs on the hold-out split
# NOTE(review): `pred` is not evaluated anywhere in the visible notebook
pred = model_xgbr.predict(x_test)

## Pseudo Labelling

In [None]:
# Work on a copy so the raw test frame stays available for the submission section
test_pseudo = test.copy()

In [None]:
# Apply the same impute -> encode -> feature-engineering steps that were applied to the training data
test_pseudo = fill_nan(test_pseudo)
test_encoded = encode(test_pseudo)
test_pseudo = add_features(test_encoded)

In [None]:
# Score the test rows with the regressor, using the training feature columns, and store the raw outputs
preds = model_xgbr.predict(test_pseudo[X.columns])
test_pseudo["target"] = preds

In [None]:
# Keep only the test rows whose regressor output is extreme (<= 0.01 or >= 0.99)
test_pseudo = test_pseudo[ (test_pseudo['target']<=0.01) | (test_pseudo['target']>=0.99) ].copy()
# Snap the retained outputs to hard labels: 1 at or above 0.5, otherwise 0
test_pseudo.loc[ test_pseudo['target']>=0.5, 'target' ] = 1
test_pseudo.loc[ test_pseudo['target']<0.5, 'target' ] = 0

In [None]:
# Store the pseudo labels as integers
test_pseudo['target'] = test_pseudo['target'].astype('int')

In [None]:
# Expose the pseudo labels under the same column name as the training target
test_pseudo['Survived'] = test_pseudo['target']

In [None]:
# The helper column is no longer needed once Survived has been filled in
test_pseudo = test_pseudo.drop(columns=['target'])

In [None]:
# Stack the pseudo-labelled test rows under the training rows
# NOTE(review): the original indices are kept, so index labels may repeat in the result
train_pseudo = pd.concat([train, test_pseudo],axis=0)

In [None]:
# Same feature/target split as for the plain training data, on the augmented frame
pseudo_non_features = ["Name", 'Survived', 'PassengerId', 'Embarked']
X_pseudo = train_pseudo.drop(columns=pseudo_non_features)
y_pseudo = train_pseudo["Survived"]

In [None]:
# 5-fold CV for XGBoost on the pseudo-label-augmented data. The helper defined in this notebook is
# `cross_validate`; the previously used name `cross_validate_boosting` is not defined anywhere and raised NameError.
xgb_clf_pseudo = cross_validate(X_pseudo, y_pseudo, XGBClassifier, xgb_params, folds=5)

## Feature Importance

In [None]:
# Feature importance of the LightGBM model returned by cross-validation
lgbm.plot_importance(lgbm_model);

In [None]:
# Feature importance of the pseudo-label XGBoost model; the trailing ';' hides the Axes repr like the LightGBM cell
xgb.plot_importance(xgb_clf_pseudo);

## Submission

In [None]:
# Impute the missing values of the test data
test_nan = fill_nan(test)

In [None]:
# Encode the categorical columns of the imputed test data
test_encode = encode(test_nan)

In [None]:
# Add the engineered features to the encoded test data (rebinds `test` to the result)
test = add_features(test_encode)

In [None]:
# Predict from `test`, the frame returned by add_features. The previous code read `test_encode`,
# which only carries the engineered columns because add_features happens to modify its argument in place.
test_features = test[X.columns]
pred_lgbm = lgbm_model.predict(test_features)
pred_cb = cbr_model.predict(test_features)
pred_xgb = xgb_clf.predict(test_features)
pred_pseudo = xgb_clf_pseudo.predict(test_features)

In [None]:
# Blend predicted survival probabilities, not hard 0/1 labels: a weighted sum of labels that is
# later cast to int only yields 1 when every blended model predicts 1.
test_features = test[X.columns]
proba_lgbm = lgbm_model.predict_proba(test_features)[:, 1]
proba_cb = cbr_model.predict_proba(test_features)[:, 1]
proba_xgb = xgb_clf.predict_proba(test_features)[:, 1]
proba_pseudo = xgb_clf_pseudo.predict_proba(test_features)[:, 1]

# Weighted average of the probabilities, converted to a label with a 0.5 threshold
blend1 = ((0.6 * proba_xgb + 0.4 * proba_lgbm) >= 0.5).astype('int')
blend2 = ((0.4 * proba_xgb + 0.3 * proba_lgbm + 0.3 * proba_cb) >= 0.5).astype('int')
blend3 = ((0.5 * proba_xgb + 0.5 * proba_pseudo) >= 0.5).astype('int')

In [None]:
# Convert each blend to a 0/1 label with a 0.5 threshold. A bare astype('int') truncates,
# turning every value below 1.0 (e.g. 0.6) into 0.
blend1 = (blend1 >= 0.5).astype('int')
blend2 = (blend2 >= 0.5).astype('int')
blend3 = (blend3 >= 0.5).astype('int')

In [None]:
# Write one submission file per model / blend, reusing the sample-submission frame.
# Keys are the output file names; values are the predictions stored in "Survived".
named_predictions = {
    "LGBM Predictions.csv": pred_lgbm,
    "Catboost predictions.csv": pred_cb,
    "XGBoost predictions.csv": pred_xgb,
    "XGBoost(Pseudo) predictions.csv": pred_pseudo,
    "Blending 1.csv": blend1,
    "Blending 2.csv": blend2,
    "Blending 3.csv": blend3,
}

for filename, survived in named_predictions.items():
    submission["Survived"] = survived
    submission.to_csv(filename, index=False)