In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from mlxtend.classifier import StackingCVClassifier

## Inspired by the ensemble notebook from [Remek Kinas](https://www.kaggle.com/remekkinas), I explored ensembling with an ANN instead of Logistic Regression as the meta-classifier. If you like the code or have ideas for improving it, please upvote and comment :) 

# 1. Description

The ensemble technique works best when the base models are not correlated.
There are three basic ensembling techniques: <br> 
***Max Voting***<br>
Each model's prediction counts as one vote. In max voting, the final prediction is the class that receives the most votes.
- classifier 1 – class A
- classifier 2 – class B
- classifier 3 – class B
- Output:     **Class B**

***Averaging***<br>
The final output is the average of all predictions (regression problems).
- regressor 1 – 200
- regressor 2 – 300 
- regressor 3 – 400
- Output:    **300**   

***Weighted Averaging***<br>
The base model with higher predictive power is more important.
- Output:     **Weighted Average**



# 2. Set up script parameters

In [None]:
# Notebook-wide settings.
# SEED is passed to the hold-out split and to the models/ensembles that accept a seed.
# FOLDS and PROBAS are declared here but not referenced by the cells shown in this notebook.
SEED, FOLDS = 1992, 5
PROBAS = True

# Name of the label column in the competition data.
TARGET = 'Survived'

# 3. Load TPS-04 competition data

In [None]:
# Competition files plus externally produced pseudo-labels for the test rows.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-apr-2021/sample_submission.csv')
pseudo_labels = pd.read_csv("../input/tps04preds/dae.csv")

# Attach the pseudo-labels to the test rows by position (row i of dae.csv -> row i of test.csv).
# NOTE(review): assumes dae.csv is in the same row order as test.csv - confirm.
test[TARGET] = pseudo_labels[TARGET].to_numpy()

# One frame holding train followed by the pseudo-labelled test rows, with a fresh 0..n-1 index.
all_df = pd.concat([train, test]).reset_index(drop=True)

In [None]:
# Percentage of missing values per column of `train`, limited to the six columns with the most gaps.
missing_counts = train.isna().sum().sort_values(ascending=False)
null_data = (missing_counts / len(train) * 100)[:6]

fig, ax = plt.subplots(1, 1, figsize=(10, 7))

# Grey bars of height 100 act as the backdrop; the default-coloured bars on top show the missing share.
ax.bar(null_data.index, 100, color='#dadada', width=0.6)
bar = ax.bar(null_data.index, null_data, width=0.6)
ax.bar_label(bar, fmt='%.01f %%')

# The bar labels carry the numbers, so the y axis is hidden.
ax.spines.left.set_visible(False)
ax.set_yticks([])
ax.set_title('Null Data Ratio', fontweight='bold')
plt.show()

# 4. Preprocess data
Preprocessing logic: taken from the [BIZEN](https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble) notebook, which is used as a benchmark to compare results.

In [None]:
# Age: fill missing values with the overall mean age
# (one global value for every row; it is not computed per Pclass).
all_df['Age'] = all_df['Age'].fillna(all_df['Age'].mean())

# Cabin: fill missing values with 'X', then keep only the first character
all_df['Cabin'] = all_df['Cabin'].fillna('X').map(lambda x: x[0].strip())

# Ticket: keep the first whitespace-separated token when the ticket has more than one token;
# single-token tickets and missing tickets all become 'X'
all_df['Ticket'] = all_df['Ticket'].fillna('X').map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')

# Fare: fill missing values with the median fare of the row's Pclass, then apply log1p.
# NOTE(review): log1p is applied in place, so re-running this cell transforms Fare twice.
fare_map = all_df[['Fare', 'Pclass']].dropna().groupby('Pclass').median().to_dict()
all_df['Fare'] = all_df['Fare'].fillna(all_df['Pclass'].map(fare_map['Fare']))
all_df['Fare'] = np.log1p(all_df['Fare'])

# Embarked: fill missing values with 'X'
all_df['Embarked'] = all_df['Embarked'].fillna('X')

# Name: keep the text before the first comma
# (the surname, assuming names are formatted "Surname, Given" - TODO confirm)
all_df['Name'] = all_df['Name'].map(lambda x: x.split(',')[0])

In [None]:
# (rows, columns) of the combined train + pseudo-labelled test frame after preprocessing
all_df.shape

# 5. Label Encoding

In [None]:
# Column groups; each group gets its own encoding below.
label_cols = ['Name', 'Ticket', 'Sex']                      # -> integer codes
onehot_cols = ['Cabin', 'Embarked']                         # -> dummy columns
numerical_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']  # -> standardised

def label_encoder(c):
    """Return integer codes for the values of a single column.

    A new LabelEncoder is fitted on every call, so the codes of different
    columns are independent of each other.
    """
    le = LabelEncoder()
    return le.fit_transform(c)

scaler = StandardScaler()

onehot_encoded_df = pd.get_dummies(all_df[onehot_cols])
label_encoded_df = all_df[label_cols].apply(label_encoder)
# Keep all_df's index explicitly so the column-wise concat below aligns row for row.
numerical_df = pd.DataFrame(scaler.fit_transform(all_df[numerical_cols]),
                            columns=numerical_cols, index=all_df.index)
target_df = all_df[TARGET]

# The former "remove duplicates" step was dropped: all_df carries a unique 0..n-1 index
# from reset_index(drop=True), so it filtered nothing and its result was overwritten
# by the concat below.

# Replace all_df with the model-ready matrix: scaled numerics, label codes, dummies, target.
all_df = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df, target_df], axis=1)

# 6. Create Train and Test Datasets

In [None]:
# Split the encoded frame into predictors and label.
# all_df contains the train rows and the pseudo-labelled test rows, so both end up in X / y.
y = all_df[TARGET]
X = all_df.drop(columns=[TARGET])

print (f'X:{X.shape} y: {y.shape} \n')

# Reserve 10 % of the rows as a hold-out set; the split is seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.1,
    random_state = SEED,
)
print (f'X_train:{X_train.shape} y_train: {y_train.shape}')
print (f'X_test:{X_test.shape} y_test: {y_test.shape}')

In [None]:
# Everything after the first len(train) rows is the competition test set;
# drop the (pseudo-)label so only the feature columns remain.
test = all_df.iloc[len(train):].drop(columns=[TARGET])
print (f'test:{test.shape}')

# 7. Create Meta Classifier

In [None]:
# Keyword arguments for LGBMClassifier.
lgbm_params = {
    'metric': 'binary_logloss',
    'n_estimators': 9000,
    'objective': 'binary',
    'random_state': SEED,
    'learning_rate': 0.02,
    'min_child_samples': 150,
    'reg_alpha': 3e-5,
    'reg_lambda': 9e-2,
    'num_leaves': 20,
    'max_depth': 16,
    'colsample_bytree': 0.8,
    'subsample': 0.7,
    'subsample_freq': 2,
    'max_bin': 240,
    'device':'gpu'   # requires a GPU-enabled LightGBM build
}

# Keyword arguments for CatBoostClassifier.
# NOTE(review): this model uses its own fixed seed (314) rather than SEED.
cat_params = {#'iterations': 5000,
          'eval_metric': 'AUC',
          'loss_function':'Logloss',
          'od_type':'Iter',
          'num_trees':50000,
          'max_depth': 6, 
          'l2_leaf_reg': 3,
          'bootstrap_type': 'Bayesian',
          'max_bin': 254,
          'grow_policy': "Lossguide",
          'random_seed': 314,
          'min_data_in_leaf': 64,
          'verbose': None,
          'logging_level': 'Silent',
          'task_type': 'GPU'   # requires a GPU runtime
}

# Keyword arguments for ExtraTreesClassifier.
# random_state is set so the bootstrapped, randomised trees are reproducible run to run,
# like the other seeded models in this notebook.
ETC_params = {
    'bootstrap':True,
    'criterion': 'entropy',
    'max_features': 0.55,
    'min_samples_leaf': 8,
    'min_samples_split': 4,
    'n_estimators': 100,
    'random_state': SEED
}

# Keyword arguments intended for a RandomForestClassifier.
rf_params = {
    'max_depth': 15,
    'min_samples_leaf': 8,
    'random_state': SEED
}

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout,Reshape 
from tensorflow.keras.models import Model
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier #To teat ANN as classifier

def ann_network(n_features=6):
    """Build and compile the small dense network used as a stacking meta-classifier.

    Parameters
    ----------
    n_features : int, default 6
        Number of values in one input row. The default matches the meta-features
        of three base classifiers stacked with ``use_probas=True`` on a binary
        target (3 classifiers x 2 class probabilities).

    Returns
    -------
    tensorflow.keras.Model
        Compiled model: one hidden ReLU layer of 60 units followed by a single
        sigmoid output, trained with binary cross-entropy.
    """
    # Keras `shape` excludes the batch axis, so a 2-D (n_samples, n_features)
    # matrix is declared as (n_features,). The previous (None, 32, 6) described
    # a 4-D tensor that never matched the data actually fed to the model.
    inputs = Input(shape=(n_features,))
    hidden = Dense(60, activation='relu')(inputs)
    outputs = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs, outputs)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = tf.keras.optimizers.SGD(learning_rate=1e-4, decay=1e-6, momentum=0.8, nesterov=True)
    model.compile(
        loss="binary_crossentropy",
        optimizer=opt,
        metrics=["accuracy"]
    )

    return model



In [None]:
from mlxtend.classifier import StackingCVClassifier,EnsembleVoteClassifier
from sklearn.linear_model import LogisticRegression

# Base learners shared by every ensemble below.
cl1 = CatBoostClassifier(**cat_params)
cl2 = LGBMClassifier(**lgbm_params)
cl3 = ExtraTreesClassifier(**ETC_params)

# Plain logistic regression; instantiated but not used by any ensemble in this cell.
mlr = LogisticRegression()

# scikit-learn style wrapper around the Keras network so it can act as a meta-classifier.
ann_clf = KerasClassifier(ann_network, epochs=4)

# Stacking: base-learner class probabilities feed the ANN meta-classifier
clf = StackingCVClassifier(classifiers= [cl1,cl2,cl3], 
                            meta_classifier = ann_clf, 
                            use_probas = True, 
                            random_state = SEED) 

# Soft Voting Ensemble (weighted average of class probabilities)
S_eclf = EnsembleVoteClassifier(clfs=[cl1, cl2, cl3],
                              weights=[1, 1, 2], voting='soft')

# Hard Voting Ensemble (weighted majority vote on predicted labels)
H_eclf = EnsembleVoteClassifier(clfs=[cl1, cl2, cl3],
                              weights=[1, 1, 3], voting='hard')

# Stacking with the soft-voting ensemble as meta-classifier.
# NOTE: despite its name, this variable does not use the ANN.
AnnStakced_clf =  StackingCVClassifier(classifiers= [cl1,cl2,cl3],
                            meta_classifier = S_eclf, 
                            use_probas = True,    
                            random_state = SEED) 

# Fit the classifier variations
clf.fit(X_train, y_train) 
H_eclf.fit(X_train, y_train) 
S_eclf.fit(X_train, y_train)
AnnStakced_clf.fit(X_train, y_train) 

# 8. Predict and Validate (AUC)

In [None]:
# Hold-out evaluation: positive-class probability and ROC AUC for every ensemble.
# "stacked CLF" is the stack with the ANN meta-classifier; 'Ann Clf' is the stack whose
# meta-classifier is the soft-voting ensemble (see where the objects are built).
classifiers = {"stacked CLF": clf,
              'Soft voted CLF':S_eclf,
               'Hard voted CLF':H_eclf,
              'Ann Clf':AnnStakced_clf} 
NUM_CLAS = len(classifiers)  # number of panels drawn by the plotting cell

scores = {}
for key in classifiers:
    estimator = classifiers[key]
    try:
        # The ensemble's own output. predict_meta_features() only returns the base
        # learners' probabilities, so its column 1 is a single base model's score,
        # not the stacked prediction.
        y_pred = estimator.predict_proba(X_test)[:,1]
    except Exception as err:
        if not hasattr(estimator, 'predict_meta_features'):
            raise
        # Best-effort fallback, reported loudly so the AUC is not mistaken for the ensemble's.
        print(f"{key}: predict_proba failed ({err!r}); "
              f"falling back to base-learner meta-features")
        y_pred = estimator.predict_meta_features(X_test)[:,1]
    scores[f"{key}"] = y_pred
    auc = metrics.roc_auc_score(y_test, y_pred)
    print(f"{key} -> AUC: {auc:.3f}")

# One column of scores per ensemble, plus the hold-out labels aligned by position.
preds = pd.DataFrame(scores)
preds[TARGET] = y_test.to_numpy()

print(preds.sample(10))

# 9. Plot Results

In [None]:
# Global seaborn look: white panels with light-grey edges, grid and tick colours.
sns.set(font_scale = 1)
sns.set_style({"axes.facecolor": "1.0", "axes.edgecolor": "0.85", "grid.color": "0.85",
               "grid.linestyle": "-", 'axes.labelcolor': '0.4', "xtick.color": "0.4",
               'ytick.color': '0.4'})

# One panel per ensemble, side by side.
f, ax = plt.subplots(figsize=(13, 4), nrows=1, ncols = NUM_CLAS)

# These do not depend on the ensemble, so they are prepared once:
# the text-box style and the hold-out rows split by true label.
props = dict(boxstyle='round', facecolor='white', alpha=0.5)
false_pred = preds[preds[TARGET] == 0]
true_pred = preds[preds[TARGET] == 1]

for counter, key in zip(range(NUM_CLAS), classifiers):
    panel = ax[counter]

    y_pred = preds[key]
    auc = metrics.roc_auc_score(y_test, y_pred)
    textstr = f"AUC: {auc:.3f}"

    # Score distribution for label 0 (red) first, then label 1 (green), on the same panel.
    for subset, colour in ((false_pred, 'red'), (true_pred, 'green')):
        sns.distplot(subset[key], hist=True, kde=True,
                     bins=50, color = colour,
                     hist_kws={'edgecolor':'black'}, ax = panel)

    # AUC annotation in the top-left corner, in axes coordinates.
    panel.text(0.05, 0.95, textstr, transform=panel.transAxes, fontsize=14,
               verticalalignment = "top", bbox=props)

    panel.set_title(f"{key}")
    panel.set_xlim(0,1)
    panel.set_xlabel("Probability")

plt.tight_layout()

# 10. Final Prediction

In [None]:
# Score the competition test rows with the ANN-stacked ensemble.
# predict_meta_features() returns the base learners' class probabilities (the inputs of
# the meta-classifier), so taking its column 1 yields one base model's score rather than
# the stacked prediction. Use the ensemble's predict_proba; fall back to the
# meta-features only if that call fails, and say so.
try:
    test_preds1 = clf.predict_proba(test)[:,1]
except Exception as err:
    print(f"clf.predict_proba failed ({err!r}); falling back to base-learner meta-features")
    test_preds1 = clf.predict_meta_features(test)[:,1]


In [None]:
# tip -> Alexander Ryzhkov
# Cut-off = the 34911-th largest predicted score (or the smallest score if there are fewer rows).
ranked_scores = pd.Series(test_preds1).sort_values(ascending = False)
threshold = ranked_scores.head(34911).values[-1]
print(f"Current threshold is: {threshold}")

# Creation of Multiple Submission for Voting
# submit_1: 1 where the score is strictly above the cut-off (a score equal to it maps to 0).
# submit_2: the pseudo-labels loaded earlier.
above_cutoff = test_preds1 > threshold
submission['submit_1'] = above_cutoff.astype(int)
submission['submit_2'] = pseudo_labels[TARGET]


In [None]:

# For each row, count how many submit_* columns vote 1, then tabulate those counts.
submit_cols = [col for col in submission.columns if col.startswith('submit_')]
submission[submit_cols].sum(axis = 1).value_counts()


In [None]:
# Final label: 1 only when at least two submit_* columns vote 1.
# With the two columns created above (submit_1, submit_2) that means both must agree on 1.
vote_cols = [col for col in submission.columns if col.startswith('submit_')]
vote_totals = submission[vote_cols].sum(axis=1)
submission[TARGET] = (vote_totals >= 2).astype(int)

# Share of rows labelled 1 in the final submission.
submission[TARGET].mean()

In [None]:
# Final File preparation: write only PassengerId and the voted label, without the index column
submission[['PassengerId', TARGET]].to_csv("Mstasko_final.csv", index = False)
# Histogram of the final labels as a quick check of the class balance
submission["Survived"].hist()