# Titanic, take #2
Learning from my previous try https://www.kaggle.com/lovroselic/titanic-ls <br>
Good points: <br>
* Joining train and test for EDA, and split again for modeling
* Survival groups, Family survival rate
* using a soft voting classifier, with a threshold of 0.65
* classifiers that are used: XGB, RF, SVM, ADA, GBC, SGD, ETC, DT,DNN, KNN, LR, LGBM

# Imports & config

In [None]:
import time
from datetime import datetime

#measure notebook running time
start_time = time.time()

%matplotlib inline

import os, warnings
import numpy as np 
from numpy.random import seed
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import metrics
import tensorflow as tf
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from category_encoders import MEstimateEncoder

# Global seaborn theme for every plot below: white style, notebook context, 10x8 default figure size.
sns.set(style='white', context='notebook', palette='deep', rc={'figure.figsize':(10,8)})
# Marker that the import/config cell ran to completion.
print("loaded ...")

In [None]:
# Reproducibility
def set_seed(sd=13):
    """Seed the random number generators this notebook relies on.

    Args:
        sd: Integer seed applied to Python's ``random`` module, NumPy and TensorFlow.
    """
    import random  # local import: the notebook's import cell does not load it

    random.seed(sd)
    # One call is enough: the imported `seed` alias is numpy.random.seed itself.
    np.random.seed(sd)
    tf.random.set_seed(sd)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # reaches child processes, not the hash randomisation of the running kernel.
    os.environ['PYTHONHASHSEED'] = str(sd)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed()

# Load and check data

In [None]:
# Load both Kaggle splits and stack them into one frame so EDA and feature
# engineering run on train and test together.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data['Survived'] = -1  # sentinel: test rows carry no label
train_data['Set'] = "Train"
test_data['Set'] = "Test"
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
DATA = pd.concat([train_data, test_data])
# Moves the per-file row numbers into an 'index' column and gives DATA a fresh
# 0..n-1 index, which later cells use both as label and as position.
DATA.reset_index(inplace=True)
DATA.dtypes

## Missing data
* 20% of age values missing, we don't care: using title!
    * Age was later used, and filled
* 77% of cabin values missing (that is itself a value of feature)

In [None]:
# Count of NaN values per column over the combined train+test frame.
print("Missing data:\n", DATA.isna().sum())

### Age

In [None]:
# Fill missing ages with the median age of passengers that share the same
# (SibSp, Parch) combination; groups without any known age fall back to the
# overall median. Vectorised replacement of the former per-row loop: inserting
# a group's own median into that group never moves its median, so the filled
# values are the same.
index_NaN_age = list(DATA.index[DATA["Age"].isnull()])
med_age = DATA.Age.median()
group_median_age = DATA.groupby(["SibSp", "Parch"])["Age"].transform("median")
DATA["Age"] = DATA["Age"].fillna(group_median_age).fillna(med_age)

In [None]:
# Age histogram (with KDE) in one panel per Survived value.
# NOTE(review): this plots the raw train_data frame, so the ages imputed into DATA
# above are not part of the picture -- confirm that is intended.
g = sns.FacetGrid(train_data,col="Survived")
g = g.map(sns.histplot, "Age", kde=True)
# Leave head-room above the panels for the figure title.
g.fig.subplots_adjust(top=0.8)
g.fig.suptitle("Age distribution over Survived");

# Feature selection & engineering

## Sex

In [None]:
# Mean of Survived per sex on the training rows (bar plot).
g = sns.catplot(x="Sex",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - Sex")

In [None]:
# Numeric sex flag: 1 for "female", 0 for everything else.
DATA['SexN'] = DATA.Sex.eq("female").astype("int64")

## Adult man

In [None]:
def extract(sex, age):
    """Return 1 for a male passenger older than 15, otherwise 0."""
    is_adult_man = (sex == 'male') and (age > 15)
    return 1 if is_adult_man else 0
    
# Flag adult men row by row from each passenger's sex and age.
DATA['AdultMan'] = [extract(sex, age) for sex, age in zip(DATA['Sex'], DATA['Age'])]

In [None]:
# Mean of Survived for adult men (1) versus everyone else (0) on the training rows.
g = sns.catplot(x="AdultMan",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
# Label fixed: it used to read "Sex", copied from the previous plot.
g = g.set_ylabels("survival probability - AdultMan")

## Age to categorical

In [None]:
def cut_age(age):
    """Bucket an age: 'child' up to 15, 'senior' from 60 on, 'adult' otherwise."""
    if age <= 15:
        label = 'child'
    elif age >= 60:
        label = 'senior'
    else:
        label = 'adult'
    return label

# Categorical age bucket for every passenger.
DATA['Age_Cat'] = DATA['Age'].map(cut_age)

In [None]:
# Mean of Survived per age bucket on the training rows.
g = sns.catplot(x="Age_Cat",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - Age_Cat")

## Information from name
* Title indicates both sex and age 

In [None]:
def get_title(sex,name):
    """Split a 'Surname, Title. Given names' string into (title, surname).

    Any title other than Mr, Miss, Mrs or Master is replaced by `sex`.
    """
    parts = name.split(",")
    surname = parts[0].strip(" ")
    # The title is the text between the first comma and the following period.
    title = parts[1].split('.')[0].strip(' ')
    if title in ('Mr', 'Miss', 'Mrs', 'Master'):
        return title, surname
    return sex, surname
    
    
# Run get_title on every row; result_type='expand' spreads the returned
# (title, surname) pair over the two new columns.
DATA[['Title', 'Surname']] = DATA[['Sex','Name']].apply(lambda row: get_title(*row), axis=1, result_type= 'expand')
DATA.head(10)

# Ad-hoc family look-ups kept for reference:
#DATA[DATA.Surname == 'Peacock'] #poor people, they all die
#DATA[DATA.Surname == 'Elias'] #family of men, doomed


### Probability of Survival based on title
* all rare titles grouped in male or female

In [None]:
# Mean of Survived per title on the training rows.
g = sns.catplot(x="Title",y="Survived", data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - Title")

## Family size
* I will bin it to categorical

In [None]:
# Family size = siblings/spouses + parents/children + the passenger.
DATA['FamilySize'] = DATA.SibSp + DATA.Parch + 1
# Mean of Survived per family size on the training rows.
g = sns.catplot(x="FamilySize",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - FamilySize")

In [None]:
def familySize_to_cat(size):
    """Map a family size to 'alone' (1), 'small' (2-4), 'medium' (5-7) or 'large' (above 7).

    A size that fits none of the buckets returns None.
    """
    if size == 1:
        return "alone"
    if 2 <= size <= 4:
        return 'small'
    if 5 <= size <= 7:
        return 'medium'
    if size > 7:
        return "large"
    return None

# Binned family size for every passenger.
DATA['FamilySizeCategory'] = DATA['FamilySize'].map(familySize_to_cat)

In [None]:
# Mean of Survived per family-size category on the training rows.
g = sns.catplot(x="FamilySizeCategory",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - FamilySizeCategory")

## SibSp, Parch

In [None]:
# Mean of Survived per SibSp value on the training rows.
g = sns.catplot(x="SibSp",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - SibSp")

In [None]:
# Mean of Survived per Parch value on the training rows.
g = sns.catplot(x="Parch",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - Parch")

## Embarked, missing values
* S is majority, fill with this

In [None]:
# Fill missing ports with "S". Assigning the result back avoids the in-place call
# on an attribute-accessed column, which pandas treats as chained assignment
# (a warning in 2.x, no effect on DATA under copy-on-write).
DATA['Embarked'] = DATA['Embarked'].fillna("S")

In [None]:
# Passenger counts per embarkation port on the training rows, split by Survived.
ax = sns.countplot(data = DATA[DATA.Set == 'Train'], x = 'Embarked', hue = "Survived");
ax.set_title("Embarked");

## Survival groups (SG) and FamilySurvivalRate (FSR)
* SG instead of FSR, works better:
* adult_male: prepare to die
* no_children: male doomed, female probably survives
* solo_kid: not necessarily alone (e.g. father + daughter: the daughter has a chance, the father not so much ...)
* group_survived: majority of the woman+children family group survived
* group_died: majority of the woman+children family group died
* added FSR back again, it's like duplicate , but numeric feature

In [None]:
# Assign every passenger a survival group (SG) and a family survival rate (FSR).
# `i` is an index label that is also used as a position (DATA.iloc[i]), so this
# loop depends on DATA having the 0..n-1 index created by reset_index.
for i in DATA.index:
    
    # adult male
    if DATA.iloc[i]['Sex'] == 'male' and DATA.iloc[i]['Age_Cat'] == 'adult':
        DATA.loc[DATA.index[i],'SG'] = "adult_male"
        # based on actual ratio
        DATA.loc[DATA.index[i],'FSR'] = 0.15
        #DATA.loc[DATA.index[i],'FSR'] = 0
        continue
        
    # "Family" = rows sharing both surname and ticket; this includes the passenger's own row.
    family = DATA[(DATA.Surname == DATA.iloc[i]["Surname"]) & (DATA.Ticket == DATA.iloc[i]["Ticket"])]
    kids = family[family.Age_Cat == 'child']
    N_kids = len(kids)
    
    # no_children
    if N_kids == 0:
        DATA.loc[DATA.index[i],'SG'] = "no_children"
        #DATA.loc[DATA.index[i],'FSR'] = 1
        # based on actual ratio
        DATA.loc[DATA.index[i],'FSR'] = 0.75
        continue
        
    # A single-row family with one child: the passenger is a child on their own surname/ticket.
    if N_kids == 1 and len(family) == 1:
        DATA.loc[DATA.index[i],'SG'] = "solo_kid"
         # based on actual ratio
        DATA.loc[DATA.index[i],'FSR'] = 0.62
        continue
    
    # Family members whose title is not 'Mr', and the known outcomes among them
    # (Survived == -1 marks unlabeled test rows).
    # NOTE(review): for a training passenger this list can contain their own label,
    # so SG/FSR may encode the target -- confirm this is acceptable.
    wc_group = family[family.Title != 'Mr']
    survived = wc_group[wc_group.Survived != -1]['Survived'].to_list()
    
    if len(survived) == 0:
        # No known outcome in the group: assume the worst.
        # in memoriam 'peacock family', is this cheating?
        DATA.loc[DATA.index[i],'SG'] = "group_died"
        DATA.loc[DATA.index[i],'FSR'] = 0
        continue
    
    # Majority vote over the known outcomes decides the group label.
    if np.mean(survived) >= 0.5:
        DATA.loc[DATA.index[i],'SG'] = "group_survived"
    else:
        DATA.loc[DATA.index[i],'SG'] = "group_died"
        
    # FSR is the observed survival share of the group.
    DATA.loc[DATA.index[i],'FSR'] = np.mean(survived)

In [None]:
# Passenger counts per survival group on the training rows, split by Survived.
ax = sns.countplot(data = DATA[DATA.Set == 'Train'], x = 'SG', hue = "Survived");
ax.set_title("SG");

In [None]:
# Share of each outcome among training passengers in the 'no_children' group.
no_children_outcomes = DATA.loc[(DATA.Set == 'Train') & (DATA.SG == 'no_children'), 'Survived']
no_children_outcomes.value_counts() / len(no_children_outcomes)

In [None]:
# Share of each outcome among training passengers in the 'solo_kid' group.
solo_kid_outcomes = DATA.loc[(DATA.Set == 'Train') & (DATA.SG == 'solo_kid'), 'Survived']
solo_kid_outcomes.value_counts() / len(solo_kid_outcomes)

In [None]:
# Share of each outcome among training passengers in the 'adult_male' group.
adult_male_outcomes = DATA.loc[(DATA.Set == 'Train') & (DATA.SG == 'adult_male'), 'Survived']
adult_male_outcomes.value_counts() / len(adult_male_outcomes)

## Pclass

In [None]:
# Mean of Survived per passenger class on the training rows.
g = sns.catplot(x="Pclass",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability")

## Fare
* fare needs to be divided by family size
* some values are zero and one is missing; I will fill these with the median of the corresponding Pclass
* distribution is skewed, I will apply log

In [None]:
# Per-person fare: the ticket fare divided by the family size.
DATA['RealFare'] = DATA.Fare / DATA.FamilySize

# Rows whose per-person fare is zero or whose original fare is missing.
index_zero = list(DATA['RealFare'][(DATA['RealFare'] == 0) | (DATA['Fare'].isna())].index)
for i in index_zero:
    # Median is recomputed on every pass, so it already reflects rows replaced earlier in this loop.
    med_fare = DATA['RealFare'][(DATA.Pclass == DATA.iloc[i]['Pclass'])].median()
    DATA.loc[DATA.index[i],'RealFare'] = med_fare

In [None]:
# Per-person fare distribution on the training rows (log x-axis), coloured by Pclass.
sns.histplot(data = DATA[DATA.Set == 'Train'], x='RealFare', stat='percent', hue='Pclass', kde=True, log_scale=True);

### Fare to Bins

In [None]:
# Summary statistics; the result is not assigned, and only a cell's last expression is displayed.
DATA['RealFare'].describe()
# Bin the per-person fare at 5, 10, 25, 50 and 100; include_lowest puts a fare of exactly 0 into the first bin.
DATA['FareBins'] = pd.cut(DATA['RealFare'], [0, 5,10, 25, 50, 100, np.inf], labels = ['0-5', '5-10', '10-25', '25-50', '50-100', '>100'], include_lowest = True)
DATA['FareBins'].value_counts()

In [None]:
# Passenger counts per fare bin on the training rows, split by Survived.
sns.countplot(data = DATA[DATA.Set == 'Train'], x='FareBins',  hue='Survived');

In [None]:
# Per-person fare distribution on the training rows (log x-axis), coloured by Survived.
sns.histplot(data = DATA[DATA.Set == 'Train'], x='RealFare', stat='percent', hue='Survived', kde=True, log_scale=True);

In [None]:
def _log_or_zero(value):
    """Natural log of a positive value; 0 for anything else."""
    if value > 0:
        return np.log(value)
    return 0

# Replace the skewed per-person fare by its logarithm.
DATA['RealFare'] = DATA['RealFare'].map(_log_or_zero)

## Female, no children, pclass=3

In [None]:
def extractData(sex,fare,sg):
    """Return 1 for a female in the 'no_children' group whose fare is not above 10, else 0."""
    qualifies = sex == "female" and sg == 'no_children' and not fare > 10
    return 1 if qualifies else 0

# RealFare was log-transformed in an earlier cell, so comparing it with extractData's
# fare limit of 10 could never exclude anyone (ln(fare) > 10 needs a fare above
# ~22,000). Undo the log so the limit applies to the per-person fare again; the
# rounding absorbs the float error of the exp(log(x)) round trip.
fare_per_person = np.exp(DATA['RealFare']).round(6)
DATA['FemaleClass3Nochildren'] = [
    extractData(sex, fare, sg)
    for sex, fare, sg in zip(DATA['Sex'], fare_per_person, DATA['SG'])
]
#DATA['FemaleClass3Nochildren'].value_counts()

In [None]:
# Mean of Survived for the FemaleClass3Nochildren flag on the training rows.
g = sns.catplot(x="FemaleClass3Nochildren",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - Female, no children pclass 3")

## Cabin -> Deck
* some cabin data can be acquired from other family members, but only in two cases, so it is not worth it
* so X if person does not have cabin

In [None]:
def get_deck(cabin):
    """Return the deck letter of a cabin string, or 'X' when no deck is known.

    Missing cabins (NaN, None or any other non-string value), empty strings
    and cabins starting with 'T' all map to 'X'.
    """
    # Type check instead of `cabin is np.nan`: the identity test only matches the
    # np.nan singleton, so any other NaN object or None crashed on .startswith.
    if not isinstance(cabin, str) or not cabin or cabin.startswith('T'):
        return 'X'
    return cabin[0]
    
# Overwrite the Cabin column with its deck letter.
DATA['Cabin'] = DATA['Cabin'].map(get_deck)

In [None]:
# Passenger counts per deck on the training rows.
g = sns.countplot(x="Cabin",data=DATA[DATA.Set == 'Train'],palette = "muted");

In [None]:
# Mean of Survived per deck on the training rows.
g = sns.catplot(x="Cabin",y="Survived",data=DATA[DATA.Set == 'Train'], kind="bar", height = 6, palette = "muted")
g = g.set_ylabels("survival probability - Cabin")

## Remaining numeric features

In [None]:
# Correlation matrix of Survived and the listed numeric features, training rows only.
fig, ax = plt.subplots(figsize=(10,10)) 
ax = sns.heatmap(DATA[DATA.Set == 'Train'][['Survived',"SibSp","Parch", "FSR","Age","RealFare","FamilySize","SexN"]].corr(),annot=True, fmt = ".2f", cmap = "coolwarm");
ax.set_title("Survival correlation to numeric features");

### Scale numeric features

In [None]:
# Standardise the numeric features in place; the scaler is fitted on the combined train+test rows.
#mm = MinMaxScaler()
mm = StandardScaler()
scale_features = ["SibSp","Parch", "Age","RealFare","FamilySize"]
DATA[scale_features] = mm.fit_transform(DATA[scale_features])

## PCA

In [None]:
# Project the selected features onto 3 principal components (fitted on all rows of DATA).
pca_features = ['Age',"RealFare","FamilySize", "FSR", "SexN", 'AdultMan']
pca = PCA(3)
X_PCA = pca.fit_transform(DATA.loc[:, pca_features])
# Name the components PC1..PCn and wrap them in a frame.
component_names = [f"PC{i+1}" for i in range(X_PCA.shape[1])]
X_PCA = pd.DataFrame(X_PCA, columns=component_names)
X_PCA.head()

In [None]:
# Components plus the label, restricted to labeled rows (Survived == -1 marks test rows).
DATA_PCA = X_PCA.copy()
DATA_PCA['Survived'] = DATA['Survived']
DATA_PCA=DATA_PCA[DATA_PCA['Survived'] != -1]
pca.explained_variance_ratio_

In [None]:
%%time
def scatterplot(x,y,**kwargs):
    """Draw a regression plot of y against x on the current axes; extra kwargs are ignored."""
    sns.regplot(x=x,y=y)
    _=plt.xticks(rotation=90)

# Long format: one row per (passenger, component), then one panel per component.
f = pd.melt(DATA_PCA, id_vars=['Survived'], value_vars=component_names)
g = sns.FacetGrid(f, col="variable",  col_wrap=4, sharex=False, sharey=True, height=5)
g = g.map(scatterplot, "value", "Survived")

In [None]:
# Left panel: explained variance ratio per component; right panel: its cumulative sum.
fig, axs = plt.subplots(1, 2)
n = pca.n_components_
grid = np.arange(1, n + 1)
# Explained variance
evr = pca.explained_variance_ratio_
axs[0].bar(grid, evr)
axs[0].set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))
# Cumulative Variance
cv = np.cumsum(evr)
# Prepend (0, 0) so the curve starts at the origin.
axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
axs[1].set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))
# Set up figure
fig.set(figwidth=8, dpi=100);

In [None]:
# Weight of each original feature in each principal component.
loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=DATA.loc[:, pca_features].columns,  # and the rows are the original features
    )
loadings

## Clustering

In [None]:
%%time
# Two-cluster KMeans on the training rows, kept next to the label for inspection.
clustering_features = ['Age',"RealFare","FamilySize", "FSR", "SexN", 'AdultMan']

kmeans = KMeans(n_clusters = 2, random_state=13)
clust_data = DATA[DATA.Set == 'Train'].loc[:, clustering_features]
clust_data['cluster'] = kmeans.fit_predict(clust_data)
# Categorical dtype so the cluster id is treated as a discrete hue in plots.
clust_data['cluster'] = clust_data['cluster'].astype('category')
clust_data['Survived'] = DATA[DATA.Set == 'Train']['Survived']

In [None]:
%%time
# One panel per clustering feature: feature value against Survived, coloured by cluster.
sns.relplot(data = clust_data.melt(value_vars=clustering_features, id_vars = ["Survived", "cluster"]), x="value", y="Survived", hue='cluster', col= "variable", col_wrap=8, height=3);

In [None]:
# Refit KMeans on all rows (train+test); fit_transform returns one column per cluster centre.
clust_data = DATA.loc[:, clustering_features]
X_CD = kmeans.fit_transform(clust_data)
# NOTE: this refits the same `mm` scaler object that scaled the numeric features earlier.
X_CD = mm.fit_transform(X_CD)
X_CD = pd.DataFrame(X_CD, columns=[f"Centroid_{i}" for i in range(X_CD.shape[1])])

## Target Encoding

In [None]:
# Columns to target-encode, and a 25 % sample of the training rows used to fit the encoder.
encode_features = ['Sex','AdultMan','FemaleClass3Nochildren',"Cabin",'FareBins','Pclass','FamilySizeCategory','Age_Cat','Embarked',"SG"]
X_encode = DATA[DATA.Set == 'Train'].sample(frac=0.25, random_state=13)
# pop removes the label column from X_encode and returns it.
y_encode = X_encode.pop("Survived")
X_encode[encode_features].head()

In [None]:
# Fit the encoder on the sample, then encode every row of DATA (sampled rows included).
encoder = MEstimateEncoder(cols=encode_features,m=1.5)
enc_cols = ["TE_"+ f for f in encode_features]
encoder.fit(X_encode, y_encode)
ENC = encoder.transform(DATA.drop("Survived", axis=1))
# Keep only the encoded columns and prefix them with "TE_".
X_ENC = ENC[encode_features]
X_ENC.columns = enc_cols;
X_ENC.head()

# Dropping & Encoding features

### Save features for review

In [None]:
# Snapshot of the raw (pre-dummy) columns, taken before they are dropped or encoded below.
#train
_Embarked = DATA[DATA.Set == 'Train'].Embarked
_PClass = DATA[DATA.Set == 'Train'].Pclass
_Cabin = DATA[DATA.Set == 'Train'].Cabin
_Title = DATA[DATA.Set == 'Train'].Title
_Family = DATA[DATA.Set == 'Train'].FamilySizeCategory
_Sex = DATA[DATA.Set == 'Train'].Sex
_Age = DATA[DATA.Set == 'Train'].Age
_Fare = DATA[DATA.Set == 'Train'].FareBins
_SG = DATA[DATA.Set == 'Train'].SG
_FSR = DATA[DATA.Set == 'Train'].FSR

#test
T_Embarked = DATA[DATA.Set == 'Test'].Embarked
T_PClass = DATA[DATA.Set == 'Test'].Pclass
T_Cabin = DATA[DATA.Set == 'Test'].Cabin
T_Title = DATA[DATA.Set == 'Test'].Title
T_Family = DATA[DATA.Set == 'Test'].FamilySizeCategory
T_Sex = DATA[DATA.Set == 'Test'].Sex
T_Age = DATA[DATA.Set == 'Test'].Age
T_Fare = DATA[DATA.Set == 'Test'].FareBins
T_SG = DATA[DATA.Set == 'Test'].SG
T_FSR = DATA[DATA.Set == 'Test'].FSR

## Dropping
* drop selected in this run and see how the models behave
* some features were explored but ultimately not used, as they are duplicates of other features

In [None]:
# Features excluded from this particular run.
drop_this_run = ["SibSp","Parch","FamilySize"] #does not contribute

In [None]:
# Columns that are always removed, dropped together with this run's exclusions.
redundant_features = ['Ticket', "Name",'Surname','Fare', "Sex"]

DATA.drop([*redundant_features,*drop_this_run], inplace = True, axis = 1)
#DATA.dtypes

### Categorical to dummies

In [None]:
# One-hot encode the categorical columns, skipping any that were dropped for this run.
categorical = ['Cabin','Pclass','FamilySizeCategory','Title','Embarked', "SG", "Age_Cat", 'FareBins']
categorical = [cat for cat in categorical if cat not in drop_this_run]
DATA = pd.get_dummies(DATA,columns=categorical)

## Join

In [None]:
# Attach the centroid-distance, PCA and target-encoded columns; join aligns them on the row index.
DATA = DATA.join([X_CD,X_PCA, X_ENC])

## Features

In [None]:
# Correlation heatmap over every numeric/boolean feature of the training rows.
fig, ax = plt.subplots(figsize=(20,20))
# DATA still holds the string column 'Set'; DataFrame.corr() raises on non-numeric
# columns from pandas 2.0 on, so restrict to numeric and boolean columns first.
train_numeric = DATA[DATA.Set == 'Train'].select_dtypes(include=['number', 'bool'])
g = sns.heatmap(train_numeric.corr(),annot=False, fmt = ".2f", cmap = "coolwarm")

---

# Split the data

In [None]:
# Separate the combined frame back into the labeled and the unlabeled Kaggle split.
TRAIN = DATA[DATA.Set == 'Train']
TEST = DATA[DATA.Set == 'Test']
# Passenger ids of the test rows, kept aside before the column is dropped.
PassengerIds = TEST.PassengerId.to_list()
TEST = TEST.drop(['PassengerId','Set',"Survived",'index'], axis = 1)
y = TRAIN.Survived
X = TRAIN.drop(['Survived','PassengerId','Set','index'], axis=1)
# 75/25 stratified hold-out split of the labeled rows.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 13, stratify=y)

In [None]:
# First rows of the labeled feature matrix.
X.head()

In [None]:
# First rows of the unlabeled feature matrix.
TEST.head()

---

# Models

In [None]:
def plot_CM(model, clf):
    """Plot prediction-normalised confusion matrices of `model` on the train and hold-out split.

    Args:
        model: Fitted classifier exposing `predict` and `classes_`.
        clf: Text used as the figure title.
    """
    fig, axes = plt.subplots(1, 2, figsize=(6,6))
    fig.suptitle(clf)
    splits = (('Train', X_train, y_train), ('Test', X_test, y_test))
    for axis, (caption, split_X, split_y) in zip(axes, splits):
        matrix = confusion_matrix(split_y, model.predict(split_X), normalize = 'pred', labels = model.classes_)
        panel = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=model.classes_)
        panel.plot(ax=axis)
        panel.ax_.set_title(caption)
    plt.show()

## DNN

In [None]:
def plot_loss(loss,val_loss):
    """Plot the training and validation loss curves over the epochs."""
    plt.figure()
    for curve in (loss, val_loss):
        plt.plot(curve)
    plt.title('Model loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()

def plot_accuracy(acc,val_acc):
    """Plot the training and validation accuracy curves over the epochs."""
    plt.figure()
    for curve in (acc, val_acc):
        plt.plot(curve)
    plt.title('Model accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
# Fully connected binary classifier: ReLU layers of shrinking width, each followed by dropout.
dnn_model = Sequential()
n_cols = X.shape[1]
dnn_model.add(Input(shape = (n_cols,), name = 'input'))
# (units, dropout rate) of each hidden stage, widest first
hidden_stages = [(1024, 0.25), (512, 0.25), (256, 0.2), (128, 0.2), (64, 0.2), (32, 0.2), (16, 0.1)]
for units, drop_rate in hidden_stages:
    dnn_model.add(Dense(units, activation="relu"))
    dnn_model.add(Dropout(drop_rate))
# Two-unit bottleneck, batch-normalised, in front of the sigmoid output.
dnn_model.add(Dense(2, activation="relu"))
dnn_model.add(BatchNormalization())
dnn_model.add(Dense(1, activation="sigmoid", name='out'))
dnn_model.summary()

In [None]:
# Diagram of the network with the tensor shapes of every layer.
tf.keras.utils.plot_model(dnn_model, show_shapes=True)

In [None]:
%%time
# Train with early stopping, restore the best checkpoint, then report train vs hold-out accuracy.
dnn_model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-03), metrics=['binary_accuracy'])
# Stop once validation accuracy has not improved for 25 epochs.
early_stopping_monitor = EarlyStopping(patience=25, monitor='val_binary_accuracy')
# Keep only the epoch with the best validation accuracy on disk.
checkpoint = ModelCheckpoint("weights.hdf5", monitor = 'val_binary_accuracy', save_best_only = True)
# validation_split=0.25 was removed: Keras ignores it whenever validation_data is given,
# so validation has always run on (X_test, y_test).
# NOTE(review): the same hold-out set selects the checkpoint and is scored below,
# so the reported test accuracy is optimistic.
dnn_model.fit(X_train,y_train, validation_data=(X_test,y_test), callbacks=[checkpoint, early_stopping_monitor], epochs=300, batch_size=96, verbose=0)
dnn_model.load_weights("weights.hdf5")

plot_loss(dnn_model.history.history['loss'], dnn_model.history.history['val_loss'])
plot_accuracy(dnn_model.history.history['binary_accuracy'], dnn_model.history.history['val_binary_accuracy'])

# evaluate returns [loss, binary_accuracy]; only the accuracy is kept.
_, train_dnn_accuracy = dnn_model.evaluate(X_train, y_train)
_, dnn_accuracy = dnn_model.evaluate(X_test, y_test)
print('Train accuracy: {:.2f} %'.format(train_dnn_accuracy*100))
print('Accuracy: {:.2f} %'.format(dnn_accuracy*100))
print('Overfit: {:.2f} % '.format((train_dnn_accuracy - dnn_accuracy)*100))

In [None]:
# Confusion matrices of the DNN; the sigmoid outputs are rounded to 0/1 class labels.
fig, axes = plt.subplots(1, 2, figsize=(6,6))
fig.suptitle('DNN CM')
for axis, (caption, split_X, split_y) in zip(axes, (('Train', X_train, y_train), ('Test', X_test, y_test))):
    matrix = confusion_matrix(split_y, np.rint(dnn_model.predict(split_X)), normalize = 'pred', labels = [0,1])
    panel = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0,1])
    panel.plot(ax=axis)
    panel.ax_.set_title(caption)
plt.show()

## Random Forest

In [None]:
# %%time
# clf = RandomForestClassifier(random_state = 13, n_jobs=-1)
# param_grid = {'n_estimators': [25, 50,75,150,300, 500],'max_depth': [*range(3,13), None], 'max_features': [*np.arange(0.5,1.0,0.1),'auto','sqrt',"log2"],
#              'bootstrap': [True]}
# rf_grid_clf = GridSearchCV(clf, param_grid, cv=4, scoring= "accuracy")
# rf_grid_clf.fit(X, y)
# print(rf_grid_clf.best_estimator_)
# print(rf_grid_clf.best_params_)

# rf_accuracy = rf_grid_clf.best_score_
# print(rf_accuracy)

In [None]:
# Random forest with fixed hyper-parameters; report train/hold-out accuracy and their gap.
rf_model = RandomForestClassifier(
    max_depth=11,
    n_estimators=50,
    n_jobs=-1,
    random_state=13,
)
rf_model.fit(X_train, y_train)
rf_train_score = rf_model.score(X_train, y_train)
rf_accuracy = rf_model.score(X_test, y_test)
print(f"Train: {rf_train_score * 100:.2f} %")
print(f"Test: {rf_accuracy*100:.2f} %")
print(f"Overfit: {(rf_train_score-rf_accuracy)*100:.2f} %")

In [None]:
# Train vs hold-out confusion matrices for the random forest.
plot_CM(rf_model, "Random Forest")

In [None]:
# Rank the forest's feature importances and keep the features scoring above 0.03.
features = dict(zip(X_train.columns, rf_model.feature_importances_))

importances = pd.DataFrame({"RF":features}).sort_values("RF", ascending = False)
RF_best_features = importances.index[importances.RF > 0.03].tolist()
importances.plot.bar()
print("RF_best_features:",RF_best_features, len(RF_best_features))

## XGBoost

In [None]:
# %%time
# xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state = 13)
# xgb_param_grid = {'max_depth': [*range(4, 10), None],
#                   'learning_rate': [0.001, 0.01, 0.1, 0.05],
#                   'subsample': np.arange(0.6,1.0,0.1),
#                   'colsample_bytree': np.arange(0.2,1.0,0.1),
#                   'reg_alpha':[0.001, 0.01, 0.1],
#                   'reg_lambda': [0.01, 1.0, 10.0, 100.0, 1000, 10000],
#                   'n_estimators': [50, 100, 250, 500, 1000]
#                  }

# #xgb_grid = GridSearchCV(estimator=xgb_clf, param_grid = xgb_param_grid, cv=4, scoring= "accuracy")
# xgb_grid = RandomizedSearchCV(estimator=xgb_clf, param_distributions = xgb_param_grid, cv=4, scoring= "accuracy", random_state = 13)
# xgb_grid.fit(X,y)

# print(xgb_grid.best_params_)
# xgb_accuracy = xgb_grid.best_score_
# print(xgb_accuracy)

In [None]:
# XGBoost with fixed hyper-parameters; report train/hold-out accuracy and their gap.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    learning_rate=0.1,
    colsample_bytree = 0.4,
    subsample= 0.9,
    reg_lambda= 1000,
    reg_alpha = 0.01,
    n_estimators= 100,
    max_depth = None,
)

xgb_model.fit(X_train, y_train)
xgb_train_score = xgb_model.score(X_train, y_train)
xgb_accuracy = xgb_model.score(X_test, y_test)
print(f"Train: {xgb_train_score*100:.2f} %")
print(f"Test: {xgb_accuracy*100:.2f} %")
print(f"Overfit: {(xgb_train_score-xgb_accuracy)*100:.2f} %")

In [None]:
# Train vs hold-out confusion matrices for XGBoost.
plot_CM(xgb_model, "XGB")

In [None]:
# Features used by the booster, ordered by descending gain.
gain_by_feature = xgb_model.get_booster().get_score(importance_type="gain")
ranked_names = sorted(gain_by_feature, key=gain_by_feature.get, reverse=True)
weights = [(gain_by_feature[name], name) for name in ranked_names]
XGB_features = [name for _, name in weights]
print("XGB best features", XGB_features)

## SVM

In [None]:
# %%time
# param_grid = {'C': [1, 100, 1000, 10000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],'kernel': ['rbf']}
# SVM_grid = GridSearchCV(estimator = SVC(),param_grid=param_grid, cv=5, scoring= "accuracy")
# SVM_grid.fit(X,y)
# print(SVM_grid.best_params_)
# SVM_accuracy = SVM_grid.best_score_
# print(SVM_accuracy)

In [None]:
# RBF support vector classifier (with probability estimates); report train/hold-out accuracy and their gap.
SVM_model = SVC(
    C = 100,
    gamma= 0.001,
    kernel='rbf',
    probability=True,
    random_state = 13,
)
SVM_model.fit(X_train, y_train)
svm_train_score = SVM_model.score(X_train, y_train)
SVM_accuracy = SVM_model.score(X_test, y_test)
print(f"Train: {svm_train_score*100:.2f} %")
print(f"Test: {SVM_accuracy*100:.2f} %")
print(f"Overfit: {(svm_train_score - SVM_accuracy)*100:.2f} %")

In [None]:
# Train vs hold-out confusion matrices for the SVM.
plot_CM(SVM_model, "SVM")

## Logistic Regression

In [None]:
# %%time
# param_grid = {'C': np.logspace(-4, 4, 10), 'penalty': ['l2','l1'], 'solver': ['liblinear']}
# #LR_grid = GridSearchCV(estimator = LogisticRegression(solver='liblinear'), param_grid=param_grid, cv=4, scoring= "accuracy", random_state = 13)
# LR_grid = GridSearchCV(estimator = LogisticRegression(random_state = 13), param_grid=param_grid, cv=4, scoring= "accuracy", )
# LR_grid.fit(X,y)
# print(LR_grid.best_params_)
# LR_accuracy = LR_grid.best_score_
# print(LR_accuracy)

In [None]:
# L2-regularised logistic regression; report train/hold-out accuracy and their gap.
LR_model = LogisticRegression(
    solver='liblinear',
    C=0.005994842503189409,
    penalty='l2',
    random_state = 13,
)

LR_model.fit(X_train, y_train)
LR_train_score = LR_model.score(X_train, y_train)
LR_accuracy = LR_model.score(X_test, y_test)
print(f"Train: {LR_train_score*100:.2f} %")
print(f"Test: {LR_accuracy*100:.2f} %")
print(f"Overfit: {(LR_train_score-LR_accuracy)*100:.2f} %")

In [None]:
# Train vs hold-out confusion matrices for logistic regression.
plot_CM(LR_model, "Logistic Regression")

## KNN

In [None]:
# grid_params = {'n_neighbors': range(2,20), 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski']}
# KNN_grid = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = grid_params, cv=5, scoring= "accuracy")
# KNN_grid.fit(X,y)
# print(KNN_grid.best_params_)
# KNN_accuracy = KNN_grid.best_score_
# print(KNN_accuracy)

In [None]:
# 12-nearest-neighbour classifier with Manhattan distance; report train/hold-out accuracy and their gap.
KNN_model = KNeighborsClassifier(
    n_neighbors=12,
    metric='manhattan',
    weights='uniform',
)
KNN_model.fit(X_train, y_train)
KNN_train_score = KNN_model.score(X_train, y_train)
KNN_accuracy = KNN_model.score(X_test, y_test)
print(f"Train: {KNN_train_score*100:.2f} %")
print(f"Test: {KNN_accuracy*100:.2f} %")
print(f"Overfit: {(KNN_train_score-KNN_accuracy)*100:.2f} %")

In [None]:
# Train vs hold-out confusion matrices for KNN.
plot_CM(KNN_model, "KNN")

## ADA Boost

In [None]:
# %%time
# abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
# parameters = {'base_estimator__max_depth':[3, 4, 10, None],
#               'base_estimator__min_samples_split':range(3,10),
#               'base_estimator__min_samples_leaf':range(3,10),
#               'base_estimator__max_features':['auto','sqrt',0.75, 0.9],
#               'n_estimators':[50, 100,250,500],
#               'learning_rate':[0.001, 0.01, 0.1, 0.5]}

# ADA_grid = RandomizedSearchCV(estimator=abc, param_distributions = parameters, cv=4, scoring= "accuracy", random_state = 13, verbose=0)
# ADA_grid.fit(X,y)
# print(ADA_grid.best_params_)
# ADA_accuracy = ADA_grid.best_score_
# print(ADA_accuracy)

In [None]:
# AdaBoost over shallow decision trees; report train/hold-out accuracy and their gap.
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn; positional works on both.
# max_features='sqrt' is what the removed 'auto' setting meant for classifiers.
ADA_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3, min_samples_leaf=6, max_features='sqrt', min_samples_split= 3),
    n_estimators=250,
    learning_rate=0.001,
    random_state=13,
)
ADA_model.fit(X_train,y_train)
ADA_train_score = ADA_model.score(X_train, y_train)
ADA_accuracy = ADA_model.score(X_test, y_test)
print("Train: {:.2f} %".format(ADA_train_score*100))
print("Test: {:.2f} %".format(ADA_accuracy*100))
print('Overfit: {:.2f} %'.format((ADA_train_score - ADA_accuracy)*100))

In [None]:
# Train vs hold-out confusion matrices for AdaBoost.
plot_CM(ADA_model, "ADA Boost")

In [None]:
# Rank AdaBoost's feature importances and keep the features scoring above 0.03.
features = dict(zip(X_train.columns, ADA_model.feature_importances_))

importances = pd.DataFrame({"ADA":features}).sort_values("ADA", ascending = False)
ADA_best_features = importances.index[importances.ADA > 0.03].tolist()
importances.plot.bar()
print("ADA_best_features:",ADA_best_features, len(ADA_best_features))

## ExtraTrees

In [None]:
# %%time
# ExtC = ExtraTreesClassifier(random_state = 13)
# ex_param_grid = {"max_depth": [*range(3,10),None],
#               'max_features':['auto',0.9, 0.8, 0.75,'sqrt', 'log2'],
#               "n_estimators" :[50,100,300, 500],
#                 'bootstrap': [False, True],
#                 "min_samples_split": range(2,10),
#                  "min_samples_leaf": range(1,10),
#                  'criterion': ['gini', 'entropy']
#                 }

# gsExtC = RandomizedSearchCV(estimator=ExtC, param_distributions = ex_param_grid, cv=4, scoring= "accuracy", random_state = 13, verbose=0)
# gsExtC.fit(X,y)
# print(gsExtC.best_estimator_)
# ExtC_accuracy = gsExtC.best_score_
# print(ExtC_accuracy)

In [None]:
# Extra-trees ensemble with fixed hyper-parameters; report train/hold-out accuracy and their gap.
ETC_model = ExtraTreesClassifier(
    bootstrap=True,
    max_depth=8,
    max_features=0.9,
    min_samples_leaf=7,
    min_samples_split=8,
    n_estimators=300,
    random_state=13,
)
ETC_model.fit(X_train, y_train)
ETC_train_score = ETC_model.score(X_train, y_train)
ETC_accuracy = ETC_model.score(X_test, y_test)
print(f"Train: {ETC_train_score*100:.2f} %")
print(f"Test: {ETC_accuracy*100:.2f} %")
print(f"Overfit: {(ETC_train_score-ETC_accuracy)*100:.2f} %")

In [None]:
# Train vs hold-out confusion matrices for extra trees.
plot_CM(ETC_model, "Extra Trees")

In [None]:
# Rank the extra-trees feature importances and keep the features scoring above 0.03.
features = dict(zip(X_train.columns, ETC_model.feature_importances_))

importances = pd.DataFrame({"ETC":features}).sort_values("ETC", ascending = False)
ETC_best_features = importances.index[importances.ETC > 0.03].tolist()
importances.plot.bar()
print("ETC_best_features:",ETC_best_features, len(ETC_best_features))

## Gradient Boost (GBC)

In [None]:
# %%time
# GBC = GradientBoostingClassifier(random_state = 13)
# gbc_param_grid = {
#               'n_estimators' : [25, 50, 100,250, 500, 1000],
#               'learning_rate': [0.5, 0.1, 0.05, 0.01, 0.001],
#               'max_depth': [*range(3,10,1), None],
#               'max_features': [*np.arange(0.3,1.0,0.1),'auto','sqrt',"log2"]
#               }

# gsGBC = RandomizedSearchCV(estimator=GBC, param_distributions = gbc_param_grid, cv=4, scoring= "accuracy", random_state = 13, verbose=0)
# gsGBC.fit(X,y)
# print(gsGBC.best_estimator_)
# gbc_accuracy = gsGBC.best_score_
# print(gbc_accuracy)

In [None]:
# Gradient boosting with fixed hyper-parameters; report train/hold-out accuracy and their gap.
GBC_model = GradientBoostingClassifier(
    learning_rate=0.001,
    max_depth=8,
    max_features=0.3,
    n_estimators=500,
    random_state=13,
)
GBC_model.fit(X_train, y_train)
GBC_train_score = GBC_model.score(X_train, y_train)
GBC_accuracy = GBC_model.score(X_test, y_test)
print(f"Train: {GBC_train_score*100:.2f} %")
print(f"Test: {GBC_accuracy*100:.2f} %")
print(f"Overfit: {(GBC_train_score - GBC_accuracy)*100:.2f} %")

In [None]:
# Train vs hold-out confusion matrices for gradient boosting.
plot_CM(GBC_model, "GradientBoosting")

In [None]:
# Rank the gradient-boosting feature importances and keep the features scoring above 0.03.
features = dict(zip(X_train.columns, GBC_model.feature_importances_))

importances = pd.DataFrame({"GBC":features}).sort_values("GBC", ascending = False)
GBC_best_features = importances.index[importances.GBC > 0.03].tolist()
importances.plot.bar()
print("GBC_best_features:",GBC_best_features, len(GBC_best_features))

## SGD Classifier

In [None]:
# %%time
# params = {
#     "loss" : ["modified_huber"],
#     "alpha" : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
#     "penalty": ['l2', 'l1', 'elasticnet'],
#     "l1_ratio": np.arange(0.0, 1.0, 0.05),
#     "epsilon": [0.1, 0.01, 1]
# }

# sgd_grid = GridSearchCV(estimator =  SGDClassifier(max_iter=1000, random_state=13, n_jobs=-1), param_grid=params, cv=4, scoring="accuracy")
# sgd_grid.fit(X,y)

# print(sgd_grid.best_estimator_)
# sgd_accuracy = sgd_grid.best_score_
# print(sgd_accuracy)

In [None]:
# Linear model trained with SGD. The 'modified_huber' loss is kept from the
# commented-out grid above, where it is the only loss searched.
SGD_model = SGDClassifier(
    loss='modified_huber',
    alpha=1,
    l1_ratio=0.0,
    n_jobs=-1,
    random_state=13,
).fit(X_train, y_train)

SGD_train_score = SGD_model.score(X_train, y_train)
SGD_accuracy = SGD_model.score(X_test, y_test)

# Train accuracy, test accuracy, and their gap, all as percentages.
for _fmt, _score in (
    ("Train: {:.2f} %", SGD_train_score),
    ("Test: {:.2f} %", SGD_accuracy),
    ('Overfit: {:.2f} %', SGD_train_score - SGD_accuracy),
):
    print(_fmt.format(_score * 100))

In [None]:
# plot_CM is defined outside this section; presumably it draws the confusion
# matrix of the given model, using the string as the plot label — confirm.
plot_CM(SGD_model, "StochasticGradientDescent")

## Decision Tree

In [None]:
# %%time
# params = {
#     "criterion": ['gini', 'entropy'],
#     "splitter":['best', 'random'],
#     "max_depth": [*range(3,12), None],
#     'max_features': [*np.arange(0.3,1.0,0.1),'auto','sqrt',"log2"]
# }

# dt_grid = RandomizedSearchCV(estimator =  DecisionTreeClassifier(random_state=13), param_distributions=params, cv=4, scoring="accuracy")
# dt_grid.fit(X,y)

# print(dt_grid.best_estimator_)
# dt_accuracy = dt_grid.best_score_
# print(dt_accuracy)

In [None]:
# Single decision tree with fixed hyper-parameters (every value lies inside the
# commented-out search grid above; presumably the best estimator it reported).
DT_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_features=0.5,
    random_state=13,
).fit(X_train, y_train)

DT_train_score = DT_model.score(X_train, y_train)
DT_accuracy = DT_model.score(X_test, y_test)

# Train accuracy, test accuracy, and their gap, all as percentages.
for _fmt, _score in (
    ("Train: {:.2f} %", DT_train_score),
    ("Test: {:.2f} %", DT_accuracy),
    ('Overfit: {:.2f} %', DT_train_score - DT_accuracy),
):
    print(_fmt.format(_score * 100))

In [None]:
# plot_CM is defined outside this section; presumably it draws the confusion
# matrix of the given model, using the string as the plot label — confirm.
plot_CM(DT_model, "DecisionTreeClassifier")

In [None]:
# Pair every training column with its decision-tree importance, rank the
# result from most to least important, and keep the features scoring above 0.03.
features = dict(zip(X_train.columns, DT_model.feature_importances_))

importances = pd.DataFrame({"DT": features}).sort_values("DT", ascending=False)
DT_best_features = importances.loc[importances["DT"] > 0.03].index.tolist()
importances.plot.bar()
print("DT_best_features:", DT_best_features, len(DT_best_features))

## LGBM

In [None]:
# %%time
# lgbm_clf = lgb.LGBMClassifier(random_state=13)
# lgbm_param_grid = {
#                 'max_depth': [4, 10, None],
#                   'learning_rate': [0.001, 0.01, 0.1, 0.05],
#                   'subsample': np.arange(0.7, 0.95, 0.05),
#                   'colsample_bytree': np.arange(0.5, 1.0, 0.1),
#                   'reg_alpha':[0.001, 0.01, 0.1],
#                   'reg_lambda': [0.01, 1.0, 10.0, 100.0, 1000, 10000],
#                   'n_estimators': [50, 100, 250, 500, 1000]
#                  }

# lgbm_grid = RandomizedSearchCV(estimator=lgbm_clf, param_distributions = lgbm_param_grid, cv=4, scoring= "accuracy", random_state = 13)
# lgbm_grid.fit(X,y)

# print(lgbm_grid.best_params_)
# lgbm_accuracy = lgbm_grid.best_score_
# print(lgbm_accuracy)

In [None]:
%%time
# LightGBM with fixed hyper-parameters (every value lies inside the
# commented-out search grid above; presumably the best parameters it reported).
# NOTE(review): LightGBM only applies row subsampling when subsample_freq is
# non-zero; it is not set here, so subsample=0.9 may have no effect — confirm.
LGBM_model = lgb.LGBMClassifier(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=None,
    subsample=0.9,
    colsample_bytree=0.6,
    reg_alpha=0.01,
    reg_lambda=100,
    random_state=13,
).fit(X_train, y_train)

LGBM_train_score = LGBM_model.score(X_train, y_train)
LGBM_accuracy = LGBM_model.score(X_test, y_test)

# Train accuracy, test accuracy, and their gap, all as percentages.
for _fmt, _score in (
    ("Train: {:.2f} %", LGBM_train_score),
    ("Test: {:.2f} %", LGBM_accuracy),
    ('Overfit: {:.2f} %', LGBM_train_score - LGBM_accuracy),
):
    print(_fmt.format(_score * 100))

In [None]:
# plot_CM is defined outside this section; presumably it draws the confusion
# matrix of the given model, using the string as the plot label — confirm.
plot_CM(LGBM_model, "LGBM Classifier")

In [None]:
# Pair every training column with its LightGBM importance, rank the result
# from most to least important, and keep the features scoring above 0.03.
# NOTE(review): the 0.03 cut-off is shared with the scikit-learn models; check
# that LightGBM importances are on a comparable scale.
features = dict(zip(X_train.columns, LGBM_model.feature_importances_))

importances = pd.DataFrame({"LGBM": features}).sort_values("LGBM", ascending=False)
LGBM_best_features = importances.loc[importances["LGBM"] > 0.03].index.tolist()
importances.plot.bar()
print("LGBM_best_features:", LGBM_best_features, len(LGBM_best_features))

### Top X features

In [None]:
# Side-by-side table of each model's selected features. DataFrame columns need
# equal length, so every list is cut to the length of the shortest one (L).
_selected = {
    "ADA": ADA_best_features,
    "XGB": XGB_features,
    "RF": RF_best_features,
    "ETC": ETC_best_features,
    "GBC": GBC_best_features,
    "DT": DT_best_features,
    "LGBM": LGBM_best_features,
}
L = min(len(_names) for _names in _selected.values())

TF = pd.DataFrame({_model: _names[:L] for _model, _names in _selected.items()})
TF

## Scores

In [None]:
# Print every model's accuracy variable with three decimals, framed by rules.
_rule = "=========================================================="
print("Accuracy Scores:")
print(_rule)
for _fmt, _score in (
    ("DNN: {:.3f}", dnn_accuracy),
    ("RandomForest: {:.3f}", rf_accuracy),
    ("XGBoost classifier: {:.3f}", xgb_accuracy),
    ("SVM classifier: {:.3f}", SVM_accuracy),
    ("LR classifier: {:.3f}", LR_accuracy),
    ("KNN classifier: {:.3f}", KNN_accuracy),
    ("ADA Boost classifier: {:.3f}", ADA_accuracy),
    ("Extra Tree classifier: {:.3f}", ETC_accuracy),
    ("Gradient Boosting classifier: {:.3f}", GBC_accuracy),
    ("Stochastic Gradient descent: {:.3f}", SGD_accuracy),
    ("Decision Tree classifier: {:.3f}", DT_accuracy),
    ("LGBM classifier: {:.3f}", LGBM_accuracy),
):
    print(_fmt.format(_score))
print(_rule)

---

# Predictions

In [None]:
class DNN_wrapper:
    """Adapter exposing ``predict`` / ``predict_proba`` on top of a model that only has ``predict``.

    The wrapped model's ``predict`` output is treated as the positive-class
    score (assumes a 2-D array with one column per row of input — confirm
    against the network's output layer).
    """

    def __init__(self, model):
        """Keep a reference to the wrapped model; only its ``predict`` method is called."""
        self.model = model

    def predict(self, df):
        """Return hard labels: the first output column rounded to the nearest integer."""
        rounded = np.rint(self.model.predict(df))
        return rounded[:, 0]

    def predict_proba(self, df):
        """Return the scores as two columns: ``1 - score`` first, ``score`` second."""
        positive = self.model.predict(df)
        negative = np.subtract(1, positive)
        return np.concatenate((negative, positive), axis=1)
    
# Wrap dnn_model (defined earlier in the notebook) so it offers predict and
# predict_proba and can sit in the `models` list next to the other classifiers.
DNN_model = DNN_wrapper(dnn_model)

In [None]:
# Ensemble members as (column label, fitted model) pairs; `model_names` and
# `models` are derived from the same list so their order always matches.
_ensemble = [
    ("XGB", xgb_model),
    ("RF", rf_model),
    ("SVM", SVM_model),
    ("LR", LR_model),
    ("KNN", KNN_model),
    ("ADA", ADA_model),
    ("ETC", ETC_model),
    ("GBC", GBC_model),
    ("SGD", SGD_model),
    ("DT", DT_model),
    ("LGBM", LGBM_model),
    ("DNN", DNN_model),
]
models = [_estimator for _label, _estimator in _ensemble]
model_names = [_label for _label, _estimator in _ensemble]
print("using", len(models), "classifiers")

---

### Threshold

In [None]:
# Soft-voting cut-off ("SVC" in this notebook = soft voting classifier):
# a passenger is predicted as survived only when the mean predicted
# probability across all models is strictly greater than TRESHOLD.
# Values tried so far ("sub score" is presumably the Kaggle submission score — confirm):
#TRESHOLD = 0.50001 #sub score 0.79425
#TRESHOLD = 0.549 #sub score 0.79904
#TRESHOLD = 0.6 #sub score 0.79904
#TRESHOLD = 0.6 #with standard scaler, sub score 0.79665
TRESHOLD = 0.65 #sub score 0.80861

# Hard-voting cut-off applied to the mean of the models' 0/1 predictions;
# sitting just above 0.5, it resolves an exact tie to 0.
HVC_TRESHOLD = 0.500001

----------------

# Hard Voting Classifier - VC

* equal weights

In [None]:
%%time
# One column of hard predictions per model, then an unweighted vote:
# 'Vote' is the share of models predicting 1 for that passenger.
ALL_PREDICTIONS = pd.DataFrame({'PassengerId': PassengerIds})
for _name, _model in zip(model_names, models):
    ALL_PREDICTIONS[_name] = _model.predict(TEST)
ALL_PREDICTIONS['Vote'] = ALL_PREDICTIONS[model_names].mean(axis=1)
# Final label is 1 only when the vote share is strictly above HVC_TRESHOLD.
ALL_PREDICTIONS['Predict'] = (ALL_PREDICTIONS['Vote'] > HVC_TRESHOLD).astype('int64')
vc_predictions = ALL_PREDICTIONS['Predict']
ALL_PREDICTIONS.head(10)

In [None]:
# Pairwise correlation between the models' hard predictions, annotated per cell.
fig, ax = plt.subplots(figsize=(12, 12))
g = sns.heatmap(ALL_PREDICTIONS[model_names].corr(), annot=True, ax=ax)

## Soft Voting - SVC

In [None]:
%%time


# One column per model holding column 1 of predict_proba (the positive-class
# probability), then the row-wise median and mean across models.
SVC_ALL_PREDICTIONS = pd.DataFrame({'PassengerId': PassengerIds})
for _name, _model in zip(model_names, models):
    SVC_ALL_PREDICTIONS[_name] = _model.predict_proba(TEST)[:, 1]
_probabilities = SVC_ALL_PREDICTIONS[model_names]
SVC_ALL_PREDICTIONS['MedianVote'] = _probabilities.median(axis=1)
SVC_ALL_PREDICTIONS['SoftVote'] = _probabilities.mean(axis=1)
# Final label uses the mean: 1 only when it is strictly above TRESHOLD.
SVC_ALL_PREDICTIONS['Predict'] = (SVC_ALL_PREDICTIONS['SoftVote'] > TRESHOLD).astype('int64')
svc_predictions = SVC_ALL_PREDICTIONS['Predict']
SVC_ALL_PREDICTIONS.head(10)

In [None]:
# Pairwise correlation between the models' predicted probabilities, annotated per cell.
fig, ax = plt.subplots(figsize=(12, 12))
g = sns.heatmap(SVC_ALL_PREDICTIONS[model_names].corr(), annot=True, ax=ax)

## Prediction comparison

In [None]:
# Hard-vote and soft-vote labels next to each other, one row per test passenger.
COMP_PREDICTIONS = pd.DataFrame({'PassengerId': PassengerIds}).assign(
    HVC=vc_predictions,
    SVC=svc_predictions,
)
COMP_PREDICTIONS.head(20)

## Checking Train Scores - HVC

In [None]:
# Repeat the hard vote on the training features X so that the misclassified
# passengers can be inspected next to a few descriptive columns.
TRAIN_PREDICTIONS = pd.DataFrame({
    'Survived': train_data.Survived,
    'Fare': _Fare,
    "Title": _Title,
    "PClass": _PClass,
})
for _name, _model in zip(model_names, models):
    TRAIN_PREDICTIONS[_name] = _model.predict(X)
TRAIN_PREDICTIONS['Vote'] = TRAIN_PREDICTIONS[model_names].mean(axis=1)
TRAIN_PREDICTIONS['VC'] = (TRAIN_PREDICTIONS['Vote'] > HVC_TRESHOLD).astype('int64')

# Rows where the vote disagrees with the label; show the close calls (vote share in [0.4, 0.6]).
wrong = TRAIN_PREDICTIONS.loc[TRAIN_PREDICTIONS['Survived'] != TRAIN_PREDICTIONS['VC']]
wrong[wrong['Vote'].between(0.4, 0.6)].head(10)

## Checking Train Scores - SVC

In [None]:
# Repeat the soft vote on the training features X so that the misclassified
# passengers can be inspected next to a few descriptive columns.
SVC_TRAIN_PREDICTIONS = pd.DataFrame({
    'Survived': train_data.Survived,
    'Fare': _Fare,
    "Title": _Title,
    "PClass": _PClass,
    "Sex": _Sex,
})
for _name, _model in zip(model_names, models):
    SVC_TRAIN_PREDICTIONS[_name] = _model.predict_proba(X)[:, 1]

_probabilities = SVC_TRAIN_PREDICTIONS[model_names]
SVC_TRAIN_PREDICTIONS['MedianVote'] = _probabilities.median(axis=1)
SVC_TRAIN_PREDICTIONS['SoftVote'] = _probabilities.mean(axis=1)
SVC_TRAIN_PREDICTIONS['SVC'] = (SVC_TRAIN_PREDICTIONS['SoftVote'] > TRESHOLD).astype('int64')

# Misclassified rows whose mean probability lies in [0.35, 0.65], split by true label:
# WS = actual survivors (highest SoftVote first), WD = actual non-survivors (lowest first).
wrong = SVC_TRAIN_PREDICTIONS.loc[SVC_TRAIN_PREDICTIONS['Survived'] != SVC_TRAIN_PREDICTIONS['SVC']]
_borderline = wrong[wrong['SoftVote'].between(0.35, 0.65)]
WS = _borderline[_borderline['Survived'] == 1].sort_values("SoftVote", ascending=False)
WD = _borderline[_borderline['Survived'] == 0].sort_values("SoftVote", ascending=True)

In [None]:
# Misclassified actual survivors with SoftVote in [0.35, 0.65], highest SoftVote first.
WS.head(10)

In [None]:
# Misclassified actual non-survivors with SoftVote in [0.35, 0.65], lowest SoftVote first.
WD.head(20)

---

In [None]:
# Share of training rows on which each model, and the hard vote ('VC'), matches
# the true label. Each value is a one-element list so the frame has a single row.
train_scores = {
    _clf: [
        int((TRAIN_PREDICTIONS.Survived == TRAIN_PREDICTIONS[_clf]).sum())
        / len(TRAIN_PREDICTIONS)
    ]
    for _clf in [*model_names, 'VC']
}

TRAIN_SCORES = pd.DataFrame(train_scores)
TRAIN_SCORES

In [None]:
# Bar chart of the per-classifier training accuracy computed above; the
# trailing ';' keeps the notebook from echoing the returned Axes object.
TRAIN_SCORES.plot.bar();

In [None]:
# Choose which prediction vector goes into the submission: keep exactly one
# `output = ...` line active. The soft-voting predictions are currently used.
#output = pd.DataFrame({'PassengerId': PassengerIds, 'Survived': xgb_predictions})
#output = pd.DataFrame({'PassengerId': PassengerIds, 'Survived': ada_predictions})
# output = pd.DataFrame({'PassengerId': PassengerIds, 'Survived': dnn_predictions})
#output = pd.DataFrame({'PassengerId': PassengerIds, 'Survived': vc_predictions})
#output = pd.DataFrame({'PassengerId': PassengerIds, 'Survived': rf_predictions})
output = pd.DataFrame({'PassengerId': PassengerIds, 'Survived': svc_predictions})
#svc_predictions

# Preview the first rows of the submission frame.
output.head(10)

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so it contains only the PassengerId and Survived columns.
#output
output.to_csv('submission.csv', index=False)
print("Submission was successfully saved!")

In [None]:
# Report the wall-clock time elapsed since `start_time` (set in the imports
# cell at the top of the notebook) together with the finish timestamp.
end_time = time.time()
print("Notebook run time: {:.1f} seconds. Finished at {}".format(end_time - start_time, datetime.now()) )