In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf

## importing data

In [None]:
train = pd.read_csv("../input/spaceship-titanic/train.csv")
test = pd.read_csv("../input/spaceship-titanic/test.csv",index_col="PassengerId")
submission = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

In [None]:
for col in train.columns:
    print(f"{col}'s nunique values: {train[col].nunique()}")
    

In [None]:
categorical_columns = [col for col in train.columns if train[col].nunique()<20]

In [None]:
categorical_columns

In [None]:
num_columns = [col for col in train.columns if train[col].dtype=="float64"]

In [None]:
num_columns

# EDA

In [None]:
def show_na(df):
    """Draw a heatmap of the missing cells of ``df`` and print per-column null counts."""
    null_mask = df.isnull()
    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')
    plt.show()
    print(null_mask.sum())
show_na(train)

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Transported',data=train)

In [None]:
plt.figure(figsize=(8,4))
sns.set_style('whitegrid')
sns.countplot(x='Transported',hue='HomePlanet',data=train)

In [None]:
plt.figure(figsize=(8,4))
sns.set_style('whitegrid')
sns.countplot(x='Transported',hue='VIP',data=train)

In [None]:
plt.figure(figsize=(8,4))
train['Age'].hist(bins=30,color='darkred')

In [None]:
plt.figure(figsize=(8,4))
sns.countplot(x='CryoSleep',data=train)

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='VIP',y='Age',data=train)

In [None]:
train["Destination"].value_counts()

In [None]:
plt.figure(figsize=(8,4))
sns.countplot(x="Destination",data=train,hue="Transported")

In [None]:
plt.figure(figsize=(32,6))
sns.countplot(x="Age",data=train[train["Age"]>40],hue="Transported")

In [None]:
plt.figure(figsize=(32,6))
sns.countplot(x="Age",data=train[(train["Age"]>=21) & (train["Age"]<=40) ],hue="Transported")

In [None]:
plt.figure(figsize=(32,6))
sns.countplot(x="Age",data=train[train["Age"]<21],hue="Transported")

sns.pairplot(data=train[num_columns+["Transported"]],hue="Transported")

In [None]:
train[["Age", 'HomePlanet']].groupby( 'HomePlanet').mean().values

# FEATURE ENGINEERING

## Age nan values

In [None]:
def age(column):
    """Return a passenger's age, imputing missing values from the home-planet mean.

    Intended for ``df[['Age', 'HomePlanet']].apply(age, axis=1)``.

    Parameters
    ----------
    column : pandas.Series
        A row whose first element is ``Age`` and second element is ``HomePlanet``.

    Returns
    -------
    float
        The original age when present; otherwise the mean ``Age`` of the
        passenger's ``HomePlanet`` in the global ``train`` frame, or the
        overall ``train`` mean age when the home planet is missing or unseen.
    """
    passenger_age = column.iloc[0]
    home_planet = column.iloc[1]

    if not pd.isnull(passenger_age):
        return passenger_age

    # HomePlanet holds planet names at this stage, so look the mean up by
    # label instead of comparing against integer codes.
    planet_means = train.groupby('HomePlanet')['Age'].mean()
    if pd.notnull(home_planet) and home_planet in planet_means.index:
        return planet_means[home_planet]
    return train['Age'].mean()

In [None]:
train['Age'] = train[['Age','HomePlanet']].apply(age,axis=1)
test['Age'] = test[['Age','HomePlanet']].apply(age,axis=1)

In [None]:
show_na(train)

In [None]:
train[train["HomePlanet"].isnull()]

## Extracting surnames from names

In [None]:
def surname(df):
    """Add a ``Surname`` column holding the second whitespace-separated token of ``Name``.

    Rows whose ``Name`` is missing (not a string) or has fewer than two tokens
    get ``NaN``. ``df`` is modified in place; nothing is returned.
    """
    def _second_token(name):
        # Missing names arrive as float NaN and cannot be split.
        if not isinstance(name, str):
            return np.nan
        tokens = name.split()
        return tokens[1] if len(tokens) > 1 else np.nan

    df["Surname"] = [_second_token(name) for name in df["Name"]]
surname(train)
surname(test)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
train[train['Cabin'].isnull()]

In [None]:
train[['RoomService','Cabin']].groupby("Cabin").mean()

In [None]:
show_na(train)

In [None]:
train[train["CryoSleep"]==True]

## 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' nan values

In [None]:
num_columns

In [None]:
# Age is imputed separately above; guard the removal so that re-running this
# cell does not raise ValueError once 'Age' is already gone.
if 'Age' in num_columns:
    num_columns.remove('Age')

In [None]:
num_columns

In [None]:
train[num_columns] = train[num_columns].fillna(0)
test[num_columns] = test[num_columns].fillna(0)

## Cryosleep nan values

In [None]:
nan_Cryo_train = train[train["CryoSleep"].isnull()].index
nan_Cryo_test = test[test["CryoSleep"].isnull()].index
nan_Cryo_train,nan_Cryo_test

In [None]:
def fill_cryo(df,nan_index):
    """Impute missing ``CryoSleep`` flags from the passenger's spending.

    A passenger listed in ``nan_index`` is marked as being in cryo-sleep
    (``True``) when all five amenity columns equal 0, and ``False`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``CryoSleep`` and the five amenity columns; modified in place.
    nan_index : index-like
        Row labels of ``df`` whose ``CryoSleep`` value should be filled.
    """
    if len(nan_index) == 0:
        return
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    spent_nothing = df.loc[nan_index, spend_cols].eq(0).all(axis=1)
    # Single .loc assignment on the frame: the chained form
    # df["CryoSleep"].loc[row] = ... may write to a temporary copy.
    # astype(object) keeps plain Python bools in the object column.
    df.loc[nan_index, "CryoSleep"] = spent_nothing.astype(object)

fill_cryo(train,nan_Cryo_train)
fill_cryo(test,nan_Cryo_test)

In [None]:
show_na(train)

## VIP nan values

In [None]:
train.drop("Name",axis=1,inplace=True)
test.drop("Name",axis=1,inplace=True)

In [None]:
train[train["VIP"].notna()].describe().T

In [None]:
train[train["VIP"]==True].describe().T

In [None]:
train[train["VIP"]==False].describe().T

In [None]:
nan_vip_train = train[train["VIP"].isnull()].index
nan_vip_test = test[test["VIP"].isnull()].index
nan_vip_train,nan_vip_test

In [None]:
train[train["VIP"]==False].describe().T["75%"]

In [None]:
def fill_vip(df,nan_index):
    """Impute missing ``VIP`` flags by comparing against non-VIP passengers.

    A passenger listed in ``nan_index`` is marked VIP (``True``) only when the
    age and every amenity spend are strictly above the 75th percentile of the
    rows of ``df`` whose ``VIP`` is ``False``; otherwise ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``VIP``, ``Age`` and the five amenity columns; modified in place.
    nan_index : index-like
        Row labels of ``df`` whose ``VIP`` value should be filled.
    """
    if len(nan_index) == 0:
        return
    feature_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    # Thresholds are addressed by column name rather than by position in
    # describe(), so column order or extra numeric columns cannot shift them.
    thresholds = df.loc[df["VIP"].eq(False), feature_cols].quantile(0.75)
    exceeds_all = df.loc[nan_index, feature_cols].gt(thresholds, axis="columns").all(axis=1)
    df.loc[nan_index, "VIP"] = exceeds_all.astype(object)

fill_vip(train,nan_vip_train)
fill_vip(test,nan_vip_test)

In [None]:
show_na(train)

## Splitting cabin features 

In [None]:
train[train["Cabin"].notna()]

In [None]:
import re

In [None]:
re.split('/', train["Cabin"][0])

In [None]:
train.index

In [None]:
def split_cabin(df):
    """Split ``Cabin`` ("deck/num/side") into ``Deck``, ``Num`` and ``Side`` columns.

    ``Deck`` and ``Side`` hold the text parts and ``Num`` the numeric part (as
    float so that it can carry NaN). Rows whose cabin is missing, has fewer
    than three ``/``-separated parts, or has a non-numeric number get NaN in
    all three columns. ``df`` is modified in place; nothing is returned.
    """
    parts = (df["Cabin"].astype(object)
             .str.split("/", expand=True)
             .reindex(columns=[0, 1, 2]))
    cabin_num = pd.to_numeric(parts[1], errors="coerce")
    # A row is usable only when all three pieces were recovered.
    valid = parts[0].notna() & cabin_num.notna() & parts[2].notna()
    df["Deck"] = parts[0].where(valid)
    df["Num"] = cabin_num.where(valid)
    df["Side"] = parts[2].where(valid)
    
split_cabin(train)
split_cabin(test)   

In [None]:
train.drop("Cabin",axis=1,inplace=True)
test.drop("Cabin",axis=1,inplace=True)

In [None]:
train

In [None]:
train[train["Deck"].notna()]

In [None]:
# Column-less frames whose MultiIndex lists the (Surname, value) pairs seen in
# train, most frequent pair first (value_counts sorts by descending count).
# iloc[:, :0] drops the count column whatever pandas names it (0 or "count").
deck_keys = train[["Surname","Deck"]].value_counts().to_frame().iloc[:, :0]
side_keys = train[["Surname","Side"]].value_counts().to_frame().iloc[:, :0]
num_keys = train[["Surname","Num"]].value_counts().to_frame().iloc[:, :0]

In [None]:
num_keys

In [None]:
train["Deck"]

In [None]:
show_na(train)

In [None]:
row = 100
train["Deck"].loc[row] , deck_keys.loc[train["Surname"].loc[row]].index[0]

In [None]:
train["Deck"].value_counts().index[0]

In [None]:
def _fill_from_surname(df, column, keys):
    """Fill NaNs in ``df[column]`` from a surname lookup, then with the column's mode.

    ``keys`` is a frame whose MultiIndex levels are named ``Surname`` and
    ``column``. For every surname the first listed value is used; the key
    frames are built with ``value_counts()``, so that is the surname's most
    frequent value. Rows whose surname is unknown fall back to the most
    frequent value of ``df[column]``. ``df`` is modified in place.
    """
    per_surname = (keys.index.to_frame(index=False)
                   .drop_duplicates(subset="Surname")
                   .set_index("Surname")[column])
    missing = df[column].isnull()
    df.loc[missing, column] = df.loc[missing, "Surname"].map(per_surname)
    observed = df[column].value_counts()
    if not observed.empty:
        # for remaining nan values
        df[column] = df[column].fillna(observed.index[0])


def fill_deck(df):
    """Fill missing ``Deck`` values using the global ``deck_keys`` lookup."""
    _fill_from_surname(df, "Deck", deck_keys)


def fill_side(df):
    """Fill missing ``Side`` values using the global ``side_keys`` lookup."""
    _fill_from_surname(df, "Side", side_keys)


def fill_num(df):
    """Fill missing ``Num`` values using the global ``num_keys`` lookup."""
    _fill_from_surname(df, "Num", num_keys)
    
    
fill_deck(train)
fill_deck(test)
fill_side(train)
fill_side(test)
fill_num(train)
fill_num(test)

In [None]:
show_na(train)

In [None]:
train[train["Surname"].isnull()]

In [None]:
def check(df,deck,num):
    """Return the ``Surname`` values of rows in ``df`` located in cabin (deck, num).

    Only rows with a non-null ``Surname`` are considered; the result is a
    Series (possibly empty) that keeps the original row labels.
    """
    named = df[df["Surname"].notna()]
    in_cabin = (named["Deck"] == deck) & (named["Num"] == num)
    return named[in_cabin]["Surname"]

In [None]:
type(check(test,"G",1490.0).values[0]) == str

In [None]:
type(check(train,train["Deck"].loc[row],train["Num"].loc[row]).values[0]) == str

In [None]:
check(train,train["Deck"].loc[row],train["Num"].loc[row]).values[0]

In [None]:
import random

## Surname nan values

In [None]:

def fill_surname(df):
    """Fill missing ``Surname`` values from cabin mates, else with a random known surname.

    For each row of ``df`` lacking a surname, the row's own ``Deck``/``Num``
    is looked up (via ``check``) first in the global ``train`` frame and then
    in the global ``test`` frame; the first surname found is used. When
    nobody with a known surname shares the cabin, a surname is drawn at random
    from all surnames present in ``train`` and ``test``.

    ``df`` is modified in place and a one-line summary is printed.
    """
    missing_rows = df.index[df["Surname"].isnull()]
    total = len(missing_rows)
    # Candidate pool for the random fallback; built once since filling rows
    # never introduces a surname that is not already in the pool.
    surname_pool = (list(train["Surname"].value_counts().index)
                    + list(test["Surname"].value_counts().index))
    num = 0
    for row in missing_rows:
        # Read the cabin from df itself: row labels of one frame are not
        # valid labels of the other.
        deck = df.at[row, "Deck"]
        cabin_num = df.at[row, "Num"]
        found = None
        for source in (train, test):
            mates = check(source, deck, cabin_num)
            if len(mates) > 0:
                found = mates.values[0]
                break
        if found is None:
            found = random.choice(surname_pool)
            num +=1
        df.at[row, "Surname"] = found
    print(f"total nan surname values: {total} {num} surname values filled with random choice {total-num} surname values filled from train or test ")
fill_surname(train)
fill_surname(test)

In [None]:
show_na(train)

In [None]:
train["Destination"].value_counts()

In [None]:
train["HomePlanet"].value_counts()

In [None]:
train[train["Destination"].notna()][train[train["Destination"].notna()]["CryoSleep"]==True]["Destination"].value_counts()

In [None]:
train[train["Destination"].notna()][train[train["Destination"].notna()]["CryoSleep"]==True]["HomePlanet"].value_counts()

Potential course of the spaceship: Earth → Europa → Mars (home planets) → PSO J318.5-22 → 55 Cancri e → TRAPPIST-1e (destinations)

In [None]:
train[train["Destination"].isnull()]

In [None]:
train[train["Destination"].notna()]

In [None]:
show_na(train)

## HomePlanet and Destination nan values

In [None]:
def fill_homePlanet_destination(df):
    """Fill missing ``HomePlanet`` and ``Destination`` values with a random known category.

    Each missing cell receives a value drawn uniformly from the distinct
    categories of that column in the global ``train`` frame. ``df`` is
    modified in place. The draw uses the ``random`` module, so results vary
    between runs unless ``random.seed`` is set beforehand.
    """
    for name in ["HomePlanet","Destination"]:
        # Filling never adds a new category, so the choices can be listed once.
        choices = list(train[name].value_counts().index)
        for row in df.index[df[name].isnull()]:
            # Assign through df.loc so the write reaches df itself rather than
            # a temporary column copy.
            df.loc[row, name] = random.choice(choices)
                
fill_homePlanet_destination(train)
fill_homePlanet_destination(test)

In [None]:
show_na(train),show_na(test)

In [None]:
train.head()

In [None]:
test.head()

# Dummies

In [None]:
train.set_index("PassengerId",inplace=True)


In [None]:
train["Num"]=train["Num"].astype(int)
test["Num"]=test["Num"].astype(int)


In [None]:
dict(zip(list(train[col].unique()),[0,1]))

In [None]:
train["Side"].nunique()

In [None]:
def get_dummies(df):
    """Encode boolean, binary and low-cardinality text columns numerically.

    Per column:
    * boolean values (Python or NumPy) -> cast to int (False=0, True=1);
    * exactly two distinct values -> mapped to 0/1 in sorted order, so two
      frames holding the same two values always get the same encoding;
    * text column with 3 to 9 distinct values -> replaced by one-hot columns
      named ``<col>_<value>``;
    * anything else is left unchanged.

    ``df`` is modified in place for the first two cases and has the one-hot
    source columns dropped; the returned frame additionally carries the
    one-hot columns, so callers must use the return value.
    """
    drop_cols = []
    dummies = []

    for col in df.columns:
        series = df[col]
        n_unique = series.nunique()
        # Positional access: the index may not contain the label 0.
        first = series.iloc[0] if len(series) else None
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))

        if isinstance(first, (bool, np.bool_)):
            df[col] = series.astype(int)
            print( col,"bool " )
        elif n_unique == 2:
            print(col,n_unique,"nunique: 2")
            levels = list(series.dropna().unique())
            try:
                levels.sort()
            except TypeError:
                # Mixed, non-comparable types: fall back to a stable text order.
                levels.sort(key=str)
            df[col] = series.map({levels[0]: 0, levels[1]: 1})
        elif 2 < n_unique < 10 and is_text:
            print(col,n_unique ,"dummy")
            dummies.append(pd.get_dummies(series,prefix=col))
            drop_cols.append(col)

    df.drop(drop_cols,axis=1,inplace=True)
    if dummies:
        df = pd.concat([df] + dummies,axis=1)
    return df
train = get_dummies(train)
test = get_dummies(test)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.drop("Surname",axis=1,inplace=True)
test.drop("Surname",axis=1,inplace=True)

In [None]:
train

# Statistical Features

In [None]:
def new_feats(df, spend_cols=("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck")):
    """Add row-wise summary statistics of the amenity spending columns to ``df``.

    The spending columns are selected by name, so train and test always use
    the same five features and the target column can never slip into the
    statistics because of its position in the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``spend_cols``; modified in place.
    spend_cols : sequence of str, optional
        Columns aggregated row-wise (default: the five amenity columns).
        The three pairwise comparison flags always use the amenity columns
        named in their definitions.

    Adds the columns ``mean``, ``std``, ``sum``, ``var`` (pandas defaults,
    i.e. sample std/var), ``ptp`` (max - min), ``logsumexp``, ``euc``
    (Euclidean norm) and the 0/1 flags ``spavsvrdeck``, ``rmsvsshp``,
    ``rmsvsfc``.
    """
    # Snapshot the inputs first so columns added below never feed back in.
    spend = df[list(spend_cols)].astype(float)
    values = spend.to_numpy()

    df["mean"] = spend.mean(axis=1).values
    df["std"] = spend.std(axis=1).values
    df["sum"] = spend.sum(axis=1).values
    df["var"] = spend.var(axis=1).values
    # np.ptp as a function: the ndarray.ptp method no longer exists in NumPy 2.
    df["ptp"] = np.ptp(values, axis=1)
    df["logsumexp"] = np.logaddexp.reduce(values, axis=1)
    df["euc"] = np.linalg.norm(values, axis=1)
    df["spavsvrdeck"] = (df["Spa"]>df["VRDeck"]).astype(int)
    df["rmsvsshp"] = (df["RoomService"]>df["ShoppingMall"]).astype(int)
    df["rmsvsfc"] = (df["RoomService"]>df["FoodCourt"]).astype(int)
new_feats(train)
new_feats(test)

In [None]:
train.head()

In [None]:
test.head()

# Training model

In [None]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
from xgboost import XGBClassifier


In [None]:
train

## learning rate decay, callbacks

In [None]:
lr = 0.001
epochs = 50
def scheduler(epoch, lr, warmup_epochs=10, decay=0.024):
    """Learning-rate schedule: constant warm-up, then exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.
    lr : float
        Learning rate used in the previous epoch.
    warmup_epochs : int, optional
        Number of initial epochs during which ``lr`` is returned unchanged.
    decay : float, optional
        Decay constant; after warm-up each epoch multiplies ``lr`` by
        ``exp(-decay)``.

    Returns
    -------
    float
        The learning rate for ``epoch``. A plain Python float (not a tensor)
        is returned after warm-up so that the value can be plotted and passed
        to callbacks that require a float.
    """
    if epoch < warmup_epochs:
        return lr
    return float(lr * np.exp(-decay))


def plot_lr_decay(epochs,lr):
    """Plot the learning rate produced by ``scheduler`` over ``epochs`` epochs.

    Starting from ``lr``, the schedule is applied once per epoch; a dashed red
    vertical line is drawn at epoch 9, spanning from the final learning rate
    up to the initial one.
    """
    initial_lr = lr
    epoch_axis = np.arange(0,epochs)

    schedule = []
    current = lr
    for step in epoch_axis:
        current = scheduler(step,current)
        schedule.append(current)
    schedule = np.array(schedule)

    plt.figure(figsize=(8,4))
    plt.plot(epoch_axis,schedule)
    plt.vlines(x=10-1,linestyles="--",colors="r",ymin=schedule[-1],ymax=initial_lr)
    plt.xlabel("epochs")
    plt.ylabel("learning rate")
    plt.title("learning rate decay")
    plt.show()

class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as a monitored metric exceeds a threshold.

    Parameters
    ----------
    threshold : float, optional
        Value the monitored metric must exceed to stop training (default 0.88).
    monitor : str, optional
        Key looked up in the epoch ``logs`` (default ``"val_acc"``).
    """

    def __init__(self, threshold=0.88, monitor="val_acc"):
        super().__init__()
        self.threshold = threshold
        self.monitor = monitor

    def on_epoch_end(self, epoch, logs=None):
        """Set ``self.model.stop_training`` when the monitored metric is above the threshold."""
        current = (logs or {}).get(self.monitor)
        if current is None:
            # Metric not reported (e.g. no validation data): nothing to compare.
            return

        if current > self.threshold:
            # Stop if threshold is met
            print(f"{self.monitor} is reached {self.threshold}  so cancelling training!")
            print("")
            self.model.stop_training = True
            

stopCallback = myCallback()
earlyStop = tf.keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)
lrDecay = tf.keras.callbacks.LearningRateScheduler(scheduler)
plot_lr_decay(epochs,lr)

## defining model

In [None]:
def make_model(input_shape):
    """Build and compile the binary-classification network.

    Architecture: Dense(32, relu) -> Dense(32, relu) -> Dense(10, relu)
    -> Dropout(0.15) -> Dense(1, sigmoid). Compiled with Adam (learning rate
    taken from the global ``lr``), binary cross-entropy loss and the ``acc``
    metric.

    Parameters
    ----------
    input_shape : int or tuple of int
        Number of input features, or the per-sample input shape. A bare
        integer ``n`` is normalised to ``(n,)``.

    Returns
    -------
    tf.keras.Sequential
        The compiled model, named ``"N_Nets"``.
    """
    if isinstance(input_shape, (int, np.integer)):
        # Keras expects a shape tuple; accept the feature count for convenience.
        input_shape = (int(input_shape),)

    layers = [
        tf.keras.Input(input_shape),
        tf.keras.layers.Dense(32,activation="relu"),
        tf.keras.layers.Dense(32,activation="relu"),
        tf.keras.layers.Dense(10,activation="relu"),
        tf.keras.layers.Dropout(0.15),
        tf.keras.layers.Dense(1,activation="sigmoid"),
    ]
    model = tf.keras.Sequential(layers,name = "N_Nets")
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer = optimizer , loss = "binary_crossentropy", metrics = ["acc"])
    return model

## loss and accuracy report

In [None]:
def plot_loss_acc(history,y_true,prediction):
    """Plot learning rate, loss and accuracy curves of one training run side by side.

    ``history`` is the object returned by ``model.fit``; its ``history`` dict
    must contain ``lr``, ``loss``, ``val_loss``, ``acc`` and ``val_acc``.
    ``y_true`` and ``prediction`` are accepted but not used.
    """
    logs = history.history
    _,(lr_ax, loss_ax, acc_ax) = plt.subplots( ncols=3,nrows=1,figsize=(20,3))

    # (axis, y label, title, series drawn on that axis)
    panels = [
        (lr_ax, "learning rate", "final lr %1.4f"%(logs["lr"][-1]),
         {"learning rate": "lr"}),
        (acc_ax, "acc", "final val_acc %1.4f"%(logs["val_acc"][-1]),
         {"acc": "acc", "val_acc": "val_acc"}),
        (loss_ax, "loss", "final val_loss %1.4f"%(logs["val_loss"][-1]),
         {"loss": "loss", "val_loss": "val_loss"}),
    ]
    for axis, y_label, title, series in panels:
        axis.set_xlabel("epochs")
        axis.set_ylabel(y_label)
        axis.set_title(title)
        curves = pd.DataFrame([logs[key] for key in series.values()],
                              index=list(series.keys())).T
        curves.plot(ax=axis)
    plt.show()

## confusion matrix report

In [None]:
def plot_cm(y_true, prediction, p=0.5):
    """Show the confusion matrix of ``prediction > p`` against ``y_true``.

    Draws an annotated heatmap and prints the four cells of the matrix plus
    the total number of positive (transported) samples.
    """
    matrix = confusion_matrix(y_true, prediction > p)

    plt.figure(figsize=(3,3))
    sns.heatmap(matrix, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    report = [
        ('Not Transported Detected (True Negatives): ', matrix[0][0]),
        ('Not Transported Incorrectly Detected (False Positives): ', matrix[0][1]),
        ('Transported Missed (False Negatives): ', matrix[1][0]),
        ('Transported Detected (True Positives): ', matrix[1][1]),
        ('Total Transported: ', np.sum(matrix[1])),
    ]
    for label, count in report:
        print(label, count)
    plt.show()

In [None]:
from sklearn.metrics import f1_score,accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
from IPython.display import clear_output

## kfold and training model

In [None]:
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Pick a tf.distribute strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: treat it as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() # default distribution strategy in Tensorflow. Works on CPU and single GPU.

# Number of replicas the chosen strategy keeps in sync.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
train.columns

In [None]:
# cols = (train.drop("Transported",axis = 1)).columns.to_list()

<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>acc</th>      <th>None</th>      <th>None</th>      <th>None</th>      <th>None</th>      <th>None</th>      <th>None</th>      <th>None</th>      <th>None</th>      <th>None</th>      <th>None</th>    </tr>  </thead>  <tbody>    <tr>      <th>t1</th>      <td>70</td>      <td>mean</td>      <td>std</td>      <td>sum</td>      <td>var</td>      <td>ptp</td>      <td>logsumexp</td>      <td>euc</td>      <td>spavsvrdeck</td>      <td>rmsvsshp</td>      <td>rmsvsfc</td>    </tr>    <tr>      <th>t2</th>      <td>77</td>      <td>CryoSleep</td>      <td>Age</td>      <td>VIP</td>      <td>RoomService</td>      <td>FoodCourt</td>      <td>ShoppingMall</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>    </tr>    <tr>      <th>t3</th>      <td>79</td>      <td>RoomService</td>      <td>FoodCourt</td>      <td>ShoppingMall</td>      <td>Spa</td>      <td>VRDeck</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>    </tr>    <tr>      <th>t4</th>      <td>73</td>      <td>CryoSleep</td>      <td>Age</td>      <td>VIP</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>    </tr>    <tr>      <th>t5</th>      <td>56</td>      <td>Num</td>      <td>Side</td>      <td>Deck_A</td>      <td>Deck_B</td>      <td>Deck_C</td>      <td>Deck_D</td>      <td>Deck_E</td>      <td>Deck_F</td>      <td>Deck_G</td>      <td>Deck_T</td>    </tr>    <tr>      <th>t6</th>      <td>58</td>      <td>HomePlanet_Earth</td>      <td>HomePlanet_Europa</td>      <td>HomePlanet_Mars</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>    </tr>    <tr>      <th>t7</th>      <td>49</td>      <td>Destination_55 Cancri e</td>      <td>Destination_PSO 
J318.5-22</td>      <td>Destination_TRAPPIST-1e</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>      <td>None</td>    </tr>    <tr>      <th>t8</th>      <td>79</td>      <td>RoomService</td>      <td>FoodCourt</td>      <td>ShoppingMall</td>      <td>Spa</td>      <td>VRDeck</td>      <td>CryoSleep</td>      <td>Age</td>      <td>VIP</td>      <td>None</td>      <td>None</td>    </tr>  </tbody></table>

* 70, 'mean', 'std', 'sum', 'var', 'ptp', 'logsumexp', 'euc', 'spavsvrdeck', 'rmsvsshp', 'rmsvsfc',
* 77, 'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
* 79, 'RoomService', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck',
* 73, 'CryoSleep', 'Age', 'VIP'
* 56, 'Num', 'Side','Deck_A','Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T',
* 58, 'HomePlanet_Earth','HomePlanet_Europa', 'HomePlanet_Mars', 
* 49,'Destination_55 Cancri e','Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
* 79,'RoomService', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck','CryoSleep', 'Age', 'VIP'

In [None]:
cols =['RoomService', 'FoodCourt', 'ShoppingMall','Spa',
      'CryoSleep', 'Age', 'VIP'] #!!! exclude Transported

In [None]:
# Standardise the selected features; the scaler is fitted on train only and
# then applied unchanged to test.
scaler = StandardScaler()

X = scaler.fit_transform(train[cols])
y = train['Transported']
test2 = scaler.transform(test[cols])
# Per-sample input shape as a tuple: (n_features,) rather than the bare int.
input_shape = (X.shape[1],)

In [None]:
# Manual, unshuffled k-fold cross-validation over contiguous slices of X.
# Each fold trains XGBoost, ExtraTrees, RandomForest and the Keras network on
# the remaining rows, scores them on the held-out slice and stores every
# model's test-set probabilities for later averaging.
# Because the fold size uses integer division, the last len(X) % cv rows are
# never part of a validation slice (they are always used for training).
cv = 4
num_val_samples = len(X) // cv
all_scores = []
test_preds = []
xgb_preds = []
rfc_preds = []
etc_preds = []
scores_nn = 0 
scores_xgb = 0 
scores_etc = 0 
scores_rfc = 0 

# Hyper-parameters shared by both tree ensembles. max_features='sqrt' is the
# explicit spelling of what 'auto' meant for classifiers; 'auto' is rejected
# by recent scikit-learn releases.
forest_params = dict(criterion='gini',
                     n_estimators=1750,
                     max_depth=7,
                     min_samples_split=6,
                     min_samples_leaf=6,
                     max_features='sqrt',
                     oob_score=True,
                     n_jobs=-1,
                     verbose=0)

for i in range(cv):
    print("#"*15+' processing fold',i+1,15*"#")
    # Held-out slice for this fold.
    val_data = X[i * num_val_samples: (i + 1) * num_val_samples]
    val_targets = y[i * num_val_samples: (i + 1) * num_val_samples]
    # Everything before and after the held-out slice is used for training.
    partial_train_data = np.concatenate(
                                        [X[:i * num_val_samples],
                                        X[(i + 1) * num_val_samples:]],
                                        axis=0)
    partial_y = np.concatenate(
                                            [y[:i * num_val_samples],
                                            y[(i + 1) * num_val_samples:]],
                                            axis=0)
    
    xgb = XGBClassifier( n_estimators = 1000,verbosity=0)
    # ExtraTrees needs bootstrap=True for oob_score; seeds differ per fold.
    etc = ExtraTreesClassifier(bootstrap=True,
                               random_state=(i+1)*(897),
                               **forest_params)
    rfc = RandomForestClassifier(random_state=(i+1)*(897),
                                 **forest_params)
    xgb.fit(partial_train_data, partial_y)
    etc.fit(partial_train_data, partial_y)
    rfc.fit(partial_train_data, partial_y)
    with strategy.scope():
        model = make_model(input_shape)
        history = model.fit(partial_train_data, partial_y, epochs=epochs,
                  validation_data=(val_data,val_targets), batch_size=32,
                  callbacks = [earlyStop,lrDecay,stopCallback],verbose=0)
    
    # Network predictions on the held-out slice, thresholded at 0.5.
    prediction = (model.predict(val_data).squeeze()>0.5).astype(int)
    conf_mat = confusion_matrix(val_targets,prediction)  
    class_rep = classification_report(val_targets,prediction)
    # Test-set probabilities of every model for this fold.
    prediction_test = model.predict(test2).squeeze()
    xgb_val = xgb.predict(val_data)
    etc_val = etc.predict(val_data)
    rfc_val = rfc.predict(val_data)
    xgb_test = xgb.predict_proba(test2)[:,1]
    etc_test = etc.predict_proba(test2)[:,1]
    rfc_test = rfc.predict_proba(test2)[:,1]
    xgb_preds.append(xgb_test)
    etc_preds.append(etc_test)
    rfc_preds.append(rfc_test)
    f1 =  f1_score(val_targets,prediction)
    acc_xgb = accuracy_score(val_targets,xgb_val)
    acc_etc = accuracy_score(val_targets,etc_val)
    acc_rfc = accuracy_score(val_targets,rfc_val)
    acc = accuracy_score(val_targets,prediction)
    # Validation accuracy logged for the last epoch that ran.
    all_scores.append(history.history["val_acc"][-1:][0])
    test_preds.append(prediction_test.tolist())
    plot_loss_acc(history,val_targets,prediction)
    plot_cm(val_targets, prediction, p=0.5)
    scores_nn += acc 
    scores_xgb += acc_xgb 
    scores_etc += acc_etc 
    scores_rfc += acc_rfc 


    print(class_rep)
    print(f"acc scores nn:{acc:.3f}, xgb:{acc_xgb:.3f}, etc:{acc_etc:.3f}, rfc:{acc_rfc:.3f}")
print(f"final avg. acc scores nn:{scores_nn/cv:.3f}, xgb:{scores_xgb/cv:.3f}, etc:{scores_etc/cv:.3f}, rfc:{scores_rfc/cv:.3f}")

# Results

In [None]:
# Average each model's test-set probabilities over the cross-validation folds.
nn_pred = np.mean(np.array(test_preds), axis=0)
xgb_pred = np.mean(np.array(xgb_preds), axis=0)
etc_pred = np.mean(np.array(etc_preds), axis=0)
rfc_pred = np.mean(np.array(rfc_preds), axis=0)

In [None]:
submission["Transported"]=nn_pred*0.25+xgb_pred*0.25+etc_pred*0.25+rfc_pred*0.25

In [None]:
submission["Transported"]=(submission["Transported"]>0.5).astype(bool)

In [None]:
submission.to_csv("submission.csv",index=False)

In [None]:
!head submission.csv