In [None]:
#13:56
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the competition's train and test CSV files.
# NOTE(review): absolute Kaggle input paths — the notebook only runs where this dataset is mounted.
train_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Setting up graphics and color palette
from pylab import rcParams
# Default (width, height) of every figure created below.
rcParams['figure.figsize'] = 9, 7

# Seaborn theme: 'notebook' context, white grid background, 'Set2' colour cycle.
sns.set_context('notebook')
sns.set_style('whitegrid')
pal = sns.color_palette('Set2')
sns.set_palette(pal)

import warnings  
# NOTE(review): this silences every warning for the rest of the notebook, including
# deprecation warnings from pandas / scikit-learn that may point at real problems.
warnings.filterwarnings('ignore')

# EDA:

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" after each report.
train_data.info()
test_data.info()

In [None]:
# Features: everything except the row id, the target and the free-text name.
X_train = train_data.drop(columns=['PassengerId', 'Survived', 'Name'])
y_train = train_data['Survived']
X_test = test_data.drop(columns=['PassengerId', 'Name'])
# Features and target side by side; this is the data source of the EDA plots below.
plot_data = X_train.join(y_train)
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# NOTE(review): cat_cols is not referenced again in the visible part of the notebook.
cat_cols = ['Sex', 'Embarked']

In [None]:
# Summary statistics of the numeric feature columns.
X_train[num_cols].describe()

In [None]:
# Histogram of 'Age' coloured by 'Survived', with a KDE curve overlaid.
sns.histplot(data=plot_data, x='Age', hue='Survived' , kde=True)

In [None]:
# Row counts per 'Embarked' value, split by 'Survived'.
sns.countplot(data=plot_data, x='Embarked', hue='Survived')

In [None]:
# Row counts per 'Sex' value, split by 'Survived'.
sns.countplot(data=plot_data, x='Sex', hue='Survived')

In [None]:
# Stacked kernel density estimate of 'Parch' per 'Survived' class (bandwidth factor 0.3).
sns.kdeplot(data=plot_data, x='Parch', hue='Survived', multiple='stack', bw_method=.3)

In [None]:
# Stacked kernel density estimate of 'SibSp' per 'Survived' class (bandwidth factor 0.3).
sns.kdeplot(data=plot_data, x='SibSp', hue='Survived', multiple='stack', bw_method=.3)

In [None]:
# 'Fare' values per 'Pclass', with the two 'Survived' classes drawn side by side (dodge=True).
sns.stripplot(data=plot_data, x='Pclass', y='Fare', hue='Survived', dodge=True)

## Ticket and Cabin

In [None]:
# Split every distinct, non-missing cabin label into its leading letter and the
# number that follows it, then inspect both parts.
# dropna() replaces the str() / remove('nan') round-trip, which raised ValueError
# whenever the column happened to contain no missing value.
cabins = X_train['Cabin'].dropna().astype(str).unique().tolist()
cabins_cat = pd.Series([s[0] for s in cabins])
cabins_num = pd.Series([int(s[1:]) for s in cabins])
print(cabins_cat.unique())
sns.histplot(data = cabins_num)

In [None]:
# Explore the ticket format: an optional textual prefix followed by a number.
# dropna() replaces list.remove(np.nan), which depends on NaN object identity and
# raised ValueError when the column had no missing value.
tickets = train_data['Ticket'].dropna().unique().tolist()
# A ticket without any letter gets the placeholder prefix 'xxx' and is kept whole as
# the number; otherwise the prefix is the first space-separated token (lower-cased,
# dots removed) and the number is the last token.
tickets_formatted = [('xxx', x) if not x.lower().islower() else (x.split(' ')[0].lower().replace('.', ''), x.split(' ')[-1]) for x in tickets]
tickets_cat, tickets_num = list(zip(*tickets_formatted))
# Prefixes split on '/' into a varying number of parts. Keep them as a list of lists:
# np.array() on such ragged input raises ValueError on NumPy >= 1.24.
tickets_temp = [x.split('/') for x in tickets_cat]
tickets_cat_list = pd.Series([part for parts in tickets_temp for part in parts])
tickets_cat = pd.Series(tickets_cat)
tickets_num = pd.Series(tickets_num)
print(tickets_cat_list.unique())

In [None]:
# Number of distinct non-missing ticket strings (Series.count() ignores NaN).
print(pd.Series(train_data['Ticket'].unique()).count())
# Convert the number part of every ticket to int, skipping empty tokens.
tickets_num = pd.Series([int(x) for x in tickets_num if x != ''])
sns.histplot(data=tickets_num)
tickets_num

# Preprocessing:

## Cabin and Ticket

In [None]:
def split_cabin(df):
    """Replace the 'Cabin' column with its leading letter and trailing number.

    Missing cabins are treated as the placeholder label 'Z0', i.e. letter 'Z'
    and number 0. The input frame is left untouched.

    Args:
        df: DataFrame with a 'Cabin' column of strings (NaN allowed).

    Returns:
        A new DataFrame without 'Cabin' and with two appended columns:
        'Cabin_cat' (first character) and 'Cabin_num' (the rest parsed as int).

    Raises:
        ValueError: if the part after the first character is not an integer.
    """
    labels = df['Cabin'].fillna(value='Z0')
    letters = [label[0] for label in labels]
    numbers = [int(label[1:]) for label in labels]

    # drop() hands back a fresh frame, so the caller's data is never modified.
    result = df.drop(columns=['Cabin'])
    result['Cabin_cat'] = letters
    result['Cabin_num'] = numbers
    return result

# Apply the cabin split to both feature frames.
# NOTE(review): X_train / X_test are rebound in place, so this cell cannot be re-run on
# its own — 'Cabin' no longer exists after the first execution.
X_train = split_cabin(X_train)
X_test = split_cabin(X_test)
X_train

In [None]:
def split_ticket(df):
    """Replace the 'Ticket' column with a textual prefix and an integer number.

    Parsing rules:
      * a missing ticket is treated as 'xxx 0';
      * a ticket containing no letter gets the placeholder prefix 'xxx' and is
        used whole as the number;
      * otherwise the prefix is the first space-separated token, lower-cased with
        dots removed, and the number is the last space-separated token.
    A number token that is empty or not numeric becomes 0, the same value used
    for missing tickets.

    The new columns are assigned positionally, so the result is correct for any
    index on `df` (the previous Series assignment aligned on a fresh 0..n-1 index
    and produced NaN for every other index). The input frame is left untouched.

    Args:
        df: DataFrame with a 'Ticket' column of strings (NaN allowed).

    Returns:
        A new DataFrame without 'Ticket' and with two appended columns:
        'Ticket_cat' (str prefix) and 'Ticket_num' (int).
    """
    data = df.copy()
    tickets = data['Ticket'].fillna(value='xxx 0')

    prefixes, numbers = [], []
    for ticket in tickets:
        # str.islower() is True only if at least one cased character is present,
        # so this detects "the ticket contains a letter".
        if ticket.lower().islower():
            tokens = ticket.split(' ')
            prefixes.append(tokens[0].lower().replace('.', ''))
            numbers.append(tokens[-1])
        else:
            prefixes.append('xxx')
            numbers.append(ticket)

    # errors='coerce' turns '' and non-numeric tokens into NaN, which then maps to 0.
    parsed = pd.to_numeric(pd.Series(numbers, dtype=object), errors='coerce')
    data['Ticket_cat'] = prefixes
    data['Ticket_num'] = parsed.fillna(0).astype(int).to_numpy()
    return data.drop(columns=['Ticket'])

# Apply the ticket split to both feature frames.
# NOTE(review): like the cabin cell, this rebinds X_train / X_test and cannot be re-run
# on its own once 'Ticket' has been dropped.
X_train = split_ticket(X_train)
X_test = split_ticket(X_test)
X_train

## Encoding


In [None]:
from sklearn.preprocessing import OrdinalEncoder
# One shared encoder instance: fitted on the training frame, reused as-is on the test frame.
encoder = OrdinalEncoder()

In [None]:
def enc_cat(df, enc, train=True):
    """Ordinal-encode the categorical columns while keeping missing 'Embarked' as NaN.

    Missing 'Embarked' values are temporarily replaced by the placeholder 'Z' so
    the encoder can process the column; after encoding, the rows carrying the
    placeholder's code are set back to NaN.

    Args:
        df: DataFrame containing 'Sex', 'Embarked', 'Cabin_cat' and 'Ticket_cat'.
        enc: encoder exposing fit_transform / transform and, once fitted, a
            `categories_` list with one array of categories per encoded column
            (e.g. sklearn's OrdinalEncoder).
        train: if True the encoder is fitted on `df`; otherwise the already
            fitted encoder is only applied.

    Returns:
        A copy of `df` with the four columns replaced by their numeric codes.

    NOTE(review): with train=False, a missing 'Embarked' still requires that 'Z'
    was seen during fitting; otherwise the encoder's own unknown-category
    handling applies.
    """
    cat_columns = ['Sex', 'Embarked', 'Cabin_cat', 'Ticket_cat']
    data = df.copy()
    # Plain assignment: a chained `data['Embarked'].fillna(..., inplace=True)` acts on a
    # temporary object under pandas Copy-on-Write and would leave `data` unchanged.
    data['Embarked'] = data['Embarked'].fillna(value='Z')
    if train:
        data[cat_columns] = enc.fit_transform(data[cat_columns])
    else:
        data[cat_columns] = enc.transform(data[cat_columns])

    # Look up the placeholder's code instead of assuming it is 3: the code depends on
    # how many 'Embarked' categories the encoder was fitted on.
    embarked_levels = list(enc.categories_[cat_columns.index('Embarked')])
    if 'Z' in embarked_levels:
        placeholder_code = embarked_levels.index('Z')
        data.loc[data['Embarked'] == placeholder_code, 'Embarked'] = np.nan
    return data
    
# Fit the encoder on the training frame, then apply the fitted encoder to the test frame.
X_train = enc_cat(X_train, encoder)
X_test = enc_cat(X_test, encoder, train=False)
X_train

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column (the ordinal-encoded categoricals included) with statistics
# fitted on the training frame only.
scaler = StandardScaler()
columns = X_train.columns

# Rebuilding the frames from the scaled arrays also resets both indexes to 0..n-1.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=columns)
X_train.info()

In [None]:
X_train.describe()

In [None]:
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor(n_neighbors=50, weights='distance')

# Impute 'Fare', 'Embarked' and 'Age' in turn with a distance-weighted KNN regression
# on all other features. Columns filled earlier in the loop are available as complete
# predictors for the later ones, so the column order matters.
for col in ['Fare', 'Embarked', 'Age']:
    # Fit on the rows that have no missing value at all (computed once per column).
    complete_rows = X_train.dropna()
    knn_reg.fit(complete_rows.drop(columns=[col]), complete_rows[col])

    # Only rows missing `col` but complete everywhere else can be predicted;
    # the remaining gaps are left as NaN for the later fillna step.
    train_na = X_train[X_train[col].isna()].drop(columns=[col]).dropna()
    # predict() rejects an input with zero rows, so skip frames with nothing to fill.
    if len(train_na):
        X_train.loc[train_na.index, col] = knn_reg.predict(train_na)
    test_na = X_test[X_test[col].isna()].drop(columns=[col]).dropna()
    if len(test_na):
        X_test.loc[test_na.index, col] = knn_reg.predict(test_na)
    print('Filled', col, 'missing values.')

In [None]:
# Whatever the KNN step could not fill becomes 0 (the column mean on the training
# frame, since the features were standardised before imputation).
X_train.fillna(value=0, inplace=True)
X_test.fillna(value=0, inplace=True)
X_test.info()

# Pseudo Labeling with QDA

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# Quadratic discriminant analysis (reg_param=.35) fitted on the labelled training data;
# it is used below only to pseudo-label test rows.
qda_model = QDA(reg_param=.35)
qda_model.fit(X_train, y_train)

In [None]:
# Second-to-last probability column, i.e. column 0 when the model has two classes.
# Values close to 0 or to 1 are both confident predictions (one per class).
qda_pred = qda_model.predict_proba(X_test)[:, -2]
thresh = .06
# Select confident rows with a boolean mask. The previous version fed np.argwhere
# *positions* to .loc (a *label* indexer), which is only right while X_test has a
# 0..n-1 index.
pseudo_mask = np.logical_or(qda_pred > 1 - thresh, qda_pred < thresh)
pseudo_labeled_X = X_test[pseudo_mask]
# predict() rejects an empty input, so fall back to an empty label Series when no
# test row passes the confidence threshold.
if len(pseudo_labeled_X):
    pseudo_labeled_y = pd.Series(qda_model.predict(pseudo_labeled_X))
else:
    pseudo_labeled_y = pd.Series(dtype=y_train.dtype)

In [None]:
# Append the pseudo-labelled test rows to the training data.
# NOTE(review): not idempotent — re-running this cell appends the same rows again.
X_train = pd.concat([X_train, pseudo_labeled_X], ignore_index=True)
y_train = pd.concat([y_train, pseudo_labeled_y], ignore_index=True)

In [None]:
from sklearn.model_selection import train_test_split

print("# of samples: " + str(y_train.shape[0]))

# Splitting data into train (90%) and dev (10%), stratified on the label.
# X / y keep pointing at the full data; X_train / y_train are rebound to the 90% part.
X, y = (X_train, y_train)
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size = .1, stratify = y_train, random_state = 69)
# Labels as float32 column vectors of shape (n, 1).
y_train = np.array(y_train).astype(np.float32).reshape((-1,1))
y_dev = np.array(y_dev).astype(np.float32).reshape((-1,1))

print("X_train shape: " + str(X_train.shape) + "\t y_train shape:" + str(y_train.shape))
print("X_dev shape:  " + str(X_dev.shape) + "\t y_dev shape: " + str(y_dev.shape))

# Number of positive labels in each split.
print(sum(y_train==1))
print(sum(y_dev==1))

# Machine Learning:

## 1st Layer:

### Random Forest 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Grid-Search Tuning Hyper-Params
rf_params= [{
    'min_samples_split': [70, 90, 100, 120]
}]

# NOTE(review): RandomForestClassifier() has no random_state, so results vary between runs.
rf_model = GridSearchCV(
    RandomForestClassifier(), rf_params, scoring='accuracy', verbose=3
)
# NOTE(review): fitted on X / y, the full data from before the train/dev split, so this
# model has seen the X_dev rows. Dev-set scores that involve it (the ensemble) are
# therefore optimistic — confirm this is intended.
rf_model.fit(X, np.ravel(y))

In [None]:
# Best mean cross-validated accuracy of the random-forest search and the parameters that produced it.
print('Best score achieved:', rf_model.best_score_)
print('With params:\n', rf_model.best_params_)

In [None]:
# rf_model = RandomForestClassifier(n_estimators=100, criterion='entropy', min_samples_leaf=20)
# rf_model.fit(X_train, np.ravel(y_train))

# y_pred = rf_model.predict_proba(X_train)
# print('Score on the training set:')
# print(classification_report(y_train, np.around(y_pred[:, 1])))
# print('roc_auc score: ', end='')
# print(roc_auc_score(y_train, y_pred[:, 1]))
# print('f1 score:', f1_score(y_train,np.around(y_pred[:, 1])), end='\n\n')

# y_pred = rf_model.predict_proba(X_dev)
# print('Score on the dev set:')
# print(classification_report(y_dev, np.around(y_pred[:, 1])))
# print('roc_auc score: ', end='')
# print(roc_auc_score(y_dev, y_pred[:, 1]))
# print('f1 score:', f1_score(y_dev,np.around(y_pred[:, 1])), end='\n\n')

### XGBoost 

In [None]:
import xgboost as xgb

# Search grid for the XGBoost classifier.
xgb_params= [{
    'max_depth': [2, 3],
    'min_child_weight': [60, 70, 80],
    'lambda': [2.2, 2.5, 2.8]
}]

# NOTE(review): eval_metric='rmse' is paired with a binary:logistic objective — confirm
# that a classification metric was not intended.
xgb_model = GridSearchCV(
    xgb.XGBClassifier(objective="binary:logistic", eval_metric='rmse', use_label_encoder = False, random_state=42),
    xgb_params, scoring='accuracy', verbose=3
)
# NOTE(review): fitted on X / y, the full data from before the train/dev split, so this
# model has seen the X_dev rows (same caveat as the random-forest search).
xgb_model.fit(X, np.ravel(y))

In [None]:
# Best mean cross-validated accuracy of the XGBoost search and the parameters that produced it.
print('Best score achieved:', xgb_model.best_score_)
print('With params:\n', xgb_model.best_params_)

In [None]:
# y_pred = xgb_model.predict_proba(X_train)
# print('Score on the training set:')
# print(classification_report(y_train, np.around(y_pred[:, 1])))
# print('roc_auc score: ', end='')
# print(roc_auc_score(y_train, y_pred[:, 1]))
# print('f1 score:', f1_score(y_train,np.around(y_pred[:, 1])), end='\n\n')

# y_pred = xgb_model.predict_proba(X_dev)
# print('Score on the dev set:')
# print(classification_report(y_dev, np.around(y_pred[:, 1])))
# print('roc_auc score: ', end='')
# print(roc_auc_score(y_dev, y_pred[:, 1]))
# print('f1 score:', f1_score(y_dev,np.around(y_pred[:, 1])), end='\n\n')

### ANN:

In [None]:
def dfify(hist):
    """Convert a training history into a long-format DataFrame for plotting.

    Args:
        hist: object with a `history` attribute mapping metric names to per-epoch
            value lists (e.g. the object returned by Keras `model.fit`). Metrics
            whose name starts with 'val' are treated as validation metrics.

    Returns:
        DataFrame with one row per (epoch, phase): the metric columns (with the
        'val_' prefix stripped for validation rows), 'epoch', and 'phase', which
        is 'train' or 'val'. When the history has no validation metrics only the
        'train' rows are returned; previously an all-NaN 'val' block was appended.
    """
    df = pd.DataFrame(hist.history)
    df['epoch'] = df.index
    val_cols = [x for x in df.columns if x.startswith('val')]

    # drop() / rename() return new frames, so adding 'phase' never writes into a
    # slice of `df` (the source of the former SettingWithCopyWarning).
    train_df = df.drop(columns=val_cols)
    train_df['phase'] = 'train'
    frames = [train_df]

    if val_cols:
        val_df = df[val_cols + ['epoch']].rename(
            columns={col: col.split('val_')[-1] for col in val_cols}
        )
        val_df['phase'] = 'val'
        frames.append(val_df)

    return pd.concat(frames, ignore_index=True)

def visu_history(hist):
    """Plot loss (log y-axis) and accuracy per epoch, side by side, one line per phase.

    Sets the global default figure size to 20 x 7 before drawing, which also
    affects figures created afterwards.

    Args:
        hist: training history accepted by `dfify`; it must provide 'loss' and
            'accuracy' metrics.
    """
    rcParams['figure.figsize'] = 20, 7
    curves = dfify(hist)
    _, (loss_ax, acc_ax) = plt.subplots(1, 2)

    # Left panel: loss on a logarithmic scale.
    sns.lineplot(data=curves, x='epoch', y='loss', hue='phase', ax=loss_ax)
    loss_ax.set(yscale='log')

    # Right panel: accuracy on a linear scale.
    sns.lineplot(data=curves, x='epoch', y='accuracy', hue='phase', ax=acc_ax)
    plt.show()

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.metrics import classification_report

# Reset Keras' global state before building any model.
tf.keras.backend.clear_session()

# Shorthand: build an L2 kernel regulariser with penalty factor `y`.
regu = lambda y : tf.keras.regularizers.L2(l2=y)

def make_model(optimizer, loss_fn, metrics, output_bias='zeros', dropout=0, l2regu=0):
    """Build and compile the fully connected binary classifier.

    Architecture: Flatten -> Dense 100/50/30/10 (ReLU, He-normal init) ->
    Dense 3 (tanh, Glorot-normal init) -> Dense 1 (sigmoid). Every hidden Dense
    layer is L2-regularised and followed by a Dropout layer.

    The input width is read from the notebook-level `X_train`.

    Args:
        optimizer: Keras optimizer passed to `compile`.
        loss_fn: loss passed to `compile`.
        metrics: list of metrics passed to `compile`.
        output_bias: bias initializer of the output layer.
        dropout: dropout rate applied after every hidden layer.
        l2regu: L2 penalty factor for the hidden layers' kernels.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    keras_layers = tf.keras.layers
    he_normal = tf.keras.initializers.HeNormal

    stack = [keras_layers.Flatten(input_shape=(X_train.shape[-1],))]

    # ReLU trunk: each Dense layer is followed by its own Dropout layer.
    for width in (100, 50, 30, 10):
        stack.append(keras_layers.Dense(
            width, activation='relu',
            kernel_initializer=he_normal, kernel_regularizer=regu(l2regu),
        ))
        stack.append(keras_layers.Dropout(dropout))

    # Narrow tanh bottleneck ahead of the sigmoid output unit.
    stack.append(keras_layers.Dense(
        3, activation='tanh',
        kernel_initializer=tf.keras.initializers.GlorotNormal, kernel_regularizer=regu(l2regu),
    ))
    stack.append(keras_layers.Dropout(dropout))
    stack.append(keras_layers.Dense(
        1, activation='sigmoid',
        kernel_initializer=tf.initializers.GlorotNormal, bias_initializer=output_bias,
    ))

    model = tf.keras.Sequential(stack)
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
    return model


# Binary cross-entropy matches the single sigmoid output unit; reused by every model below.
loss_fn = tf.losses.BinaryCrossentropy()

# NOTE(review): momentum=1 is unusually high (0.9 is the common choice) — confirm intended.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3, momentum=1)

# First model: no dropout and no L2 penalty (the make_model defaults).
ann_model = make_model(optimizer, loss_fn, ['accuracy'])
ann_model.summary()

Sanity check — overfitting a small batch of 20 rows:

In [None]:
# Train on rows 100..119 only, without validation data; the network should be able to memorise them.
history = ann_model.fit(X_train[100:120], y_train[100:120], epochs=300, batch_size=1024, verbose=2)

In [None]:
# Loss / accuracy curves of the 20-row run.
visu_history(history)

Overfitting:

In [None]:
# Unregularised model (no dropout, no L2) trained for 1000 epochs on the whole training split,
# with the dev split as validation data.
ann_model = make_model(tf.keras.optimizers.Adam(learning_rate=.004), loss_fn, ['accuracy'])
history = ann_model.fit(X_train, y_train, epochs=1000, batch_size=2048, verbose=2, validation_data=(X_dev, y_dev))

In [None]:
# Loss / accuracy curves of the unregularised run.
visu_history(history)

Regularization:

In [None]:
# Stop once val_accuracy has not improved for 200 epochs and roll back to the best weights.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=200, mode='max', restore_best_weights=True, verbose=2)
# Regularised model: AdamW weight decay, dropout and an L2 kernel penalty combined.
ann_model = make_model(tfa.optimizers.AdamW(learning_rate=.004, weight_decay=7e-5), loss_fn, ['accuracy'], dropout=0.05, l2regu=2e-5)
history = ann_model.fit(
    X_train, y_train, epochs=500, batch_size=2048, 
    callbacks=[callback], validation_data=(X_dev, y_dev), verbose=2
)

In [None]:
# Loss / accuracy curves of the regularised run.
visu_history(history)

## Hyperparameter Tuning

In [None]:
# Random search: number of hyper-parameter combinations to try.
iterations = 50
# Parameters:
# Learning rate, weight decay and L2 factor are sampled log-uniformly (10**uniform);
# dropout is sampled uniformly.
# NOTE(review): no random seed is set, so the sampled combinations differ on every run.
l_rate_range = 10**np.random.uniform(-4.5, -1.5, iterations)
w_decay_range = 10**np.random.uniform(-2.5, -5.5, iterations)
dropout_range = np.random.uniform(.01, .1, iterations)
lambd_regu_range = 10**np.random.uniform(-3, -7, iterations)

#combos = [(0.00018754084977224016, 2.4085576475004506e-05, 0.04475658897898782, 0.0002886421264356717)]
combos = list(zip(l_rate_range, w_decay_range, dropout_range, lambd_regu_range))
best_accuracy = 0

# One row per tried combination. The 'droupout' spelling is also used by the plotting
# cell that reads this frame, so the two must stay in sync.
combo_scores = pd.DataFrame(columns=['l_rate', 'w_decay', 'droupout', 'lambd_regu', 'score'])

# Same initial weights for consistency:
ann_model = make_model(tfa.optimizers.AdamW(learning_rate=2e-4, weight_decay=2e-5), loss_fn, ['accuracy'], dropout=.045, l2regu=2e-5)
ann_model.save_weights('initial_weights')

In [None]:
# Random search: train one model per sampled combination, always starting from the same
# saved initial weights, and keep the weights of the best dev-accuracy run on disk.
for i, (l_rate, w_decay, dropout, lambd_regu) in enumerate(combos, start=1):
    print('********* iteration', i, '/', iterations,'*********')
    print('L_rate:', l_rate, '\tw_decay:', w_decay, '\tdropout:', dropout, '\tlambd_regu:', lambd_regu)

    callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=500, mode='max',
        restore_best_weights=True, verbose=2,
    )
    ann_model = make_model(
        tfa.optimizers.AdamW(learning_rate=l_rate, weight_decay=w_decay),
        loss_fn, ['accuracy'], dropout=dropout, l2regu=lambd_regu,
    )
    ann_model.load_weights('initial_weights')
    history = ann_model.fit(
        X_train, y_train,
        epochs=1500, batch_size=2048,
        callbacks=[callback],
        validation_data=(X_dev, y_dev),
        verbose=0,
    )

    # evaluate() returns [loss, accuracy]; keep the accuracy.
    val_acc = ann_model.evaluate(X_dev, y_dev, batch_size=2048)[1]
    combo_scores.loc[i-1] = (l_rate, w_decay, dropout, lambd_regu, val_acc)
    print('score:', val_acc)

    if val_acc > best_accuracy:
        best_accuracy = val_acc
        best_history = history
        ann_model.save_weights('best_weights')
        print('BEST SCORE YET!!!')

# Leave ann_model holding the best weights found during the search.
ann_model.load_weights('best_weights')

In [None]:
# Reload the best weights from disk (lets later cells run without repeating the search loop,
# provided ann_model and the 'best_weights' files exist).
ann_model.load_weights('best_weights')

In [None]:
# Best-scoring hyper-parameter combination.
# idxmax() returns the row *label*, which is what .loc expects; argmax() returns a
# *position* and only matched because the labels happen to be 0..n-1.
combo_scores.loc[combo_scores['score'].astype(float).idxmax()]

In [None]:
# Loss / accuracy curves of the best run of the random search.
visu_history(best_history)

In [None]:
# Pairwise scatter plots of the sampled hyper-parameters, coloured by dev accuracy.
rcParams['figure.figsize'] = 20, 13
# Colour scale limits: scores outside [min_score, max_score] saturate.
min_score = .838
max_score = .84
fig, axes = plt.subplots(2,3)

# One entry per panel in row-major order: (x column, y column, log x-axis?, log y-axis?).
panel_specs = [
    ('l_rate', 'w_decay', True, True),
    ('l_rate', 'droupout', True, False),
    ('l_rate', 'lambd_regu', True, True),
    ('droupout', 'w_decay', False, True),
    ('lambd_regu', 'w_decay', True, True),
    ('droupout', 'lambd_regu', False, True),
]

for ax, (x_col, y_col, log_x, log_y) in zip(np.ravel(axes), panel_specs):
    sns.scatterplot(
        data=combo_scores, x=x_col, y=y_col,
        hue='score', hue_norm=(min_score, max_score), ax=ax,
    )
    if log_x:
        ax.set(xscale='log')
    if log_y:
        ax.set(yscale='log')
    # The per-panel legends are dropped.
    ax.get_legend().remove()

plt.show()

In [None]:
# Evaluate the neural network on the training split. y_pred holds the model's sigmoid
# outputs; they are rounded to 0/1 for the label-based metrics and used as scores for ROC AUC.
y_pred = ann_model.predict(X_train)
print('Score on the training set:')
print(classification_report(y_train, np.around(y_pred)))
print('roc_auc score: ', end='')
print(roc_auc_score(y_train, y_pred))
print('f1 score:', f1_score(y_train,np.around(y_pred)), end='\n\n')

# Same report on the dev split.
y_pred = ann_model.predict(X_dev)
print('Score on the dev set:')
print(classification_report(y_dev, np.around(y_pred)))
print('roc_auc score: ', end='')
print(roc_auc_score(y_dev, y_pred))
print('f1 score:', f1_score(y_dev,np.around(y_pred)), end='\n\n')

# Ensemble Model (Layer 2)

In [None]:
class ensemble:
    """Two-layer stacking model.

    The already fitted first-layer models each produce a positive-class score;
    those scores form the feature matrix on which the second-layer model
    (`esbl_model`) is trained and queried.

    First-layer models may expose `predict_proba` (scikit-learn style) or only
    `predict` (e.g. a Keras model whose `predict` already returns probabilities).

    Attributes:
        prev_layer: list of fitted first-layer models.
        esbl_model: second-layer estimator with fit / predict / predict_proba.
        prev_layer_pred_train: first-layer scores from the last `fit` call.
        prev_layer_pred: first-layer scores from the last prediction call.

    NOTE(review): `fit` trains the second layer on first-layer predictions for the
    same rows it is given; if the first-layer models were trained on those rows,
    these predictions are in-sample and the second layer may overfit.
    """

    def __init__(self, prev_layer, esbl_model):
        """Store the fitted first-layer models and the (unfitted) second-layer model."""
        self.esbl_model = esbl_model
        self.prev_layer = prev_layer
        self.prev_layer_pred_train = np.array([])
        self.prev_layer_pred = np.array([])

    @staticmethod
    def _positive_score(model, X_loc):
        """Return a 1-D array with `model`'s score for the last (positive) class."""
        # Prefer predict_proba; fall back to predict for models that lack it.
        if hasattr(model, 'predict_proba'):
            scores = model.predict_proba(X_loc)
        else:
            scores = model.predict(X_loc)
        scores = np.asarray(scores)
        return scores if scores.ndim == 1 else scores[:, -1]

    def _first_layer_scores(self, X_loc):
        """Return an (n_rows, n_models) array of first-layer positive-class scores."""
        scores = np.zeros(shape=(len(X_loc), len(self.prev_layer)))
        for i, model in enumerate(self.prev_layer):
            scores[:, i] = self._positive_score(model, X_loc)
        return scores

    def fit(self, X_loc, y_loc):
        """Fit the second-layer model on the first-layer scores for `X_loc`.

        `y_loc` may be a column vector; it is flattened before fitting.
        """
        self.prev_layer_pred_train = self._first_layer_scores(X_loc)
        self.esbl_model.fit(self.prev_layer_pred_train, np.ravel(y_loc))

    def predict_prev(self, X_loc):
        """Return the first-layer scores for `X_loc` as a DataFrame (one column per model)."""
        self.prev_layer_pred = self._first_layer_scores(X_loc)
        return pd.DataFrame(self.prev_layer_pred)

    def predict_proba(self, X_loc):
        """Return the second-layer class probabilities for `X_loc`."""
        self.predict_prev(X_loc)
        return self.esbl_model.predict_proba(self.prev_layer_pred)

    def predict(self, X_loc):
        """Return the second-layer class predictions for `X_loc`."""
        self.predict_prev(X_loc)
        return self.esbl_model.predict(self.prev_layer_pred)

In [None]:
# Candidate second-layer models; only xgb_esbl is actually used below.
xgb_esbl = xgb.XGBClassifier(objective="binary:logistic", eval_metric='rmse', use_label_encoder = False, random_state=42)
# NOTE(review): rf_esbl is not referenced again in the visible part of the notebook.
rf_esbl = RandomForestClassifier(min_samples_split=100)
# Stack the three fitted first-layer models under the XGBoost second layer.
esbl_model = ensemble([rf_model, xgb_model, ann_model], xgb_esbl)
esbl_model.fit(X_train, y_train)

In [None]:
# Evaluate the stacked model on the training and dev splits.
# ROC AUC is computed from the positive-class probability: feeding it the hard 0/1
# labels returned by predict() reduces the ROC curve to a single operating point.
for split_name, X_split, y_split in (('training', X_train, y_train), ('dev', X_dev, y_dev)):
    y_pred = esbl_model.predict(X_split)
    y_score = esbl_model.predict_proba(X_split)[:, 1]
    print('Score on the ' + split_name + ' set:')
    print(classification_report(y_split, np.around(y_pred)))
    print('roc_auc score: ', end='')
    print(roc_auc_score(y_split, y_score))
    print('f1 score:', f1_score(y_split, np.around(y_pred)), end='\n\n')

# Submission:

In [None]:
# First-layer positive-class scores for the test set, one column per model
# (only referenced by the commented-out averaging alternative in the next cell).
first_layer_pred = esbl_model.predict_prev(X_test)

In [None]:
# Choose the model:
model = ann_model
# Round the model's test-set outputs to 0/1 integer labels.
decision = np.around(model.predict(X_test).ravel()).astype(int)
# decision = np.around(first_layer_pred.mean(axis=1)).astype(int)

# Submission frame: original passenger ids paired with the predicted labels.
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': decision})

from IPython.display import HTML
import base64
def create_download_link(df, title = "Download CSV file", filename = "submission.csv"):
    """Build a clickable link that downloads `df` as a CSV file.

    The CSV text (written without the index) is base64-encoded and embedded in
    the link as a `data:` URI, so no file is written to disk.

    Args:
        df: DataFrame to export.
        title: text shown for the link.
        filename: file name suggested to the browser for the download.

    Returns:
        An `IPython.display.HTML` object wrapping the anchor element.
    """
    csv_bytes = df.to_csv(index=False).encode()
    payload = base64.b64encode(csv_bytes).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=payload,title=title,filename=filename))

# Render the download link for the submission file as the cell output.
create_download_link(submission)