# Import libraries and load data

In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.utils import resample

from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# Load the competition's labelled training set and the unlabelled test set
train_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Preview the first rows of the training set
train_data.head()

In [None]:
# Preview the first rows of the test set
test_data.head()

In [None]:
# Hold out 20% of the labelled rows for validation; the remaining 80% is
# kept under the name train_data (seeded so the split is reproducible)
train_data, validation_data = train_test_split(
    train_data,
    test_size=0.2,
    random_state=111,
)

# Exploratory analysis

In [None]:
# Column dtypes and non-null counts of the training split
train_data.info()

In [None]:
# Summary statistics of the numeric columns
train_data.describe()

In [None]:
# Summary (count / unique / top / freq) of the text columns
text_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
train_data[text_cols].describe()

# Preprocessing

In [None]:
# Engineered features
# Alone: 1 when both SibSp and Parch are zero
has_no_relatives = (train_data['SibSp'] == 0) & (train_data['Parch'] == 0)
train_data['Alone'] = has_no_relatives.astype(int)

# Woman_or_child: 1 for females and for passengers younger than 14
is_woman_or_child = (train_data['Sex'] == 'female') | (train_data['Age'] < 14)
train_data['Woman_or_child'] = is_woman_or_child.astype(int)

# Family_size: number of relatives travelling with the passenger
train_data['Family_size'] = train_data['Parch'] + train_data['SibSp']

In [None]:
# Keep only the first character of the cabin code (the deck letter);
# a missing cabin becomes a single blank character
cabin_filled = train_data['Cabin'].replace(np.nan, ' ')
train_data['Cabin_letter'] = cabin_filled.str[0]
train_data.drop(columns=['Cabin'], inplace=True)

In [None]:
# group age: children 0-14, youth 14-25, adults 25-64, seniors 65-...
def categorize_age(age):
    """Map an age in years to an ordinal age bracket.

    Parameters
    ----------
    age : float
        Age in years; may be NaN when the age is unknown.

    Returns
    -------
    int or float
        0 for age < 14, 1 for 14 <= age < 25, 2 for 25 <= age < 65,
        3 for age >= 65, and NaN when ``age`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing age
    # would fall through to the last branch and be labelled as a senior.
    if pd.isna(age):
        return np.nan
    if age < 14:
        return 0
    elif age < 25:
        return 1
    elif age < 65:
        return 2
    else:
        return 3

# Bucket every passenger's age with categorize_age
train_data['Age_category'] = train_data['Age'].map(categorize_age)

In [None]:
# group fare: 0-10, 10-25, 25-40, 40-...
def categorize_fare(fare):
    """Map a ticket fare to an ordinal fare bracket.

    Parameters
    ----------
    fare : float
        Fare paid; may be NaN when the fare is unknown.

    Returns
    -------
    int or float
        0 for fare < 10, 1 for 10 <= fare < 25, 2 for 25 <= fare < 40,
        3 for fare >= 40, and NaN when ``fare`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing fare
    # would fall through to the last branch and land in the most expensive bracket.
    if pd.isna(fare):
        return np.nan
    if fare < 10:
        return 0
    elif fare < 25:
        return 1
    elif fare < 40:
        return 2
    else:
        return 3

# Bucket every passenger's fare with categorize_fare
train_data['Fare_category'] = train_data['Fare'].map(categorize_fare)

In [None]:
# Replace the ticket code by a flag: 1 when a ticket value is present, 0 when it is missing
mask = train_data['Ticket'].isna()
train_data['Ticket'] = (~mask).astype(int)

In [None]:
# Old_with_family: 1 for passengers in the oldest age bracket who are not travelling alone
is_old_with_family = (train_data['Age_category'] == 3) & (train_data['Alone'] == 0)
train_data['Old_with_family'] = is_old_with_family.astype(int)

# Data visualization

In [None]:
# Categorical / ordinal columns whose survival rates are plotted below.
# 'Family_size' is deliberately left out of the plots.
categorical_ordinal_cols = [
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Cabin_letter',
    'Embarked',
    'Woman_or_child',
    'Fare_category',
    'Alone',
    'Age_category',
    'Ticket',
    'Old_with_family',
]

In [None]:
# For each categorical feature, plot the share of survivors and non-survivors
# inside every category (the two bars of a category sum to 1).
for col in categorical_ordinal_cols:
    # Row-normalised contingency table: one row per category, one column per
    # outcome. Rows with a missing category value are left out.
    # reindex guarantees that both outcome columns exist, so the two bar
    # series always have one value per x position, even when a category
    # contains only survivors or only non-survivors.
    rates = (
        pd.crosstab(train_data[col], train_data['Survived'], normalize='index')
        .reindex(columns=[0, 1], fill_value=0.0)
    )
    positions = np.arange(len(rates))
    width = 0.35

    fig, ax = plt.subplots()

    ax.bar(x=positions - width/2,
           height=rates[0],
           width=width,
           label='No Survived')
    ax.bar(x=positions + width/2,
           height=rates[1],
           width=width,
           label='Survived')

    ax.set_xticks(positions)
    ax.set_xticklabels(list(rates.index))
    ax.set_title(f'Survival % based on {col}')
    ax.legend()
    plt.show()

- People in first class have a higher chance of survival.
- Females have a higher chance of survival.
- The number of siblings/spouses doesn't seem to have a big impact on the survival probability.
- The number of parents/children seems to have a little impact.
- The cabin type has an impact on the survival rate.
- People embarked in C have a higher chance of survival.
- Women and children have a higher chance of survival.
- Fare category seems to have an impact on the classification.
- Being alone or not doesn't have much impact on the survival rate.
- Neither the age category, nor having a ticket or not, nor being old with family onboard seems to have a big impact on the classification.


In [None]:
# Continuous numeric features, plotted as histograms in the next cell
continue_cols = ['Age', 'Fare']

In [None]:
# Overlay the distribution of each continuous feature for both outcomes
for col in continue_cols:
    fig, ax = plt.subplots()
    for outcome, label in ((0, 'No Survived'), (1, 'Survived')):
        values = train_data[train_data['Survived'] == outcome][col]
        ax.hist(values, alpha=0.6, label=label)
    ax.set_title(col)
    ax.legend()

    plt.show()

- It seems that babies/children and people between 20 and 40 had a higher rate of non-survival.
- People that paid a higher fare seem to have a higher chance of survival.

# More preprocessing

In [None]:
# The passenger id and the name are not used as features
train_data.drop(columns=['PassengerId', 'Name'], inplace=True)

In [None]:
# Separate the target column from the features for both splits
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')

y_val = validation_data['Survived']
X_val = validation_data.drop(columns='Survived')

In [None]:
# Fill missing values of the listed columns with their most frequent value
cols_most_freq = ['Embarked']

imp_most_freq = SimpleImputer(strategy='most_frequent')
imp_most_freq.fit(X_train[cols_most_freq])

X_train[cols_most_freq] = imp_most_freq.transform(X_train[cols_most_freq])

In [None]:
# Ordinal-encode the text columns; one encoder per column so each can be
# reused later on other data
sex_enc, cabin_enc, embarked_enc = OrdinalEncoder(), OrdinalEncoder(), OrdinalEncoder()

column_encoders = (
    ('Sex', sex_enc),
    ('Cabin_letter', cabin_enc),
    ('Embarked', embarked_enc),
)
for column, encoder in column_encoders:
    X_train[column] = encoder.fit_transform(X_train[[column]])

In [None]:
# Fill the remaining missing values with a KNN imputer; the imputer returns
# a plain array, so the column names are restored afterwards
imputer = KNNImputer()
cols = X_train.columns
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=cols)

In [None]:
# Rescale every feature with a min-max scaler, keeping the column names
scaler = MinMaxScaler()

cols = X_train.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols)

In [None]:
# Fit an isolation forest on the training features and label each row
iso = IsolationForest(contamination=0.05, random_state=111)
iso.fit(X_train)

outliers = iso.predict(X_train)

In [None]:
# Keep only the rows that were not labelled -1 by the isolation forest
mask = outliers != -1
X_train = X_train[mask]
y_train = y_train[mask]

In [None]:
# Balance the classes by downsampling the non-survivors to the number of survivors
features_reset = X_train.reset_index(drop=True)
labels_reset = y_train.reset_index(drop=True)
train_data = pd.concat([features_reset, labels_reset], axis=1)

survived = train_data[train_data['Survived']==1]
no_survived = train_data[train_data['Survived']==0]

# Sample without replacement, seeded for reproducibility
no_survived_downsampled = resample(
    no_survived,
    replace=False,
    n_samples=len(survived),
    random_state=111,
)

train_data = pd.concat([no_survived_downsampled, survived]).reset_index(drop=True)

y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')

In [None]:
def transform_data(X, fit):
    """Run the full preprocessing pipeline on a raw passenger frame.

    Steps, in order: drop the identifier columns, add the engineered
    features, impute ``cols_most_freq`` with the most frequent value,
    ordinal-encode 'Sex', 'Cabin_letter' and 'Embarked', KNN-impute the
    remaining gaps and min-max scale every column.

    The transformers are the notebook-level objects ``imp_most_freq``,
    ``sex_enc``, ``cabin_enc``, ``embarked_enc``, ``imputer`` and ``scaler``.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features (no 'Survived' column). The frame is not modified.
    fit : bool
        True re-fits every transformer on ``X`` before applying it;
        False applies the transformers exactly as previously fitted, so
        validation/test data never influences the learned statistics.

    Returns
    -------
    pandas.DataFrame
        The transformed features with a fresh 0..n-1 index.
    """

    def _apply(transformer, data):
        # Single place that decides between fitting and reusing a transformer
        if fit:
            return transformer.fit_transform(data)
        return transformer.transform(data)

    # drop columns (non in-place: work on a copy, leave the caller's frame intact)
    X = X.drop(['PassengerId', 'Name'], axis=1)
    # add cols
    X['Alone'] = 0
    X.loc[(X['SibSp']==0) & (X['Parch']==0), 'Alone'] = 1
    X['Woman_or_child'] = 0
    X.loc[(X['Sex']=='female') | (X['Age']<14), 'Woman_or_child'] = 1
    X['Family_size'] = X['Parch'] + X['SibSp']
    # transform cabin data: keep the first character, blank when missing
    X['Cabin_letter'] = X['Cabin'].replace(np.nan, ' ').map(lambda x: x[0])
    X = X.drop('Cabin', axis=1)
    # categorize age
    X['Age_category'] = X['Age'].apply(lambda x: categorize_age(x))
    # categorize fare
    X['Fare_category'] = X['Fare'].apply(lambda x: categorize_fare(x))
    # ticket transform: 1 when a ticket value is present, 0 when missing
    mask = X['Ticket'].isna()
    X['Ticket'] = 1
    X.loc[mask, 'Ticket'] = 0
    # old with family
    X['Old_with_family'] = 0
    X.loc[(X['Age_category']==3) & (X['Alone']==0), 'Old_with_family'] = 1
    # imputation most freq: reuse the shared imputer instead of fitting a new
    # one on whatever frame is passed in
    X[cols_most_freq] = _apply(imp_most_freq, X[cols_most_freq])
    # encode categorical
    X['Sex'] = _apply(sex_enc, X[['Sex']])
    X['Cabin_letter'] = _apply(cabin_enc, X[['Cabin_letter']])
    X['Embarked'] = _apply(embarked_enc, X[['Embarked']])
    # remember the column names: the next two steps return plain arrays
    feature_names = X.columns
    # impute knn
    X = _apply(imputer, X)
    # scaling
    X = _apply(scaler, X)
    X = pd.DataFrame(X, columns=feature_names)

    return X

In [None]:
# transform validation data with the already fitted transformers (fit=False)
X_val = transform_data(X_val, fit=False)

In [None]:
# Fit a CatBoost model on all features; its importances drive the feature selection
model = CatBoostClassifier(
    iterations=500,
    depth=6,
    verbose=False,
    random_state=111,
)
model.fit(X_train, y_train)

In [None]:
# One line per feature: name and CatBoost importance
for feature_name, importance in zip(X_train.columns, model.feature_importances_):
    print(feature_name, ':', importance)

In [None]:
# Remove the same features from the training and the validation sets
dropped_features = ['SibSp', 'Parch', 'Ticket', 'Alone', 'Age_category', 'Fare_category', 'Old_with_family']
for frame in (X_train, X_val):
    frame.drop(columns=dropped_features, inplace=True)

# Modeling

In [None]:
# Linear discriminant analysis with default settings
ld_clf = LinearDiscriminantAnalysis().fit(X_train, y_train)

ld_train_acc = ld_clf.score(X_train, y_train)
ld_val_acc = ld_clf.score(X_val, y_val)
print("TRAIN Accuracy linear discriminant:", ld_train_acc)
print("VALIDATION Accuracy linear discriminant:", ld_val_acc)

In [None]:
# Random forest: 3-fold grid search over size, depth and split/leaf limits
model = RandomForestClassifier(n_jobs=-1, random_state=111)
params = dict(
    n_estimators=[300, 500, 700],
    max_depth=[5, 10, 15],
    min_samples_split=[32, 64],
    min_samples_leaf=[10, 15],
)
rf_clf = GridSearchCV(estimator=model, param_grid=params, cv=3)
rf_clf.fit(X_train, y_train)

In [None]:
# Report the selected hyper-parameters and the accuracy on both splits
rf_train_acc = rf_clf.score(X_train, y_train)
rf_val_acc = rf_clf.score(X_val, y_val)
print("Best parameters:", rf_clf.best_params_)
print("TRAIN Accuracy random forest:", rf_train_acc)
print("VALIDATION Accuracy random forest:", rf_val_acc)

In [None]:
# K-nearest neighbours: 3-fold grid search over the number of neighbours
model = KNeighborsClassifier()
params = dict(n_neighbors=[11, 13, 15, 17])
knn_clf = GridSearchCV(estimator=model, param_grid=params, cv=3)
knn_clf.fit(X_train, y_train)

In [None]:
# Report the selected hyper-parameters and the accuracy on both splits
knn_train_acc = knn_clf.score(X_train, y_train)
knn_val_acc = knn_clf.score(X_val, y_val)
print("Best parameters:", knn_clf.best_params_)
print("TRAIN Accuracy KNN:", knn_train_acc)
print("VALIDATION Accuracy KNN:", knn_val_acc)

In [None]:
# XGBoost: 3-fold grid search over gamma, tree depth and number of trees
model = xgb.XGBClassifier(
    use_label_encoder=False,
    verbosity=0,
    eta=0.1,
    seed=111,
)
params = dict(
    gamma=[0.5, 1, 3],
    max_depth=[3, 5],
    n_estimators=[150, 200, 300],
)
xgb_clf = GridSearchCV(estimator=model, param_grid=params, cv=3)
xgb_clf.fit(X_train, y_train)

In [None]:
# Report the selected hyper-parameters and the accuracy on both splits
xgb_train_acc = xgb_clf.score(X_train, y_train)
xgb_val_acc = xgb_clf.score(X_val, y_val)
print("Best parameters:", xgb_clf.best_params_)
print("TRAIN Accuracy XGBoost:", xgb_train_acc)
print("VALIDATION Accuracy XGBoost:", xgb_val_acc)

In [None]:
# Multi-layer perceptron: 3-fold grid search over layer sizes and learning rate
model = MLPClassifier(random_state=111)
params = dict(
    hidden_layer_sizes=[(32, 32), (64, 64)],
    learning_rate_init=[0.001, 0.01, 0.1],
)
mlp_clf = GridSearchCV(estimator=model, param_grid=params, cv=3)
mlp_clf.fit(X_train, y_train)

In [None]:
# Report the selected hyper-parameters and the accuracy on both splits
mlp_train_acc = mlp_clf.score(X_train, y_train)
mlp_val_acc = mlp_clf.score(X_val, y_val)
print("Best parameters:", mlp_clf.best_params_)
print("TRAIN Accuracy MLP:", mlp_train_acc)
print("VALIDATION Accuracy MLP:", mlp_val_acc)

In [None]:
# CatBoost: 3-fold grid search over iterations, learning rate and depth
model = CatBoostClassifier(verbose=False, random_state=111)
params = dict(
    iterations=[100, 200, 300],
    eta=[1, 0.1, 0.01],
    depth=[3, 4, 6],
)
cb_clf = GridSearchCV(estimator=model, param_grid=params, cv=3)
cb_clf.fit(X_train, y_train)

In [None]:
# Report the selected hyper-parameters and the accuracy on both splits
cb_train_acc = cb_clf.score(X_train, y_train)
cb_val_acc = cb_clf.score(X_val, y_val)
print("Best parameters:", cb_clf.best_params_)
print("TRAIN Accuracy CatBoost:", cb_train_acc)
print("VALIDATION Accuracy CatBoost:", cb_val_acc)

In [None]:
# Rebuild each model with the hyper-parameters chosen by its grid search.
# best_params_ holds exactly the keys of the searched grid, so it can be
# unpacked straight into the constructor.
rf_clf2 = RandomForestClassifier(n_jobs=-1, random_state=111, **rf_clf.best_params_)
xgb_clf2 = xgb.XGBClassifier(
    use_label_encoder=False, verbosity=0, eta=0.1, seed=111,
    **xgb_clf.best_params_,
)
mlp_clf2 = MLPClassifier(random_state=111, **mlp_clf.best_params_)
cb_clf2 = CatBoostClassifier(verbose=False, random_state=111, **cb_clf.best_params_)

# Soft voting: combine the members through their predicted probabilities
ensemble_members = [
    ('rf', rf_clf2),
    ('xgb', xgb_clf2),
    ('mlp', mlp_clf2),
    ('cb', cb_clf2),
]
ensemble_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Train the voting ensemble on the balanced training split
ensemble_clf.fit(X_train, y_train)

In [None]:
# Accuracy of the ensemble on both splits
ensemble_train_acc = ensemble_clf.score(X_train, y_train)
ensemble_val_acc = ensemble_clf.score(X_val, y_val)
print("TRAIN Accuracy ensemble:", ensemble_train_acc)
print("VALIDATION Accuracy ensemble:", ensemble_val_acc)

# Training with all data

In [None]:
# Keep the test passenger ids for the submission file; transform_data drops this column
pas_id = test_data['PassengerId']

In [None]:
# Reload the complete labelled set: the final model is trained without a hold-out split
train_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')

# split labels
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')

# transform data: re-fit the preprocessing on all training rows, then reuse it for the test set
X_train = transform_data(X_train, fit=True)
test_data = transform_data(test_data, fit=False)

# remove outliers: keep the rows the isolation forest does not label -1
iso = IsolationForest(contamination=0.05, random_state=111)
iso.fit(X_train)
outliers = iso.predict(X_train)
mask = outliers != -1
X_train = X_train[mask]
y_train = y_train[mask]

# downsampling: shrink the non-survivors to the number of survivors
features_reset = X_train.reset_index(drop=True)
labels_reset = y_train.reset_index(drop=True)
train_data = pd.concat([features_reset, labels_reset], axis=1)
survived = train_data[train_data['Survived']==1]
no_survived = train_data[train_data['Survived']==0]
no_survived_downsampled = resample(
    no_survived,
    replace=False,
    n_samples=len(survived),
    random_state=111,
)

train_data = pd.concat([no_survived_downsampled, survived]).reset_index(drop=True)
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')

In [None]:
# Remove the same features from the full training set and the test set
dropped_features = ['SibSp', 'Parch', 'Ticket', 'Alone', 'Age_category', 'Fare_category', 'Old_with_family']
for frame in (X_train, test_data):
    frame.drop(columns=dropped_features, inplace=True)

In [None]:
# Fresh, unfitted copies of the ensemble members, configured with the
# hyper-parameters chosen by the earlier grid searches. best_params_ holds
# exactly the keys of the searched grid, so it can be unpacked directly.
rf_clf2 = RandomForestClassifier(n_jobs=-1, random_state=111, **rf_clf.best_params_)
xgb_clf2 = xgb.XGBClassifier(
    use_label_encoder=False, verbosity=0, eta=0.1, seed=111,
    **xgb_clf.best_params_,
)
mlp_clf2 = MLPClassifier(random_state=111, **mlp_clf.best_params_)
cb_clf2 = CatBoostClassifier(verbose=False, random_state=111, **cb_clf.best_params_)

# Soft voting: combine the members through their predicted probabilities
ensemble_members = [
    ('rf', rf_clf2),
    ('xgb', xgb_clf2),
    ('mlp', mlp_clf2),
    ('cb', cb_clf2),
]
ensemble_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Train the final ensemble on the balanced data built from the full training file
ensemble_clf.fit(X_train, y_train)

In [None]:
# Predict the class of every row of the transformed test set
pred = ensemble_clf.predict(test_data)

In [None]:
# Submission table: one row per test passenger with its predicted outcome
final_results = pd.DataFrame({'PassengerId': pas_id, 'Survived': pred})
final_results

In [None]:
# Write the submission file without the index column
final_results.to_csv('results.csv', index=False)