In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMRegressor
import optuna
import os
import sys
import time
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

# Suppress every warning for the remainder of the notebook.
warnings.filterwarnings(action = "ignore")

# Notebook-wide constants.
SEED = 2021  # passed as random_state to the split and to the models below
OUTPUT_PATH = './'
INPUT_PATH = '../input/tabular-playground-series-apr-2021/'  # prefix of the CSV files read below

# Loading datasets

In [None]:
# Read the three CSV files located under INPUT_PATH.
train_df, test_df, sample_submission = (
    pd.read_csv(INPUT_PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Detach the label column so that train_df only holds features from here on.
target_df = train_df.pop('Survived')

train_df

# Feature Engineering and data visualization

In this section we will create new features from the existing ones, fill in the missing data and encode the categorical features.

In [None]:
# Row count first, then the number of missing entries per column.
print('Length of the dataframe:', train_df.shape[0])
print('Missing values:')
train_df.isnull().sum()

In [None]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of `substrings` that occurs inside `big_string`.

    Candidates are tried in the order given, so earlier entries win when
    several of them match. `np.nan` is returned when none of them is found.
    """
    matches = (candidate for candidate in substrings if big_string.find(candidate) >= 0)
    return next(matches, np.nan)

def replace_titles(x):
    """Collapse a row's title into one of the common titles.

    Parameters
    ----------
    x : mapping or pandas row
        Must provide the keys 'Title' and 'Sex'.

    Returns
    -------
    'Mr', 'Mrs' or 'Miss' for the rare titles handled below; any other title
    (including a missing one) is returned unchanged.
    """
    title = x['Title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'):
        return 'Mr'
    if title in ('Countess', 'Mme'):
        return 'Mrs'
    if title in ('Mlle', 'Ms'):
        return 'Miss'
    if title == 'Dr':
        # Compare case-insensitively: the previous exact match against 'Male'
        # sent every row whose Sex is spelled 'male' to the 'Mrs' branch.
        return 'Mr' if str(x['Sex']).strip().lower() == 'male' else 'Mrs'
    return title

# Titles searched for inside each passenger name. 'Mrs' is listed before 'Mr'
# because the lookup returns the first entry found and 'Mrs' contains 'Mr'.
title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']

# The same derived columns are added to both frames.
for df in (train_df, test_df):
    # First letter and first run of digits found in the cabin / ticket strings.
    df['Cabin_class'] = df['Cabin'].str.extract(r'([a-zA-Z])')
    df['Cabin_number'] = df['Cabin'].str.extract(r'(\d+)')
    df['Ticket_prefix'] = df['Ticket'].str.extract(r'([a-zA-Z.]+)')
    df['Ticket_number'] = df['Ticket'].str.extract(r'(\d+)')
    df['NameLen'] = df['Name'].str.len() - 1
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = df['FamilySize'] == 1
    # Names are split on ', ': the part before it is stored as LastName, the part after as FirstName.
    df['FirstName'] = df['Name'].map(lambda x: x.split(', ')[1])
    df['LastName'] = df['Name'].map(lambda x: x.split(', ')[0])
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))
    df['Title'] = df.apply(replace_titles, axis=1)
    df['Age*Class'] = df['Age'] * df['Pclass']
    # NOTE(review): FamilySize already includes the passenger (+1 above), so this
    # divides the fare by FamilySize + 1 people - confirm that this is intended.
    df['FarePerPerson'] = df['Fare'] / (df['FamilySize'] + 1)

    # It seems like those with no cabin class had also no cabin number and ticket prefix.
    # The filled column is assigned back explicitly: calling fillna(inplace=True) on
    # df[col] is chained assignment and does not update df under pandas Copy-on-Write.
    df['Cabin_class'] = df['Cabin_class'].fillna('NoCabin')
    df['Cabin_number'] = df['Cabin_number'].fillna(0)
    df['Ticket_prefix'] = df['Ticket_prefix'].fillna('')

    # Those who had no title were just normal people
    df['Title'] = df['Title'].fillna('')

test_df

We will look at the relationship between `Cabin_class` and `FarePerPerson` to decide how we could replace the missing values of the cabin class.

In [None]:
# Distinct cabin classes, followed by fare per person for each class coloured by Pclass.
print(train_df['Cabin_class'].unique())
sns.scatterplot(
    x = 'Cabin_class',
    y = 'FarePerPerson',
    hue = 'Pclass',
    data = train_df,
)

Drop the features that we think are not needed

In [None]:
# Raw text columns and the row identifier are removed from both frames.
dropped_columns = ['Name', 'Ticket', 'Cabin', 'PassengerId']

# Keep the test ids before they are dropped; the submission cell uses them.
passenger_ids = test_df['PassengerId']

train_df = train_df.drop(columns = dropped_columns)
test_df = test_df.drop(columns = dropped_columns)
train_df

Some data preprocessing 

In [None]:
from sklearn.preprocessing import StandardScaler

# Column groups; each group is routed to a different transformer below.
numeric_columns = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Cabin_number', 'Ticket_number', 'NameLen', 'FamilySize', 'IsAlone',
                   'Age*Class', 'FarePerPerson']
ordinal_columns = ['Ticket_prefix', 'Cabin_class', 'Embarked', 'FirstName', 'LastName', 'Title']
one_hot_columns = ['Sex']

# Fill the gaps of the categorical columns with the most frequent TRAIN value.
# The fill values are collected first so each frame is copied once, not once per column.
most_frequent_values = {c: train_df[c].value_counts().index[0] for c in ordinal_columns}
train_df = train_df.fillna(value = most_frequent_values)
test_df = test_df.fillna(value = most_frequent_values)

# Categories that only appear in the test set are encoded as NaN and are
# therefore handled by the imputer of the pipeline below.
enc = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)

enc.fit(train_df[ordinal_columns])
train_df[ordinal_columns] = enc.transform(train_df[ordinal_columns])
test_df[ordinal_columns] = enc.transform(test_df[ordinal_columns])

preprocessing_pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers = [
            ('num', SimpleImputer(strategy = 'most_frequent'), numeric_columns + ordinal_columns),
            ('one_hot', OneHotEncoder(), one_hot_columns),
        ]
    )),
    ('std_scaler', StandardScaler())
])

# Fit on the training data only, then apply the fitted transformers to the test data.
train_df_preprocessed = preprocessing_pipeline.fit_transform(train_df)
test_df_preprocessed = preprocessing_pipeline.transform(test_df)

# Names of the output columns, in the order of the transformers listed above.
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); use whichever one the installed version provides.
one_hot_encoder = preprocessing_pipeline['preprocessor'].named_transformers_['one_hot']
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    one_hot_names = one_hot_encoder.get_feature_names_out(one_hot_columns)
else:
    one_hot_names = one_hot_encoder.get_feature_names(one_hot_columns)

columns_new = numeric_columns + ordinal_columns + list(one_hot_names)
columns_new

In [None]:
# Wrap both transformed matrices in DataFrames labelled with the post-transform column names.
train_df_preprocessed, test_df_preprocessed = (
    pd.DataFrame(matrix, columns = columns_new)
    for matrix in (train_df_preprocessed, test_df_preprocessed)
)
train_df_preprocessed

Stratified train test split seems to be giving better results

In [None]:
# 80% of the rows go to training and the rest to validation, stratified on the target.
split_options = dict(train_size = 0.8, stratify = target_df, random_state = SEED)
X_train, X_valid, y_train, y_valid = train_test_split(train_df_preprocessed, target_df, **split_options)

# Check features performance with Boruta

We will see which features contribute the most to our model and choose if we want to remove some of them based on their performance

In [None]:
from boruta import BorutaPy

# Base estimator handed to BorutaPy.
boruta_estimator = XGBRegressor(
    tree_method = 'gpu_hist',
    sampling_method = 'gradient_based',
    n_estimators = 200,
    learning_rate = 0.05,
    n_jobs = 4,
)

feat_selector = BorutaPy(boruta_estimator, n_estimators = 'auto', verbose = 2, random_state = 1)

# The selector is fitted on the underlying arrays of the training split.
feat_selector.fit(X_train.values, y_train.values)

# One (name, ranking, support flag) triple per feature.
feature_names = columns_new
feature_ranks = [
    (name, rank, supported)
    for name, rank, supported in zip(feature_names, feat_selector.ranking_, feat_selector.support_)
]
feature_ranks

We will remove the features that Boruta marks as not needed

In [None]:
# Every feature whose Boruta support flag is False gets dropped.
cols_to_remove = [name for name, _rank, supported in feature_ranks if not supported]

print('Dropping the following features:', cols_to_remove)

# Remove the same columns from the training, validation and test matrices.
X_train_filtered = X_train.drop(columns = cols_to_remove)
X_valid_filtered = X_valid.drop(columns = cols_to_remove)
test_df_filtered = test_df_preprocessed.drop(columns = cols_to_remove)

# Search for the best hyperparameters with Optuna

We will look for the best XGBoost hyperparameters with a hyperparameter optimizer

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# The search is slow, so it is switched off by default; while it is off,
# best_params stays None and the training cell uses its hard-coded fallback.
RUN_HYPERPARAMETER_SEARCH = False
N_OPTUNA_TRIALS = 3000

best_params = None


def objective(trial, X_train = X_train_filtered, X_valid = X_valid_filtered, y_train = y_train, y_valid = y_valid):
    """Optuna objective: validation accuracy of an XGBRegressor built from sampled hyperparameters.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    X_train, y_train : data the model is fitted on.
    X_valid, y_valid : data used for early stopping and for the returned score.

    Returns
    -------
    Accuracy of the rounded validation predictions.
    """
    obj_params = {'random_state': SEED,
                  'tree_method': 'gpu_hist',
                  'verbosity': 0,
                  'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
                  'alpha': trial.suggest_float('alpha', 0, 10),
                  'gamma': trial.suggest_float('gamma', 0, 10),
                  'lambda': trial.suggest_float('lambda', 1, 10),
                  'min_child_weight': trial.suggest_float('min_child_weight', 0, 10),
                  'max_delta_step': trial.suggest_float('max_delta_step', 0, 10),
                  'max_depth': trial.suggest_int('max_depth', 2, 12),
                  'subsample': trial.suggest_float('subsample', 0.01, 1),
                  # XGBoost spells these without an underscore after "by"; the previous
                  # 'colsample_by_*' spellings are not XGBoost parameter names.
                  'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1),
                  'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 1),
                  'colsample_bynode': trial.suggest_float('colsample_bynode', 0.01, 1),
                  'sampling_method': trial.suggest_categorical('sampling_method', ['uniform', 'gradient_based']),
                  'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.08),
                  }

    fit_params = {
        'verbose': False,
        'early_stopping_rounds': 5,
        'eval_set': [(X_valid, y_valid)],
    }

    obj_model = XGBRegressor(**obj_params)
    obj_model.fit(X_train, y_train, **fit_params)

    # Score on the X_valid argument (it used to read the global X_valid_filtered,
    # which ignored whatever validation set the caller passed in).
    obj_preds = list(np.round(np.array(obj_model.predict(X_valid)), 0))

    return accuracy_score(y_valid, obj_preds)


if RUN_HYPERPARAMETER_SEARCH:
    start_time = time.time()
    study = optuna.create_study(study_name = "optimization", direction = 'maximize')
    study.optimize(objective, n_trials = N_OPTUNA_TRIALS)
    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
    print('Best score:', study.best_trial.value)

    best_params = study.best_trial.params

# Train the model and submit

In [None]:
start_time = time.time()

# Predictions at or above this value are labelled 1, the others 0.
CLASS_THRESHOLD = 0.5

if best_params is None:
    # Fallback hyperparameters used when no search result is available.
    # The entries 'colsample_by_tree', 'colsample_by_level' and 'colsample_by_node'
    # that used to be listed here were removed: XGBoost spells these parameters
    # colsample_bytree / colsample_bylevel / colsample_bynode, so the misspelled
    # keys were not recognised as column-subsampling settings.
    # NOTE(review): re-tune with the correct names before adding them back.
    best_params = {'n_estimators': 683, 'alpha': 8.422354464381405, 'gamma': 0.2564422701451709, 'lambda': 5.332526708685053, 'min_child_weight': 3.748293558015677, 'max_delta_step': 1.3052043989118083, 'max_depth': 7, 'subsample': 0.5079819093322351, 'sampling_method': 'uniform', 'learning_rate': 0.028783844683866237}

model_pipeline = Pipeline([
    ('model', XGBRegressor(
        random_state = SEED,
        n_jobs = 4,
        verbosity = 0,
        tree_method = 'gpu_hist',
        **best_params
    ))
])

# Fit with early stopping monitored on the validation split.
model_pipeline.fit(X_train_filtered, y_train, model__early_stopping_rounds = 5, model__eval_set = [(X_valid_filtered, y_valid)], model__verbose = False)

# The regressor returns continuous values; validation and test predictions are
# turned into labels with the same threshold so that the reported score matches
# the rule used for the submission (np.round was used for validation before,
# which differs from the >= 0.5 rule at exactly 0.5 and outside [0, 1]).
valid_labels = (model_pipeline.predict(X_valid_filtered) >= CLASS_THRESHOLD).astype(int)
score = accuracy_score(y_valid, valid_labels)
print('Elapsed time: {} seconds:'.format(time.time() - start_time))
print('Score:', score)

y_test = (model_pipeline.predict(test_df_filtered) >= CLASS_THRESHOLD).astype(int)

Logistic regression does not seem to perform as well as XGBoost

In [None]:
# The logistic-regression experiment below is wrapped in a string literal, so none
# of it is executed. If it were enabled it would fit LogisticRegression on the
# filtered training data, print a ROC AUC computed from its validation
# predictions, and overwrite y_test with its test predictions.
"""
from sklearn.linear_model import LogisticRegression

start_time = time.time()
# Instantiate our model
logReg = LogisticRegression()
# Fit our model to the training data
logReg.fit(X_train_filtered, y_train)
# Predict on the test data
score = roc_auc_score(y_valid, logReg.predict(X_valid_filtered))
print('Elapsed time: {} seconds:'.format(time.time() - start_time))
print('Score:', score)
y_test = logReg.predict(test_df_filtered)
"""

In [None]:
# Pair every test passenger id with its predicted label and write the table to a CSV file.
submission = pd.DataFrame(
    data = {
        'PassengerId': passenger_ids,
        'Survived': y_test,
    }
)
submission.to_csv('submission.csv', index = False)
submission