# Importing modules and libraries

In [None]:
# to read and manipulate data
import pandas as pd
import numpy as np

# to add a general preprocessing step to the data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# to prepare the model
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

# to model the data
from sklearn.metrics import accuracy_score, log_loss, roc_curve, fbeta_score
from sklearn.linear_model import LogisticRegression

# for hyperparameter optimization
import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances

# to turn warnings off and let pandas display every column of a dataframe
import warnings
warnings.filterwarnings("ignore")  # NOTE: ignores all warnings, deprecation warnings from the libraries included
pd.options.display.max_columns = None

# Defining utility functions

In [None]:
# function to create a model instance given the hyperparameters
def instantiate_model(hyperparameters):
    """Build an unfitted logistic regression.

    hyperparameters: dictionary unpacked as the keyword arguments of
    LogisticRegression.

    Returns the new estimator, not yet fitted.
    """
    return LogisticRegression(**hyperparameters)



# function to fit an instantiated model
def fit_model(model, X, y):
    """Fit `model` on the inputs `X` and targets `y` and hand the same object back.

    model: any object exposing a `fit(X, y)` method (an estimator or a pipeline).
    Returns the object received in `model`, after `fit` was called on it.
    """
    # fitting the model to the data
    model.fit(X, y)
    # returning the trained model
    return model


# function to extract the best probability threshold according to the roc curve
def extract_roc_threshold(y_obs, probs):
    """Pick the probability threshold with the best trade-off on the ROC curve.

    y_obs: observed binary labels.
    probs: predicted scores / probabilities of the positive class.

    Returns the threshold from `roc_curve` whose geometric mean of the true
    positive rate and (1 - false positive rate) is the highest.
    """
    # one (fpr, tpr) pair per candidate threshold
    false_pos_rate, true_pos_rate, candidate_thresholds = roc_curve(y_true = y_obs, y_score = probs)
    # geometric mean of sensitivity and specificity for each candidate
    geometric_means = np.sqrt(true_pos_rate * (1 - false_pos_rate))
    # position of the candidate with the highest geometric mean
    best_position = np.argmax(geometric_means)
    return candidate_thresholds[best_position]


# function to encode categories for a selected probability threshold
def move_threshold(probabilities, threshold):
    """Turn probabilities into 0/1 labels: 1 where the probability is >= `threshold`."""
    # boolean mask of the observations at or above the cut-off
    at_or_above = probabilities >= threshold
    # encoding the mask as integers
    return at_or_above.astype('int')


# function to extract the best probability threshold according to the aucpr curve
def extract_fbeta_threshold(y_obs, probs, beta_value):
    """Pick the probability threshold that maximizes the F-beta score.

    y_obs: observed binary labels.
    probs: predicted probabilities of the positive class.
    beta_value: the beta of the F-beta score; anything `float()` accepts
        (the search space passes it as a string such as '0.5').

    Returns the best threshold among 0.00, 0.01, ..., 0.99; ties go to the
    lowest threshold because `np.argmax` returns the first maximum.
    """
    # converting once with the builtin: the `np.float` alias no longer exists in
    # recent NumPy releases and would raise an AttributeError here
    beta = float(beta_value)
    # defining an array of candidate thresholds
    thresholds = np.arange(0, 1, 0.01)
    # calculating the fbeta score obtained with each threshold
    f_scores = [fbeta_score(y_true = y_obs, y_pred = move_threshold(probs, threshold), beta = beta) for threshold in thresholds]
    # identifying the element with the highest fbeta score - best trade-off between precision and recall
    best_threshold = np.argmax(f_scores)
    # returning the probability threshold that maximizes the fbeta score
    return thresholds[best_threshold]


# function to evaluate a trained model
def score_model(model, X_eval, y_eval, scoring):
    """Evaluate a fitted model on one dataset.

    model: fitted object exposing `predict_proba` and `predict`.
    X_eval, y_eval: inputs and observed labels to score against.
    scoring: how the class labels are derived from the probabilities -
        'traditional' uses `model.predict` (threshold reported as 0.5),
        'auc' uses the threshold chosen by `extract_roc_threshold`,
        anything else is passed as the beta of `extract_fbeta_threshold`.

    Returns a tuple (accuracy, log loss, selected threshold). The thresholds of
    the 'auc' and F-beta modes are chosen on `y_eval` itself.
    """
    # probability of the positive class for every observation
    positive_probabilities = model.predict_proba(X_eval)[:, 1]

    if scoring == 'traditional':
        # labels straight from the model
        selected_threshold = 0.5
        predicted_classes = model.predict(X_eval)
    else:
        # both remaining modes first pick a threshold, then cut the probabilities with it
        if scoring == 'auc':
            selected_threshold = extract_roc_threshold(y_obs = y_eval, probs = positive_probabilities)
        else:
            selected_threshold = extract_fbeta_threshold(y_obs = y_eval, probs = positive_probabilities, beta_value = scoring)
        predicted_classes = move_threshold(probabilities = positive_probabilities, threshold = selected_threshold)

    # accuracy of the labels and log loss of the probabilities
    acc = accuracy_score(y_true = y_eval, y_pred = predicted_classes)
    ll = log_loss(y_true = y_eval, y_pred = positive_probabilities)

    # returning both scores and the threshold that was used
    return acc, ll, selected_threshold


# objective function for optuna to evaluate
def objective(trial):
    """Objective maximized by optuna: mean validation accuracy across the folds.

    trial: the optuna trial used to sample the configuration.

    Reads the module-level `X`, `y` and `kfolds`. Returns the mean of the
    'val_accuracy' column produced by `evaluate_stratifield_kfold`.
    """
    # model hyperparameters
    model_params = {'C': trial.suggest_float('C', 0.001, 1.0),
                    'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])}
    # imputation strategies
    imputation_params = {'fare_imputation': trial.suggest_categorical('fare_imputation', ['mean', 'median']),
                         'age_imputation': trial.suggest_categorical('age_imputation', ['mean', 'median'])}
    # optional feature transformations
    transform_params = {'log_fare': trial.suggest_categorical('log_fare', ['yes', 'no']),
                        'ohe': trial.suggest_categorical('ohe', ['yes', 'no']),
                        'interactions': trial.suggest_categorical('interactions', ['yes', 'no'])}
    # how labels are derived from probabilities and which pseudo-labels (if any) are added
    scoring_choice = trial.suggest_categorical('scoring', ['traditional', 'auc', '0.5', '1.0', '1.5'])
    pseudo_label_choice = trial.suggest_categorical('pseudo_label', [None, 'lr', 'tabnet'])

    # nested layout expected by the evaluation function
    search_space = {'hyperparams': model_params,
                    'imputation': imputation_params,
                    'transform': transform_params,
                    'scoring': scoring_choice,
                    'pseudo_label': pseudo_label_choice}

    # cross-validating this configuration
    fold_results = evaluate_stratifield_kfold(X, y, kfolds = kfolds, hyperpars = search_space)

    # average validation accuracy is the value to maximize
    return fold_results['val_accuracy'].mean()


# function to create a model pipeline
def create_pipeline(hyperpars):
    """Assemble the preprocessing steps and the logistic regression into one pipeline.

    hyperpars: nested dictionary with
        hyperpars['imputation']['fare_imputation'] - SimpleImputer strategy for Fare,
        hyperpars['imputation']['age_imputation'] - SimpleImputer strategy for Age, or 'knn',
        hyperpars['transform']['log_fare' | 'ohe' | 'interactions'] - 'yes' or 'no',
        hyperpars['hyperparams'] - keyword arguments of the model.

    Returns the unfitted sklearn Pipeline.

    Relies on the module-level `num_columns_indexes` and `cat_columns_indexes` and on
    the input dataframe holding its numerical columns first (Age and Fare among them).

    Every ColumnTransformer outputs its transformed columns first and the passthrough
    remainder after them, so each step must use the positions produced by the previous
    step. The steps are therefore ordered imputation -> log of the fare -> scaling ->
    interactions -> encoding: the encoder moves the categorical columns to the front,
    and interactions built after it with `num_columns_indexes` would combine the wrong
    columns; likewise the log must see the raw fare, not its standardized version.
    """
    imputation_options = hyperpars['imputation']
    transform_options = hyperpars['transform']

    # imputing the fare with the defined strategy
    fare_imputer = SimpleImputer(missing_values = np.nan, strategy = imputation_options['fare_imputation'], copy = False)
    # imputing the age with knn if this is the case, otherwise with the defined strategy
    if imputation_options['age_imputation'] == 'knn':
        # imported here because the file-level imports do not bring KNNImputer into scope
        from sklearn.impute import KNNImputer
        age_imputer = KNNImputer(missing_values = np.nan, copy = False)
    else:
        age_imputer = SimpleImputer(missing_values = np.nan, strategy = imputation_options['age_imputation'], copy = False)
    # output order: Fare, Age, then every other column in its original order
    imputation = ColumnTransformer([('imputation_fare', fare_imputer, ['Fare']),
                                    ('imputation_age', age_imputer, ['Age'])],
                                   remainder = 'passthrough')

    # defining whether we are transforming the fare variable (position 0 after the imputation);
    # the identity transformer keeps the column layout the same in both cases
    fare_transformer = FunctionTransformer(np.log1p) if transform_options['log_fare'] == 'yes' else FunctionTransformer()
    transformation = ColumnTransformer([('log_fare', fare_transformer, [0])], remainder = 'passthrough')

    # instantiating a standard scaler for the numerical columns
    scaler = ColumnTransformer([('scaler', StandardScaler(), num_columns_indexes)], remainder = 'passthrough')

    # defining whether we are considering interactions between the numerical columns or not
    n_numerical = len(num_columns_indexes)
    if transform_options['interactions'] == 'yes':
        interactions = PolynomialFeatures(degree = 2, include_bias = False, interaction_only = True)
        # the original columns plus one column per pair of columns
        n_numerical_out = n_numerical + n_numerical * (n_numerical - 1) // 2
    else:
        interactions = FunctionTransformer()
        n_numerical_out = n_numerical
    poly = ColumnTransformer([('poly', interactions, num_columns_indexes)], remainder = 'passthrough')

    # the interaction columns push the categorical columns to the right
    shifted_cat_indexes = [index + (n_numerical_out - n_numerical) for index in cat_columns_indexes]

    # defining whether we are one hot encoding the categorical variables or not
    if transform_options['ohe'] == 'yes':
        encoder = OneHotEncoder(handle_unknown = 'error', drop = 'first')
    else:
        encoder = FunctionTransformer()
    ohe = ColumnTransformer([('ohe', encoder, shifted_cat_indexes)], remainder = 'passthrough')

    # instantiating the model
    model = instantiate_model(hyperparameters = hyperpars['hyperparams'])

    # instantiating the pipeline
    pipeline = Pipeline(steps = [('imputation', imputation), ('log_fare', transformation), ('scaler', scaler),
                                 ('poly', poly), ('encoding', ohe), ('model', model)])

    # returning the pipeline
    return pipeline



# function to run the stratified kfold with the selected hyperparameters
def evaluate_stratifield_kfold(X, y, kfolds, hyperpars):
    """Cross-validate the pipeline described by `hyperpars` with a stratified k-fold.

    X: dataframe with the inputs (rows are selected with `.iloc`).
    y: array with the targets, indexed by position.
    kfolds: number of folds.
    hyperpars: nested dictionary accepted by `create_pipeline`, plus
        hyperpars['scoring'] (see `score_model`) and hyperpars['pseudo_label']
        (None, 'lr' or 'tabnet').

    Returns a dataframe with one row per fold and the columns 'train_accuracy',
    'val_accuracy', 'train_loss' and 'val_loss'.

    When pseudo-labels are requested, 40% of the module-level `X_test` is sampled once
    (with the global NumPy random state) and appended to the training part of every
    fold, labelled with the module-level `y_lr` or `y_tabnet`.

    Raises ValueError for an unknown `pseudo_label` option.
    """
    print(f'\n------------- Initializing the Stratified {kfolds}-fold evaluation -------------')
    # creating an empty list to store the accuracy and log loss of each fold
    fold_scores = []

    # printing the hyperparameters under evaluation
    print('Evaluating the following hyperparameters:')
    print(hyperpars)

    # instantiating the stratified k-fold
    skf = StratifiedKFold(n_splits = kfolds, random_state = 42, shuffle = True)

    # deciding whether past submissions should be used and which ones
    pseudo_label = hyperpars['pseudo_label']
    if pseudo_label is not None:
        if pseudo_label == 'lr': # logistic regression submission labels
            pseudo_targets = y_lr
        elif pseudo_label == 'tabnet': # tabnet submission labels
            pseudo_targets = y_tabnet
        else:
            raise ValueError(f"Unknown pseudo_label option: {pseudo_label!r}")
        # sampling the positions of the test dataframe that will be used
        # (builtin int: the `np.int` alias no longer exists in recent NumPy releases)
        n_leak = int(X_test.shape[0] * 0.4)
        test_idx_leak = np.random.choice(a = range(X_test.shape[0]), size = n_leak, replace = False)
        # selecting the sampled rows and their pseudo-labels
        X_leak, y_leak = X_test.iloc[test_idx_leak], pseudo_targets[test_idx_leak]

    # unpacking the stratified k-fold
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        # separating the train from the validation fold
        X_train_fold, X_val_fold, y_train_fold, y_val_fold = X.iloc[train_idx], X.iloc[test_idx], y[train_idx], y[test_idx]

        # merging the pseudo-labelled test rows into the train fold if this approach should be used
        if pseudo_label is not None:
            # merging and replacing the fold objects
            X_train_fold, y_train_fold = pd.concat([X_train_fold, X_leak]), np.hstack((y_train_fold, y_leak))
            # shuffling the sequence of observations, x and y with the same permutation
            shuffled_indices = np.random.permutation(X_train_fold.shape[0])
            X_train_fold, y_train_fold = X_train_fold.iloc[shuffled_indices], y_train_fold[shuffled_indices]

        # a fresh pipeline for every fold
        model = create_pipeline(hyperpars = hyperpars)

        # fitting the model to the data
        model = fit_model(model = model, X = X_train_fold, y = y_train_fold)

        # extracting the train score
        train_acc, train_loss, _ = score_model(model = model, X_eval = X_train_fold, y_eval = y_train_fold, scoring = hyperpars['scoring'])
        # extracting the validation score
        val_acc, val_loss, _ = score_model(model = model, X_eval = X_val_fold, y_eval = y_val_fold, scoring = hyperpars['scoring'])

        # appending accuracy and log loss to the list
        fold_scores.append((train_acc, val_acc, train_loss, val_loss))

        # printing the results of this fold
        print(f"> Fold {fold + 1}: Training loss: {train_loss} | Training accuracy: {train_acc} | Validation loss: {val_loss} | Validation accuracy: {val_acc}")

    # structuring the training and validation scores in a dataframe
    scores = pd.DataFrame(fold_scores, columns=['train_accuracy', 'val_accuracy', 'train_loss', 'val_loss'])

    # printing average scores for this run
    print(f"End of Run!\nTraining loss: {scores['train_loss'].mean()} | Training accuracy: {scores['train_accuracy'].mean()} | Validation loss: {scores['val_loss'].mean()} | Validation accuracy: {scores['val_accuracy'].mean()}")

    # returning the scores
    return scores


# function to create the pipeline that will be used to make the submission
def final_pipeline(hyperpars):
    """Create the pipeline used for the submission from a flat dictionary of hyperparameters.

    hyperpars: flat dictionary (the layout of an optuna trial's `params`) with the keys
        'C', 'class_weight', 'fare_imputation', 'age_imputation', 'log_fare', 'ohe'
        and 'interactions'; other keys are ignored here.

    Returns the unfitted sklearn Pipeline.

    The pipeline is built by `create_pipeline`, the function used while searching the
    hyperparameters, so the model that is submitted can never drift from the one that
    was evaluated.
    """
    # regrouping the flat dictionary into the nested layout expected by create_pipeline
    nested_hyperpars = {
        'hyperparams': {'C': hyperpars['C'],
                        'class_weight': hyperpars['class_weight']},
        'imputation': {'fare_imputation': hyperpars['fare_imputation'],
                       'age_imputation': hyperpars['age_imputation']},
        'transform': {'log_fare': hyperpars['log_fare'],
                      'ohe': hyperpars['ohe'],
                      'interactions': hyperpars['interactions']},
    }

    # returning the pipeline
    return create_pipeline(hyperpars = nested_hyperpars)


# function to run the model with the best hyperparameters and submit it
def submit_stratifield_kfold(X, y, kfolds, hyperpars):
    """Refit the pipeline on every fold with the chosen hyperparameters and predict the test set.

    X: dataframe with the inputs (rows are selected with `.iloc`).
    y: array with the targets, indexed by position.
    kfolds: number of folds.
    hyperpars: flat dictionary accepted by `final_pipeline`, plus 'scoring'
        (see `score_model`) and 'pseudo_label' (None, 'lr' or 'tabnet').

    Returns a tuple (scores, probas, threshold_list, fitted_models):
        scores - dataframe with one row per fold and the columns 'train_accuracy',
            'val_accuracy', 'train_loss' and 'val_loss';
        probas - SUM over the folds of the positive-class probabilities predicted for
            the module-level `X_test` (divide by `kfolds` to average);
        threshold_list - threshold selected on the validation part of each fold;
        fitted_models - dictionary 'model_<fold>' -> fitted pipeline.

    When pseudo-labels are requested, 40% of `X_test` is sampled once (with the global
    NumPy random state) and appended to the training part of every fold, labelled with
    the module-level `y_lr` or `y_tabnet`.

    Raises ValueError for an unknown `pseudo_label` option.
    """
    print(f'\n------------- Initializing the Stratified {kfolds}-fold evaluation -------------')
    # creating an empty list to store the accuracy and log loss of each fold
    fold_scores = []

    # creating an empty dictionary to store the models
    fitted_models = {}

    # creating an empty list to store the thresholds
    threshold_list = []

    # creating a numpy array to accumulate the test probabilities
    probas = np.zeros(len(X_test))

    # printing the hyperparameters under evaluation
    print('Evaluating the following hyperparameters:')
    print(hyperpars)

    # instantiating the stratified k-fold
    skf = StratifiedKFold(n_splits = kfolds, random_state = 42, shuffle = True)

    # deciding whether past submissions should be used and which ones
    pseudo_label = hyperpars['pseudo_label']
    if pseudo_label is not None:
        if pseudo_label == 'lr': # logistic regression submission labels
            pseudo_targets = y_lr
        elif pseudo_label == 'tabnet': # tabnet submission labels
            pseudo_targets = y_tabnet
        else:
            raise ValueError(f"Unknown pseudo_label option: {pseudo_label!r}")
        # sampling the positions of the test dataframe that will be used
        # (builtin int: the `np.int` alias no longer exists in recent NumPy releases)
        n_leak = int(X_test.shape[0] * 0.4)
        test_idx_leak = np.random.choice(a = range(X_test.shape[0]), size = n_leak, replace = False)
        # selecting the sampled rows and their pseudo-labels
        X_leak, y_leak = X_test.iloc[test_idx_leak], pseudo_targets[test_idx_leak]

    # unpacking the stratified k-fold
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        # separating the train from the validation fold
        X_train_fold, X_val_fold, y_train_fold, y_val_fold = X.iloc[train_idx], X.iloc[test_idx], y[train_idx], y[test_idx]

        # merging the pseudo-labelled test rows into the train fold if this approach should be used
        if pseudo_label is not None:
            # merging and replacing the fold objects
            X_train_fold, y_train_fold = pd.concat([X_train_fold, X_leak]), np.hstack((y_train_fold, y_leak))
            # shuffling the sequence of observations, x and y with the same permutation
            shuffled_indices = np.random.permutation(X_train_fold.shape[0])
            X_train_fold, y_train_fold = X_train_fold.iloc[shuffled_indices], y_train_fold[shuffled_indices]

        # a fresh pipeline for every fold
        model = final_pipeline(hyperpars = hyperpars)

        # fitting the model to the data
        model = fit_model(model = model, X = X_train_fold, y = y_train_fold)

        # extracting the train score
        train_acc, train_loss, _ = score_model(model = model, X_eval = X_train_fold, y_eval = y_train_fold, scoring = hyperpars['scoring'])
        # extracting the validation score and the threshold selected on the validation fold
        val_acc, val_loss, val_threshold = score_model(model = model, X_eval = X_val_fold, y_eval = y_val_fold, scoring = hyperpars['scoring'])

        # appending accuracy and log loss to the list
        fold_scores.append((train_acc, val_acc, train_loss, val_loss))

        # appending the threshold to the list
        threshold_list.append(val_threshold)

        # printing the results of this fold
        print(f"> Fold {fold + 1}: Training loss: {train_loss} | Training accuracy: {train_acc} | Validation loss: {val_loss} | Validation accuracy: {val_acc}")

        # accumulating the test set probabilities predicted by the model of this fold
        probas += model.predict_proba(X_test)[:, 1]

        # saving the fitted model
        fitted_models[f'model_{fold}'] = model

    # structuring the training and validation scores in a dataframe
    scores = pd.DataFrame(fold_scores, columns=['train_accuracy', 'val_accuracy', 'train_loss', 'val_loss'])

    # printing average scores for this run
    print(f"End of Run!\nTraining loss: {scores['train_loss'].mean()} | Training accuracy: {scores['train_accuracy'].mean()} | Validation loss: {scores['val_loss'].mean()} | Validation accuracy: {scores['val_accuracy'].mean()}")

    # returning the scores, the summed probabilities, the thresholds and the models
    return scores, probas, threshold_list, fitted_models

# Importing the data

In [None]:
# reading the training data and showing its dimensions
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
train.shape

In [None]:
# reading the test data and showing its dimensions
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
test.shape

In [None]:
# reading the sample submission and showing its dimensions
submission = pd.read_csv('../input/tabular-playground-series-apr-2021/sample_submission.csv')
submission.shape

In [None]:
# reading the submission of a previous logistic regression and showing its dimensions
lr_submission = pd.read_csv('../input/lr-submission-tps202104/lr_submission.csv')
lr_submission.shape

In [None]:
# reading the submission of a previous tabnet model and showing its dimensions
tabnet_submission = pd.read_csv('../input/tabnetsubmissiontps202104/tabnet_submission-tps202104.csv')
tabnet_submission.shape

# Merging the datasets

In [None]:
# putting the train on top of the test set in order to create some general features
# ignore_index gives every row a unique label: both files come with their own 0..n-1
# index, and the duplicated labels would make any label-based alignment ambiguous
df = pd.concat([train, test], ignore_index = True)
df.shape

# Feature Engineering

In [None]:
## adding number of missing values in the rows
# the target is left out of the count: it is missing for every test row, so counting
# it would make the feature systematically one unit higher in the test set
df['nMissing'] = df.drop(columns = 'Survived').isnull().sum(axis = 1)

# encoding the deck to which the cabin belongs (first capital letter of the cabin)
df['Deck'] = df.Cabin.str.extract(pat = r'(^[A-Z])')
# filling missing value for deck
df['Deck'] = df.Deck.fillna('U')

# encoding whether the passenger travels alone
df['TravelsAlone'] = ((df.SibSp == 0) & (df.Parch == 0)).astype('int')

# encoding whether the passenger travels accompanied by exactly one relative
df['TravelsTwo'] = (((df.SibSp == 1) & (df.Parch == 0)) | ((df.SibSp == 0) & (df.Parch == 1))).astype('int')

# calculating the size of the family
df['FamilySize'] = df.SibSp + df.Parch + 1

# creating flags for the other columns that will be imputed: 1 when the value is missing
df['AgeImputed'] = df.Age.isnull().astype('int')
df['FareImputed'] = df.Fare.isnull().astype('int')
df['EmbarkedImputed'] = df.Embarked.isnull().astype('int')

# parsing Pclass to category
df['Pclass'] = df.Pclass.astype('object')

In [None]:
# deviation of the log fare of each passenger from the average log fare of its class
## log of the fare
df['FareClass'] = np.log(df['Fare'])

## average log fare of the class of each passenger
df['AvgFare'] = df['Pclass'].map(df.groupby('Pclass')['FareClass'].mean())

## deviation from the class average
df['FareClass'] -= df['AvgFare']

## flag: 1 when the passenger paid more than the class average
df['FareClassFlag'] = df['FareClass'].gt(0).astype('int')

Encoding ticket information.

In [None]:
# fixing a possible typo on the ticket column
df['Ticket'] = df['Ticket'].str.replace(pat=r'\bSTON\b', repl='SOTON', regex=True)

# building the letter code of each ticket in one chain:
# groups of letters -> '' when the ticket is missing -> groups joined by a space
# -> 'N' when there are no letters at all -> upper case
df['TicketLetters'] = (
    df['Ticket'].str.findall(pat = r'([A-Za-z]+)')
    .fillna('')
    .apply(lambda groups: ' '.join(groups))
    .apply(lambda code: code if code != '' else 'N')
    .str.upper()
)

# merging some codes that seem to be the same
same_ticket_codes = {'C A': 'CA', 'W E P': 'WE P', 'SOTON O Q': 'SOTON OQ', 'S W PP': 'SW PP',
                     'S O C': 'SO C', 'C A SOTON': 'SOTON CA', 'S C PARIS': 'SC PARIS', 'P PP': 'PP',
                     'S C A': 'SC A', 'S O P P': 'S O P'}
df['TicketLetters'] = df['TicketLetters'].replace(to_replace = same_ticket_codes)

In [None]:
# extracting the trailing number of the ticket, 0 when there is none
df['TicketNumber'] = df['Ticket'].str.extract(pat=r'(?<=\s)?([0-9]+)$').astype('float').fillna(0)

# number of digits of the ticket number
df['TicketDigits'] = df['TicketNumber'].astype('int').astype('str').str.count(pat='[0-9]')

# binning the ticket number: truncated to a multiple of (number of digits * 1000)
df['TicketNumber'] = (df['TicketNumber'] / (df['TicketDigits'] * 1000)).astype('int') * (df['TicketDigits'] * 1000)

# how many tickets fall in each bin
frequencies_tnb = df['TicketNumber'].value_counts(normalize=False)

# frequency of the bin of each ticket
mask_TcktNmbFreq = df['TicketNumber'].map(frequencies_tnb)

# grouping the bins with fewer than 10 tickets under the placeholder 999999
df['TicketNumber'] = df['TicketNumber'].mask(mask_TcktNmbFreq < 10, 999999)

Encoding cabin data.

In [None]:
# extracting the first number of the cabin, 0 when the cabin is missing
df['CabinNb'] = df['Cabin'].fillna('0').str.extract(pat=r'([0-9]+)').astype('int')

# number of digits of the cabin number
df['CabinDigits'] = df['CabinNb'].astype('str').str.count(pat='[0-9]').astype('int')

# binning the cabin number: truncated to a multiple of (number of digits * 10)
df['CabinNb'] = (df['CabinNb'] / (df['CabinDigits'] * 10)).astype('int') * (df['CabinDigits'] * 10)

# how many cabins fall in each bin
frequencies_cbn = df['CabinNb'].value_counts(normalize=False)

# frequency of the bin of each cabin
mask_CabinNbFreq = df['CabinNb'].map(frequencies_cbn)

# grouping the bins with fewer than 10 cabins under the placeholder 99999
df['CabinNb'] = df['CabinNb'].mask(mask_CabinNbFreq < 10, 99999)

# Encoding categorical data

In [None]:
# missing ports of embarkation become their own category, 'U'
df['Embarked'] = df['Embarked'].fillna('U')

In [None]:
# columns whose values will be replaced by integer codes
columns_to_encode = ['Sex', 'Embarked', 'Deck', 'Pclass', 'TicketLetters']

# encoding each column with its own, freshly fitted, encoder
for column in columns_to_encode:
    le = LabelEncoder()
    df[column] = le.fit(df[column]).transform(df[column])

# Separating the two datasets, inputs and outputs

In [None]:
## removing the columns that will not be used as features
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# dropping them along the column axis
df = df.drop(labels = columns_to_drop, axis = 1)

In [None]:
## keeping only the selected features: numerical columns first, categorical ones next, target last
num_columns = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize', 'nMissing']
cat_columns = ['Pclass', 'Sex', 'Embarked', 'Deck', 'TravelsAlone', 'TravelsTwo', 'AgeImputed', 'FareImputed', 'EmbarkedImputed', 'FareClassFlag', 'TicketLetters', 'TicketDigits', 'CabinDigits']

# selecting and ordering the columns
df = df.loc[:, num_columns + cat_columns + ['Survived']]

In [None]:
# the training rows are the first train.shape[0] rows of the merged dataframe
train_df = df.iloc[:train.shape[0]]

# inputs of the training data
X_train = train_df.drop(columns='Survived')

# target of the training data, encoded as integers
y_train = LabelEncoder().fit_transform(train_df['Survived'])

# target predicted by the previous logistic regression submission, encoded as integers
y_lr = LabelEncoder().fit_transform(lr_submission['Survived'].values)

# target predicted by the previous tabnet submission, encoded as integers
y_tabnet = LabelEncoder().fit_transform(tabnet_submission['Survived'].values)

In [None]:
# the test rows are everything after the first train.shape[0] rows of the merged dataframe
test_df = df.iloc[train.shape[0]:]

# inputs of the test data (the target column is dropped)
X_test = test_df.drop(columns='Survived')

# Hyperparameter Optimization

In [None]:
# module-level variables read by the objective and by the pipeline builders
X, y = X_train, y_train
kfolds = 8

# positions of the categorical and of the numerical columns in the training inputs
cat_columns_indexes = [position for position, name in enumerate(X_train.columns) if name in cat_columns]
num_columns_indexes = [position for position, name in enumerate(X_train.columns) if name in num_columns]

In [None]:
# creating a study that maximizes the value returned by the objective
# NOTE(review): objective() never calls trial.report / trial.should_prune in this file,
# so the median pruner looks unused - confirm
study = optuna.create_study(directions = ['maximize'], pruner = optuna.pruners.MedianPruner())

In [None]:
# adding a baseline to the study
# the keys must be the names given to trial.suggest_* in the objective (a flat
# dictionary): values nested under other keys are never matched to a parameter and
# the sampler would pick those parameters instead of using the baseline
study.enqueue_trial(
    {
        'C': 1.0,
        'class_weight': None,
        'fare_imputation': 'mean',
        'age_imputation': 'mean',
        'log_fare': 'no',
        'ohe': 'no',
        'interactions': 'no',
        'scoring': 'traditional',
        'pseudo_label': None
    }
)

In [None]:
# running the study: up to 30 trials, with a time budget of 60 * 30 seconds (30 minutes)
study.optimize(func = objective, n_trials = 30, timeout = 60 * 30)

# Evaluating trials

In [None]:
# objective value obtained along the trials of the study
plot_optimization_history(study)

In [None]:
# importance of each hyperparameter for the objective value
plot_param_importances(study)

In [None]:
# parallel coordinate view of the hyperparameter combinations that were evaluated
plot_parallel_coordinate(study)

# Fitting with the best trial

In [None]:
# extracting the best hyperparameters: a flat dictionary keyed by the names
# given to trial.suggest_* in the objective
best_hyperparameters = study.best_trials[0].params
print(f'Best hyperparameter combination evaluated: {best_hyperparameters}')

In [None]:
# fitting the model on the best combination of hyperparameters; returns the fold scores,
# the test probabilities summed over the folds, the per-fold thresholds and the fitted models
last_scores, probas, thrs, model_list = submit_stratifield_kfold(X, y, kfolds = kfolds, hyperpars = best_hyperparameters)

In [None]:
# thresholds selected on the validation part of each fold
print(f'Thresholds used: {thrs}.')

In [None]:
# adding predictions to the submission dataframe: the summed probabilities are averaged
# over the folds and cut at the mean of the per-fold thresholds
submission['Survived'] = ((probas / kfolds) >= np.mean(thrs)).astype('int')

# Saving predictions

In [None]:
# writing the predictions to disk without the dataframe index
submission.to_csv('submission.csv', index=False)