In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    #plot_confusion_matrix,
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [3]:
# Read in training data
df = pd.read_csv("train.csv")

In [5]:
train_data = df.copy()

In [7]:
# Look at subset of data
train_data.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
93,94,0,3,"Dean, Mr. Bertram Frank",male,26.0,1,2,C.A. 2315,20.575,,S
372,373,0,3,"Beavan, Mr. William Thomas",male,19.0,0,0,323951,8.05,,S
476,477,0,2,"Renouf, Mr. Peter Henry",male,34.0,1,0,31027,21.0,,S
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S
207,208,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,,C
847,848,0,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C
552,553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q
392,393,0,3,"Gustafsson, Mr. Johan Birger",male,28.0,2,0,3101277,7.925,,S
790,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
32,33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q


In [9]:
# Number of rows and columns
train_data.shape

(891, 12)

In [11]:
# Information on each column
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [13]:
# Statistical description
train_data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891.0,,,,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,,,,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891.0,891.0,"Braund, Mr. Owen Harris",1.0,,,,,,,
Sex,891.0,2.0,male,577.0,,,,,,,
Age,714.0,,,,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,,,,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891.0,681.0,347082,7.0,,,,,,,
Fare,891.0,,,,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [17]:
# Missing values for each column
train_data.isnull().sum().sort_values(ascending=False)

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64

#### Observations
- Passenger ID is not required for prediction as it is not expected to be correlated to survival.
- Most of the Cabin values are null, and this feature is not expected to contribute to prediction. Feature column will be dropped.
- Name column is not expected to have any significant correlation with survival, and will be dropped.
- Ticket number is a combination of string and numeric data. It will be split into two columns, one containing the string portion, and the other containing the numeric portion.
- There are some missing values in Age and Embarked features, which will need treatment.

In [19]:
# Split Ticket into its leading text prefix and its trailing run of digits
ticket_parts = train_data['Ticket'].str.extract(r'(.*?)(\d+)$')
train_data['Ticket_string'] = ticket_parts[0]
# Tickets without trailing digits do not match the pattern and end up as NaN
train_data['Ticket_num'] = pd.to_numeric(ticket_parts[1], errors='coerce')

In [None]:
train_data[train_data["Ticket_num"].isna()]

In [None]:
plt.hist(train_data["Ticket_num"])

In [None]:
# Drop identifier / free-text columns that are not explored further.
# Reassigning instead of using inplace=True, and ignoring columns that are
# already gone, keeps this cell safe to re-run without raising a KeyError.
train_data = train_data.drop(
    columns=["PassengerId", "Name", "Ticket", "Cabin", "Ticket_string"],
    errors="ignore",
)

In [None]:
train_data.head()

In [None]:
# Check for class distribution
round(train_data["Survived"].value_counts(normalize=True)*100,2)

_We see that the classes are moderately imbalanced: roughly 62% of passengers did not survive and 38% survived._

In [None]:
# Predictors treated as numeric (Parch and SibSp are integer counts but are handled as numeric here)
cont_vars = ["Age","Fare","Parch","SibSp","Ticket_num"]

In [None]:
# Predictors treated as categorical (Pclass is stored as an integer but is handled as a category)
cat_vars = ["Sex","Pclass","Embarked"]

In [None]:
train_data[cont_vars].describe().T

In [None]:
for var in cat_vars:
    print(f"{var}:",train_data[var].unique())

### Univariate Analysis

#### Continuous Variables

In [None]:
# function to plot a boxplot and a histogram along the same scale.


def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined, drawn on a shared x-axis

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None, i.e., let seaborn choose)
    """
    _, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot; showmeans=True adds a marker for the mean value of the column

    # Only forward `bins` when the caller supplied one, so seaborn's own default
    # applies otherwise. `palette` is deliberately not passed: there is no `hue`
    # variable here, so seaborn would ignore it and emit a warning.
    hist_kwargs = {"bins": bins} if bins else {}
    sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, **hist_kwargs)

    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # Add median to the histogram

In [None]:
# Observations on Passenger age
histogram_boxplot(train_data,"Age")

In [None]:
# Observations on Passenger's ticket Fare
histogram_boxplot(train_data,"Fare")

In [None]:
train_data[train_data["Fare"] > 300]

In [None]:
# Observations on number of parents/children accompanying the passengers
histogram_boxplot(train_data,"Parch")

In [None]:
# Observations on number of siblings/spouses accompanying the passengers
histogram_boxplot(train_data,"SibSp")

In [None]:
# Observations on numeric portion of the ticket number
histogram_boxplot(train_data,"Ticket_num")

#### Categorical Variables

In [None]:
# function to create labeled barplots


def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count (or percentage) annotated on top of each bar

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    # Number of rows in the column; rows with missing values are part of this
    # denominator, so percentages need not add up to 100 when values are missing.
    total = len(data[feature])
    count = data[feature].nunique()

    # One unit of figure width per displayed bar, plus a margin
    n_bars = count if n is None else n
    plt.figure(figsize=(n_bars + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        hue=feature,
        # keep the n most frequent levels, then display them in sorted label order
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        # NOTE(review): some seaborn releases appear to register zero-size legend
        # proxy rectangles in ax.patches when `hue` is set; skipping zero-width
        # patches avoids stray labels and is a no-op for real bars.
        if p.get_width() == 0:
            continue

        height = p.get_height()
        if perc:
            label = "{:.1f}%".format(
                100 * height / total
            )  # percentage of each class of the category
        else:
            label = height  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar

        ax.annotate(
            label,
            (x, height),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # place the label just above the top of the bar


In [None]:
# observations on Survival Status
labeled_barplot(train_data, "Survived")

In [None]:
# observations on Passenger Sex
labeled_barplot(train_data, "Sex")

In [None]:
# observations on Passenger Class
labeled_barplot(train_data, "Pclass")

In [None]:
# observations on Passenger Port of Embarkation
labeled_barplot(train_data, "Embarked")

### Bivariate Analysis

In [None]:
train_data.head()

In [None]:
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(y="Age", x="Sex", data=train_data, orient="vertical")

In [None]:
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(y="Fare", x="Sex", data=train_data, orient="vertical")

##### Continuous variables against target variable

In [None]:
# One boxplot per continuous variable, split by survival outcome
n_plot_cols = 2
n_plot_rows = -(-len(cont_vars) // n_plot_cols)  # ceiling division, so every variable gets a panel

plt.figure(figsize=(10, 10))
for i, col in enumerate(cont_vars):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    sns.boxplot(x="Survived", y=col, data=train_data)
    plt.title(col)

# Lay out once, after every panel and title exists, instead of on every iteration
plt.tight_layout()
plt.show()

##### Categorical variables against target variable

In [None]:
# function to plot stacked bar chart


def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # value_counts() sorts by frequency, so the last index entry is the least
    # frequent target class; both tables are ordered by that class's column.
    sorter = data[target].value_counts().index[-1]

    # Absolute counts, including row/column totals
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)

    # Row-normalised proportions: every bar sums to 1
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))

    # A single legend placed outside the plot area. The earlier extra
    # plt.legend(loc="lower left", ...) call was immediately replaced by this
    # one and therefore had no visible effect.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()

In [None]:
stacked_barplot(train_data,"Sex","Survived")

In [None]:
stacked_barplot(train_data,"Embarked","Survived")

In [None]:
stacked_barplot(train_data,"Pclass","Survived")

#### Check for correlations amongst variables

In [None]:
sns.pairplot(train_data,hue="Survived")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [None]:
# Separating target variable and other variables.
# This starts from the raw `df` rather than the EDA copy `train_data` (which
# already had columns dropped above); the preprocessing pipeline defined
# below performs the column dropping / ticket splitting itself.
X = df.drop("Survived",axis=1)
y = df["Survived"]

In [None]:
X.head()

In [None]:
# Share of passengers labelled as survivors (Survived == 1)
survival_ratio = (y == 1).sum() / len(y)
print(f"Survival ratio: {survival_ratio:.2%}")

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.25, random_state=1,stratify=y)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

In [None]:
# The split is stratified, so both ratios are expected to be close to the overall one
survival_ratio_train = (y_train == 1).sum() / len(y_train)
survival_ratio_val = (y_val == 1).sum() / len(y_val)
print(f"Survival ratio in training data: {survival_ratio_train:.2%}")
print(f"Survival ratio in validation data: {survival_ratio_val:.2%}")

#### Pipeline (Column Transformation, One Hot Encoding, Imputation)

* We will drop/modify columns to prepare for new data
* We will perform one hot encoding on categorical variables
* We will use median to impute missing values in Age column.
* We will use most_frequent value to impute missing values in Embarked column.

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """
    Pipeline step that removes a fixed set of columns from a dataframe

    columns_to_drop: list of column labels to remove (default None, i.e., drop nothing)
    """

    def __init__(self, columns_to_drop=None):
        # None replaces the former mutable `[]` default, which would have been
        # shared by every instance created without an argument. The value is
        # stored unmodified, as scikit-learn expects of constructor parameters.
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self  # No fitting is required for dropping columns

    def transform(self, X):
        """Return a new dataframe without the configured columns; X itself is not modified."""
        columns = [] if self.columns_to_drop is None else self.columns_to_drop
        return X.drop(columns=columns)

class ColumnSplitter(BaseEstimator, TransformerMixin):
    """
    Pipeline step that splits one string column into several columns with a regex

    column: name of the string column to split
    new_columns: names for the extracted columns, one per capture group of `pattern`
    drop_columns: columns to remove after the split (default None, i.e., keep all)
    modify_column: column to post-process after the split (default None)
    modify_function: function applied element-wise to `modify_column` (default None)
    pattern: regular expression with capture groups passed to `str.extract`
        (default splits a leading prefix from a trailing run of digits)
    """

    def __init__(
        self,
        column,
        new_columns,
        drop_columns=None,
        modify_column=None,
        modify_function=None,
        pattern=r'(.*?)(\d+)$',
    ):
        self.column = column
        self.new_columns = new_columns
        self.drop_columns = drop_columns
        self.modify_column = modify_column
        self.modify_function = modify_function
        self.pattern = pattern

    def fit(self, X, y=None):
        return self  # Stateless: nothing is learned from the data

    def transform(self, X):
        """Return a transformed copy of X; the caller's dataframe is left untouched."""
        # Work on a copy so that adding/dropping columns does not mutate the
        # dataframe that was passed in.
        X = X.copy()
        X[self.new_columns] = X[self.column].str.extract(self.pattern)

        if self.drop_columns:
            X = X.drop(self.drop_columns, axis=1)

        # Post-processing only happens when both the column and the function are given
        if self.modify_column and self.modify_function:
            X[self.modify_column] = X[self.modify_column].apply(self.modify_function)

        return X

In [None]:
# --- Custom steps applied to the raw dataframe ---

# Remove columns that are not used as predictors
column_dropper = ColumnDropper(columns_to_drop=["PassengerId", "Name", "Cabin"])

# Extract the numeric tail of Ticket, then discard the raw ticket and its text prefix
column_splitter = ColumnSplitter(
    column="Ticket",
    new_columns=["Ticket_string", "Ticket_num"],
    drop_columns=["Ticket_string", "Ticket"],
    modify_column="Ticket_num",
    modify_function=lambda value: pd.to_numeric(value, errors="coerce"),  # non-numeric -> NaN
)

# --- Per-type transformers ---

# Numeric columns: median imputation followed by standardisation
numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: most-frequent imputation followed by one-hot encoding
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its transformer; any other column is passed through as-is
col_transformers = ColumnTransformer(
    [
        ("cat", categorical_transformer, cat_vars),
        ("num", numerical_transformer, cont_vars),
    ],
    remainder="passthrough",
)

# --- Full preprocessing pipeline: drop -> split -> impute/encode/scale ---
preprocessor = Pipeline(
    [
        ("drop", column_dropper),
        ("split", column_splitter),
        ("transformers", col_transformers),
    ]
)

In [None]:
# Fit and transform on the training set, then transform on the validation set
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

In [None]:
# Wrap the already preprocessed training matrix in a DataFrame for inspection.
# (The original line had an unbalanced parenthesis and re-fitted the
# preprocessor; the fitted output from the previous cell is reused instead.)
X_train_processed_2 = pd.DataFrame(X_train_processed)

### Model Building

#### Evaluation Method
We will print multiple scores, but will compare accuracy for model evaluation.

In [None]:
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Summarise a fitted classifier's performance as a one-row dataframe

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a dataframe with the columns Accuracy, Recall, Precision and F1.
    """
    # Predictions are computed once and shared by every metric
    pred = model.predict(predictors)

    # Column label -> sklearn scoring function, in display order
    metric_functions = {
        "Accuracy": accuracy_score,
        "Recall": recall_score,
        "Precision": precision_score,
        "F1": f1_score,
    }

    return pd.DataFrame(
        {label: metric(target, pred) for label, metric in metric_functions.items()},
        index=[0],
    )

#### Multiple Models

In [None]:
# Candidate classifiers as (display name, estimator) pairs
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]

results = []  # per-model arrays of cross-validation scores
names = []  # model names, in the same order as `results`

# Every model is scored with the same metric on the same seeded 5-fold stratified split
scoring = "accuracy"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

print("\nCross-Validation Performance:\n")

for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train_processed, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))

print("\nTraining Performance:\n")

# Fit each model on the full training set and report its training accuracy
for name, model in models:
    model.fit(X_train_processed, y_train)
    train_predictions = model.predict(X_train_processed)
    scores = accuracy_score(y_train, train_predictions) * 100
    print("{}: {}".format(name, scores))

In [None]:
# Accuracy of each already-fitted model on the held-out validation set
print("\nValidation Performance:\n")
for name, model in models:
    val_predictions = model.predict(X_val_processed)
    scores = accuracy_score(y_val, val_predictions) * 100
    print("{}: {}".format(name, scores))

#### Hyperparameter Tuning

* XGBoost, Gradient Boost, and RandomForest are performing fairly well on validation dataset. We will select one bagging-based technique (RandomForest) and one boosting technique (XGBoost), which will be tuned further, to ensure better performance and avoid overfitting.
* Let's start with RandomizedSearchCV to arrive at an approximate hyperparameter space and then use GridSearchCV to find the right parameters for best model performance.

##### Let's tune Random forest using Randomized Search

In [None]:
rf = RandomForestClassifier(random_state=1)

# Define a hyperparameter space
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400, 500],  # Number of trees
    'max_depth': [None, 10, 20, 30, 40, 50],        # Tree depth
    'min_samples_split': [2, 5, 10, 20],            # Minimum samples to split
    'min_samples_leaf': [1, 2, 4, 8],               # Minimum samples at leaf
    'max_features': [0.3, 0.7, None],               # Number of features per split
    'bootstrap': [True, False],                     # Bootstrap sampling
    "min_impurity_decrease":[0.001, 0.002, 0.003],
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    # Seed the sampling of candidate parameter sets so a re-run reproduces the
    # same search (matches the seeded XGBoost randomized search in this notebook).
    random_state=1,
)

# Fit the RandomizedSearchCV
random_search.fit(X_train_processed, y_train)

# Get the best parameters from RandomizedSearchCV
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)

##### Let's fine tune Random Forest using GridSearchCV

In [None]:
# Narrowed grid for the exhaustive search over Random Forest settings
params = dict(
    n_estimators=[300, 400, 500],          # Number of trees
    max_depth=[20, 30, 40],                # Tree depth
    min_samples_split=[5, 10, 15],         # Minimum samples to split
    min_samples_leaf=[1, 2, 4],            # Minimum samples at leaf
    max_features=[0.7, None],              # Number of features per split
    bootstrap=[True, False],               # Bootstrap sampling
    min_impurity_decrease=[0.002, 0.003],
)

# Exhaustive 5-fold search over every combination in `params`, scored by accuracy
grid_search = GridSearchCV(rf, params, scoring="accuracy", cv=5, n_jobs=-1)
grid_search.fit(X_train_processed, y_train)

# Report the winning combination
print("Best Parameters from GridSearchCV:", grid_search.best_params_)

In [None]:
# The grid search above uses the default refit=True, so best_estimator_ has
# already been refitted on the whole training set with the best parameters;
# fitting it a second time on the same data is unnecessary.
rf_tuned = grid_search.best_estimator_
rf_tuned

In [None]:
model_performance_classification_sklearn(rf_tuned,X_train_processed,y_train)

In [None]:
model_performance_classification_sklearn(rf_tuned,X_val_processed,y_val)

- We see some gap between training and validation set performance with these parameters.
- Let's update some of those to reduce overfitting.

In [None]:
# Update parameters to reduce overfitting
rf_tuned_2 = RandomForestClassifier(
    max_features=0.7,
    min_samples_leaf=4,
    n_estimators=200,
    random_state=1,
    max_depth=10,
    min_samples_split=10,
    bootstrap=True,
    min_impurity_decrease=0.002,
)

# Fit the best algorithm to the data.
rf_tuned_2.fit(X_train_processed, y_train)

In [None]:
model_performance_classification_sklearn(rf_tuned_2,X_train_processed,y_train)

In [None]:
model_performance_classification_sklearn(rf_tuned_2,X_val_processed,y_val)

##### Let's tune XGBoost using Randomized Search

In [None]:
# defining model
xgb = XGBClassifier(random_state=1, eval_metric="logloss")

# Parameter distributions to pass to RandomizedSearchCV
param_dist = {
    "n_estimators": [50, 100, 150, 200, 250, 300],
    # scale_pos_weight multiplies the weight of the positive class; 0 was
    # removed from the candidates because it would zero out every survivor
    # during training, which is a degenerate model rather than a setting to tune.
    "scale_pos_weight": [1, 2, 3],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "gamma": [0, 3, 5],
    "subsample": [0.5, 0.7, 0.9, 1.0],
    "colsample_bytree":[0.5,0.7,0.9,1],
    "colsample_bylevel":[0.5,0.7,0.9,1],
}

# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    n_jobs=-1,
    scoring="accuracy",
    cv=5,
    random_state=1,
)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_processed, y_train)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)

##### Let's fine tune XGBoost using GridSearchCV

In [None]:
# Define a hyperparameter grid
params = {
    "n_estimators": [150, 200, 250],
    # 0 is not a meaningful candidate: it would give the positive class zero
    # weight during training, so only positive weights are searched.
    "scale_pos_weight": [1, 2],
    "learning_rate": [0.02, 0.05, 0.1],
    "gamma": [0, 0.5, 1],
    "subsample": [0.8, 0.9, 1.0],
    "colsample_bylevel":[0.5,0.7],
}

# Initialize GridSearchCV
grid_obj = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Fit the GridSearchCV
grid_obj.fit(X_train_processed, y_train)

# Get the best parameters from GridSearchCV
print("Best Parameters from GridSearchCV:", grid_obj.best_params_)

In [None]:
# Set the clf to the best combination of parameters.
# The grid search above uses the default refit=True, so best_estimator_ is
# already fitted on the whole training set; a second fit on the same data is
# unnecessary.
xgb_tuned = grid_obj.best_estimator_
xgb_tuned

In [None]:
model_performance_classification_sklearn(xgb_tuned,X_train_processed,y_train)

In [None]:
model_performance_classification_sklearn(xgb_tuned,X_val_processed,y_val)

- We still see some gap between training and validation set performance with these parameters.
- Let's add regularization and reduce learning rate to reduce overfitting.

In [None]:
# Update parameters to reduce overfitting
xgb_tuned_2 = XGBClassifier(
    random_state = 1,
    n_estimators = 150,
    scale_pos_weight = 1,
    learning_rate = 0.03,
    gamma = 2,
    subsample = 0.9,
    colsample_bylevel = 0.5,
    reg_lambda = 2
)

# Fit the best algorithm to the data.
xgb_tuned_2.fit(X_train_processed, y_train)

In [None]:
model_performance_classification_sklearn(xgb_tuned_2,X_train_processed,y_train)

In [None]:
model_performance_classification_sklearn(xgb_tuned_2,X_val_processed,y_val)

##### Let's recover the feature names for the columns of the preprocessed training matrix

In [None]:
# Columns that are still present after the pipeline's "drop" step
remaining_columns = list(
    X_train.drop(columns=["PassengerId", "Name", "Cabin"]).columns
)

# The "split" step removes the raw Ticket column and leaves only its numeric part
remaining_columns.remove("Ticket")
remaining_columns.append("Ticket_num")

# One-hot encoded names come from the fitted encoder inside the ColumnTransformer
fitted_transformers = preprocessor.named_steps['transformers']
fitted_encoder = fitted_transformers.named_transformers_['cat']['encoder']
categorical_features = fitted_encoder.get_feature_names_out(cat_vars)

# Imputation and scaling keep one output column per numeric input column
numerical_features = cont_vars

# Anything not routed to a transformer would have been passed through unchanged
handled_columns = set(cat_vars) | set(cont_vars)
passthrough_features = [
    col for col in remaining_columns if col not in handled_columns
]

# Same order as the ColumnTransformer output: categorical, numeric, passthrough
final_feature_names = [
    *categorical_features,
    *numerical_features,
    *passthrough_features,
]

In [None]:
# Feature importances of the regularised XGBoost model, sorted ascending so
# the most important feature is drawn at the top of the horizontal bar chart
importances = xgb_tuned_2.feature_importances_
indices = np.argsort(importances)
feature_names = list(final_feature_names)

bar_positions = range(len(indices))
sorted_labels = [feature_names[i] for i in indices]

plt.figure(figsize=(12,12))
plt.barh(bar_positions, importances[indices], color='violet', align='center')
plt.yticks(bar_positions, sorted_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.show()

### Insights

- Tuned XGBoost has the best performance amongst the two tuned models.
- Passenger sex, class, and age are the three most important features.


## Test Data Prediction

In [None]:
test_data = pd.read_csv("test.csv")

In [None]:
X_test_preprocessed = preprocessor.transform(test_data)

In [None]:
# Predict with the regularised XGBoost model (xgb_tuned_2), i.e. the model that
# was adjusted to reduce overfitting and whose feature importances are reported
# above, rather than the earlier grid-search estimator xgb_tuned.
# NOTE(review): confirm xgb_tuned_2 is the intended final model before submitting.
predictions = xgb_tuned_2.predict(X_test_preprocessed)

In [None]:
# Two-column submission table: passenger id and predicted survival label
test_df = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions,
    }
)

In [None]:
test_df

In [None]:
test_df["Survived"].value_counts(normalize=True)

In [None]:
test_df.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")