## Imports

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

In [2]:
# Load the train and test splits from the Kaggle Titanic input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

## Feature engineering

In [3]:
# A fare of exactly 0 is treated as a missing value, so the median imputer in
# the preprocessing pipeline fills it in instead of the model seeing a literal 0.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train_df.loc[train_df['Fare'] == 0, 'Fare'] = np.nan
test_df.loc[test_df['Fare'] == 0, 'Fare'] = np.nan

In [4]:
def _extract_title(full_name):
    """Return the text between the first comma and the next period, stripped.

    The name is split on ',' and the second piece is kept; that piece is then
    cut at its first '.' and surrounding whitespace is removed.
    """
    after_comma = full_name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.strip()


train_df['Title'] = train_df['Name'].map(_extract_title)
test_df['Title'] = test_df['Name'].map(_extract_title)

In [5]:
# Ticket prefix feature: the first two characters of each ticket string.
for _frame in (train_df, test_df):
    _frame['Ticket_2letter'] = _frame['Ticket'].map(lambda ticket: ticket[:2])

In [6]:
# Ticket length feature: number of characters in the raw ticket string.
# `len` is passed directly; wrapping it in a lambda adds nothing.
train_df['Ticket_len'] = train_df['Ticket'].map(len)
test_df['Ticket_len'] = test_df['Ticket'].map(len)

In [7]:
# Number of cabins listed for each passenger: the 'Cabin' value is split on
# whitespace and the pieces are counted. Missing cabins are replaced with an
# empty string first, so they count as 0 rather than raising or yielding NaN.
# The feature is derived from 'Cabin' (not 'Ticket') to match its name.
train_df['Cabin_num'] = train_df['Cabin'].fillna('').str.split().str.len()
test_df['Cabin_num'] = test_df['Cabin'].fillna('').str.split().str.len()

In [8]:
# First character of the 'Cabin' value. The feature is derived from 'Cabin'
# (not 'Ticket') to match its name; the ticket prefix is already captured by
# 'Ticket_2letter'. The `.str` accessor leaves missing cabins as NaN, so a
# downstream imputer can decide how to fill them.
train_df['Cabin_1letter'] = train_df['Cabin'].str[:1]
test_df['Cabin_1letter'] = test_df['Cabin'].str[:1]

In [9]:
# Family size = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themselves -- confirm against the dataset description).
for _frame in (train_df, test_df):
    _frame['Fam_size'] = _frame['SibSp'].add(_frame['Parch']).add(1)

## Random forest (RF) model training

In [10]:
# Columns fed to the model, the matching training matrix, and the target.
features = [
    'Pclass',
    'Fare',
    'Title',
    'Embarked',
    'Fam_size',
    'Ticket_len',
    'Ticket_2letter',
]
X = train_df[features]
y = train_df['Survived']

In [11]:
# Column roles: 'Fare' is the only feature handled as numerical; every other
# feature, including the integer-valued ones, goes through one-hot encoding.
numerical_cols = ['Fare']
categorical_cols = ['Pclass', 'Title', 'Embarked', 'Fam_size', 'Ticket_len', 'Ticket_2letter']

# Impute missing numerical values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Impute missing categorical values with the most frequent value, then
# one-hot encode (the encoder is configured with handle_unknown='ignore').
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Preprocessing followed by a random forest with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=0)
titanic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf_model),
])

# Fit on the full training set; this fitted pipeline is the one used for the
# test-set predictions further down.
titanic_pipeline.fit(X, y)

# Report the mean accuracy over 10 cross-validation folds.
rf_cv_scores = cross_val_score(titanic_pipeline, X, y, cv=10)
print('Cross validation score: {:.3f}'.format(rf_cv_scores.mean()))

Cross validation score: 0.818


In [12]:
# Select the same feature columns from the test split, in the same order.
X_test = test_df.loc[:, features]

In [13]:
# Predict 'Survived' labels for the test split with the fitted random-forest pipeline.
predictions = titanic_pipeline.predict(X_test)

In [14]:
# Two-column submission frame (PassengerId, Survived), written without the index.
output = test_df[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)

## XGBoost model training

In [15]:
# Gradient-boosted model with the same seed, tree count and depth as the
# random forest.
# NOTE(review): `use_label_encoder` is kept as written; whether it is still
# needed (or merely triggers a warning) depends on the installed xgboost
# version -- confirm against the environment.
xgb_model = XGBClassifier(
    random_state=0,
    n_estimators=500,
    max_depth=5,
    use_label_encoder=False,
    eval_metric='logloss',
)

# NOTE(review): this reuses the same `preprocessor` object as the
# random-forest pipeline, so fitting here refits that shared instance
# (on the same X).
titanic_pipeline_XGBoost = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb_model),
])

# Fit on the full training set; this fitted pipeline produces the XGBoost
# test-set predictions.
titanic_pipeline_XGBoost.fit(X, y)

# Report the mean accuracy over 10 cross-validation folds.
xgb_cv_scores = cross_val_score(titanic_pipeline_XGBoost, X, y, cv=10)
print('Cross validation score: {:.3f}'.format(xgb_cv_scores.mean()))

Cross validation score: 0.804


In [16]:
# Predict with the fitted XGBoost pipeline. This rebinds `predictions`,
# replacing the random-forest output computed earlier.
predictions = titanic_pipeline_XGBoost.predict(X_test)

In [17]:
# Two-column submission frame (PassengerId, Survived) for the XGBoost
# predictions, written without the index.
output = test_df[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission_XGBoost.csv', index=False)