In [140]:
# 1. Setup and Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV


# custom classes
from feature_engineering import FeatureEngineer


import warnings
warnings.filterwarnings("ignore")

In [118]:
# 2. Load Data
# Input CSVs are read relative to the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Stored under its own name: the final cell builds a new `submission` frame,
# which previously overwrote this (otherwise unused) variable without notice.
sample_submission = pd.read_csv('gender_submission.csv')

# Quick sanity check on the (rows, columns) of each frame.
print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (891, 12)
Test shape: (418, 11)


In [119]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [120]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [121]:
# 3. Split features / target
TARGET = 'Survived'

# `y` is the label column; `X` is every other column of the training frame.
y = train[TARGET]
X = train.drop(columns=[TARGET])

# Work on a copy so the loaded `test` frame itself is left untouched.
X_test = test.copy()

In [122]:
# 4. Feature Lists
# Columns sent through the numeric branch of the preprocessor.
# NOTE(review): 'FamilySize' is not among the raw CSV columns shown above —
# presumably produced by FeatureEngineer; confirm.
num_features = [
    'Age',
    'Fare',
    'FamilySize',
]

# Columns sent through the categorical (one-hot) branch of the preprocessor.
# NOTE(review): 'Title' and 'IsAlone' are likewise presumably engineered — confirm.
cat_features = [
    'Sex',
    'Embarked',
    'Pclass',
    'Title',
    'IsAlone',
]

In [152]:
# 5. Preprocessor
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill missing values with the column median, then scale.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),  # scaler recommended from hp tuning
])

# Route each feature list to its branch. Step names ('num', 'cat', 'scaler')
# are referenced by the search grid, so they must stay as they are.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features),
])

In [153]:
# 6.0. Pipeline
# Baseline estimator handed to the hyperparameter search:
# feature engineering -> preprocessing -> RandomForest (defaults, fixed seed).
baseline_steps = [
    ('features', FeatureEngineer()),
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps=baseline_steps)

In [154]:
# Hyperparameter tuning with RandomizedSearchCV.
# Only RandomForest settings and the numeric scaler are searched. Settings that
# belong to other model families (learning_rate, C, subsample) are not part of
# the grid because the pipeline's model step is a RandomForestClassifier.
param_dist = {
    'model__n_estimators': [100, 125, 150, 175, 200],
    'model__max_depth': [3, 5, 7, 9, 11, 13, None],
    'preprocess__num__scaler': [StandardScaler(), MinMaxScaler()],  # Try different scalers
}

# 10 sampled candidates x 5 folds. An integer `cv` on a classifier means
# stratified folds WITHOUT shuffling, which is not the same splitter as the
# shuffled StratifiedKFold used in the cross-validation cell, so the two
# scores are not directly comparable.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    verbose=1,
    random_state=42,
)
random_search.fit(X, y)

# Output the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'preprocess__num__scaler': StandardScaler(), 'model__n_estimators': 200, 'model__max_depth': 5}
Best Score: 0.8327537505492437


In [130]:
# 6.1. Full Pipeline after HP tuning
# Read the tuned RandomForest settings from the search result instead of copying
# them by hand, so this cell cannot drift from what the search actually found.
# For the recorded search output this yields n_estimators=200, max_depth=5.
MODEL_PREFIX = 'model__'
best_model_params = {
    name[len(MODEL_PREFIX):]: value
    for name, value in random_search.best_params_.items()
    if name.startswith(MODEL_PREFIX)
}

# NOTE(review): the scaler chosen by the search is not re-applied here;
# `preprocessor` is reused as defined in the preprocessor cell. Confirm that is
# intended if the search ever prefers a different scaler.
pipeline = Pipeline([
    ('features', FeatureEngineer()),
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(random_state=42, **best_model_params)),
])

In [156]:
# 7. Cross-validation
# Shuffled, stratified 5-fold split with a fixed seed; one accuracy per fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='accuracy', cv=cv)

# Per-fold accuracies, rounded for display only.
print("Fold scores:", cv_scores.round(4))

# Summary statistics over the folds (std is NumPy's default, i.e. ddof=0).
summary_lines = (
    "Mean Accuracy: {:.4f}".format(cv_scores.mean()),
    "Standard Deviation: {:.4f}".format(cv_scores.std()),
    "Accuracy Range: {:.4f} - {:.4f}".format(cv_scores.min(), cv_scores.max()),
)
print(*summary_lines, sep="\n")


Fold scores: [0.8156 0.7865 0.8034 0.8315 0.8258]
Mean Accuracy: 0.8126
Standard Deviation: 0.0162
Accuracy Range: 0.7865 - 0.8315


In [158]:
# 8. Fit and Predict
# Train on the full training set, then predict labels for the test frame.
# `fit` returns the pipeline itself, so the two calls can be chained.
predictions = pipeline.fit(X, y).predict(X_test)

In [159]:
# 9. Submission
# Two columns: the passenger id from the test frame and the predicted label
# cast to integer.
submission_columns = {
    "PassengerId": X_test["PassengerId"],
    "Survived": predictions.astype(int),
}
submission = pd.DataFrame(data=submission_columns)

# Written without the index so the file contains only the two columns above.
submission.to_csv(path_or_buf="submission.csv", index=False)
print("✅ submission.csv saved.")

✅ submission.csv saved.
