# Titanic round 2
Focusing on solving my overfitting problem.

In [None]:
import pandas as pd
import numpy as np
import re
import os

# Reduce encoding pitfalls.
# NOTE(review): Python reads PYTHONIOENCODING at interpreter startup, so setting
# it here presumably only affects child processes started later - confirm that
# this is the intended effect.
os.environ.update({"PYTHONIOENCODING": "utf-8"})

# Load the training and test CSVs (train -> df, test -> t_df)
df, t_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv")
)

## feature engineering
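* Age_is_missing: flag marking passengers whose Age was missing before it was filled with the median.
* Deck: taken from the first letter of Cabin; because many values are missing or rare, anything outside A–G is collapsed to "U" (unknown).
* Title: extracted from Name; to avoid interfering with Sex, the rare titles are collapsed into "Other".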

In [18]:
# --- Helper: pull the honorific out of a passenger name ---
def extract_title(name):
    """Return the title that sits between the first comma and the next period.

    The input is converted with ``str()`` first, so non-string values (for
    example NaN) are accepted.

    Parameters
    ----------
    name : object
        Passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The stripped title (``"Mr"`` for the example above), or the literal
        string ``"None"`` when the pattern is not found.
    """
    found = re.search(r",\s*([^\.]+)\.", str(name))
    if found is None:
        # Sentinel is the *string* "None", not the None object.
        return "None"
    return found.group(1).strip()

# Loop-invariant lookups, defined once instead of on every iteration.
valid_decks = list("ABCDEFG")
common_titles = ['Mr', 'Miss', 'Mrs', 'Master']

# Imputation statistics come from the TRAINING frame only, so the test frame
# is filled with the same values the model sees during training instead of
# statistics computed on the test set itself.
train_age_median = df['Age'].median()
train_fare_median = df['Fare'].median()

for tar in (df, t_df):
    # Age missing flag. Only computed when absent: Age is filled just below,
    # so re-running this cell would otherwise overwrite the flag with zeros.
    if 'Age_is_missing' not in tar.columns:
        tar['Age_is_missing'] = tar['Age'].isna().astype(int)

    # Fill Age with the training median
    tar['Age'] = tar['Age'].fillna(train_age_median)

    # Deck from Cabin (first letter); anything that is not A-G
    # (including missing cabins) collapses to "U"
    deck = tar['Cabin'].astype(str).str[0]
    tar['Deck'] = deck.where(deck.isin(valid_decks), "U")

    # Extract Title from Name and collapse the rare ones into 'Other'
    title = tar['Name'].apply(extract_title)
    tar['Title'] = title.where(title.isin(common_titles), 'Other')

    # Fill Embarked missing with most common ('S')
    tar['Embarked'] = tar['Embarked'].fillna('S')

    # Fill Fare (test set has one missing) with the training median
    tar['Fare'] = tar['Fare'].fillna(train_fare_median)

# Columns carried forward as model inputs
feature_cols = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked', 'Deck', 'Title', 'Age_is_missing',
]

# Target vector (training frame only)
y = df['Survived'].astype(int)

# Independent copies of the feature matrices for train and test
X = df.loc[:, feature_cols].copy()
X_test_final = t_df.loc[:, feature_cols].copy()


## Preprocessing & Pipelines

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Column groups handed to the ColumnTransformers below.
# NOTE(review): feature_cols also contains Fare, Embarked, Deck and Title, which
# appear in neither group; presumably they are dropped by the ColumnTransformer's
# default remainder handling - confirm that leaving them out is intentional.
numeric_features = ['Age', 'SibSp', 'Parch']
categorical_features = ['Sex', 'Age_is_missing', 'Pclass']

# Numeric branch for Logistic Regression: median-impute, then standardise
# (the linear model needs scaled inputs).
numeric_transformer_lr = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Numeric branch for Random Forest: median-impute only (no scaling needed).
numeric_transformer_rf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch, shared by both preprocessors:
# fill with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# One preprocessor per model family; they differ only in the numeric branch.
preprocessor_lr = ColumnTransformer([
    ('num', numeric_transformer_lr, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

preprocessor_rf = ColumnTransformer([
    ('num', numeric_transformer_rf, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Estimators, built separately so the pipeline definitions stay short.
rf_estimator = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
lr_estimator = LogisticRegression(max_iter=2000, C=1.0, solver='lbfgs')

# Full pipelines: preprocessing step followed by the model step.
# The step names ('preprocess', 'model') are what the 'model__*' keys in the
# grid-search parameter grid refer to.
rf_clf = Pipeline([
    ('preprocess', preprocessor_rf),
    ('model', rf_estimator),
])

lr_clf = Pipeline([
    ('preprocess', preprocessor_lr),
    ('model', lr_estimator),
])


## Model training & parameter optimization
### Random Forest

In [20]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from joblib import parallel_backend

# Hyperparameter candidates for the 'model' step of rf_clf
param_grid = {
    'model__max_depth': [3, 5, 7, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 3, 5],
    'model__n_estimators': [200, 300, 500],
}

# 10-fold stratified CV, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Building the search object does no work yet; the fitting happens in fit().
grid = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,   # threads, not processes (see backend below)
    verbose=0,
)

# Run the search with joblib's threading backend instead of processes
with parallel_backend('threading'):
    grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)


Best parameters: {'model__max_depth': 7, 'model__min_samples_leaf': 3, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best CV accuracy: 0.8281897627965045


## Tuned Random Forest: final fit & submission

In [21]:
# Take the tuned pipeline straight from the grid search instead of re-typing
# the winning hyperparameters by hand (hand-copied values silently go stale
# whenever the search is re-run with different data or a different grid).
# GridSearchCV above is created without a `refit` argument, so by its default
# best_estimator_ is already refit on the full training data (X, y).
best_rf = grid.best_estimator_

# Predict on test set
pred_test = best_rf.predict(X_test_final)

# Build submission DataFrame
submission_df = pd.DataFrame({
    'PassengerId': t_df['PassengerId'],
    'Survived': pred_test.astype(int)
})

# Save CSV; the path is held in one variable so the confirmation message
# always names the file that was actually written.
submission_path = "submission_rf_tuned_simp.csv"
submission_df.to_csv(submission_path, index=False)
print(f"Submission file saved as {submission_path}")


Submission file saved as submission_rf_tuned.csv
