In [1]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (assume you have a file named titanic.csv; this notebook generates a synthetic stand-in instead).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold

# Step 1: Build a synthetic, Titanic-shaped dataset.
# The legacy global NumPy RNG is seeded once here. Every draw below advances
# that same shared stream, so the order of the calls determines the data and
# must not be rearranged.
np.random.seed(42)
n_samples = 1000

# Feature columns, one random draw per column.
age = np.random.randint(18, 80, size=n_samples)               # integer ages in [18, 80)
sex = np.random.choice(['male', 'female'], size=n_samples)    # passenger gender
pclass = np.random.choice([1, 2, 3], size=n_samples)          # ticket class
sibsp = np.random.randint(0, 10, size=n_samples)              # siblings/spouses aboard, 0-9
parch = np.random.randint(0, 10, size=n_samples)              # parents/children aboard, 0-9
fare = np.random.uniform(5, 500, size=n_samples)              # fare paid, uniform on [5, 500)
embarked = np.random.choice(['C', 'Q', 'S'], size=n_samples)  # port of embarkation

# Target column. It is drawn independently of every feature above, so the
# features carry no information about it.
survived = np.random.choice([0, 1], size=n_samples)

# Assemble the columns into a single frame (column order = keyword order).
df = pd.DataFrame(dict(
    Age=age,
    Sex=sex,
    Pclass=pclass,
    SibSp=sibsp,
    Parch=parch,
    Fare=fare,
    Embarked=embarked,
    Survived=survived,
))

# Step 2: Separate the target from the feature matrix.
# `drop` returns a new frame, so `df` itself keeps all eight columns.
y = df['Survived']
X = df.drop('Survived', axis=1)

# Step 3: Preprocessing. Each column group is routed through its own
# sub-pipeline, and the two results are combined by a ColumnTransformer.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Sex', 'Pclass', 'Embarked']

# Numeric columns: mean imputation, then standard scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding.
# handle_unknown='ignore' asks the encoder not to raise on categories that
# were absent when it was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply the 'num' pipeline to the numeric columns and the 'cat' pipeline to
# the categorical columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Step 4: Chain the preprocessing and a Random Forest into one estimator, so
# the whole chain can be handed to GridSearchCV as a single object.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyperparameter search space for the Random Forest. Keys follow the
# '<step name>__<parameter>' convention to target the 'classifier' step.
# 2 x 3 x 2 = 12 candidate combinations.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

# Apply GridSearchCV with cross-validation.
# A single splitter object is shared by the search and by the evaluation
# below, so both visibly use the same 5 stratified folds. Accuracy is named
# explicitly as the selection metric to match the evaluation metric.
cv_splitter = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv_splitter,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X, y)

# Step 5: Evaluate using cross-validation
best_model = grid_search.best_estimator_

# Run the cross-validation exactly once and keep the result. Previously the
# score was computed, discarded, and then recomputed inside the print call,
# doubling the number of model fits for no benefit.
# NOTE(review): the hyperparameters were selected on this same X, y, so this
# score is an optimistic estimate of generalisation; use nested
# cross-validation or a held-out test set for an unbiased figure.
cv_accuracy = cross_val_score(
    best_model, X, y, cv=cv_splitter, scoring='accuracy'
).mean()

# Print the best hyperparameters and cross-validation result
print("Best Hyperparameters: ", grid_search.best_params_)
print("Cross-validation Accuracy: ", cv_accuracy)


Best Hyperparameters:  {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Cross-validation Accuracy:  0.521
