In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Set data path
data_path = './data/raw/'

# Load the data
train_data = pd.read_csv(data_path + 'train.csv')
test_data = pd.read_csv(data_path + 'test.csv')

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

Train data shape: (891, 12)
Test data shape: (418, 11)


In [3]:
# Basic EDA
plt.figure(figsize=(10, 6))
sns.countplot(x='Survived', data=train_data)
plt.title('Survival Distribution')
plt.savefig('./eda/survival_distribution.png')
plt.close()

plt.figure(figsize=(10, 6))
sns.countplot(x='Sex', hue='Survived', data=train_data)
plt.title('Survival by Sex')
plt.savefig('./eda/survival_by_sex.png')
plt.close()

plt.figure(figsize=(10, 6))
sns.countplot(x='Pclass', hue='Survived', data=train_data)
plt.title('Survival by Passenger Class')
plt.savefig('./eda/survival_by_class.png')
plt.close()

In [4]:
# Feature Engineering
def preprocess_data(df):
    # Create a copy to avoid modifying the original dataframe
    data = df.copy()
    
    # Extract titles from names
    data['Title'] = data['Name'].str.extract(' ([A-Za-z]+).', expand=False)
    
    # Group rare titles
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    data.loc[data['Title'].isin(rare_titles), 'Title'] = 'Rare'
    data.loc[data['Title'] == 'Mlle', 'Title'] = 'Miss'
    data.loc[data['Title'] == 'Ms', 'Title'] = 'Miss'
    data.loc[data['Title'] == 'Mme', 'Title'] = 'Mrs'
    
    # Create family size feature
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    
    # Create IsAlone feature
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    
    # Drop columns that won't be used in modeling
    data = data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
    
    # Convert categorical features to dummies
    data = pd.get_dummies(data, columns=['Sex', 'Embarked', 'Title'], drop_first=True)
    
    return data

In [5]:
# Preprocess train data
train_processed = preprocess_data(train_data)

In [6]:
# Handle missing values for train data
# Age - fill with median
age_median = train_data.groupby(['Sex', 'Pclass'])['Age'].transform('median')
train_processed['Age'] = train_processed['Age'].fillna(age_median)

# Embarked - fill with most common
train_processed['Embarked_Q'] = train_processed['Embarked_Q'].fillna(0)
train_processed['Embarked_S'] = train_processed['Embarked_S'].fillna(0)

In [7]:
# Prepare X and y for training
X = train_processed.drop('Survived', axis=1)
y = train_processed['Survived']

In [8]:
# Preprocess test data
test_processed = preprocess_data(test_data)

In [9]:
# Handle missing values for test data
# Age - fill with median
age_median_test = test_data.groupby(['Sex', 'Pclass'])['Age'].transform('median')
test_processed['Age'] = test_processed['Age'].fillna(age_median_test)

# Fare - fill with median by passenger class
fare_median = test_data.groupby('Pclass')['Fare'].transform('median')
test_processed['Fare'] = test_processed['Fare'].fillna(fare_median)

# Embarked - fill with most common
test_processed['Embarked_Q'] = test_processed['Embarked_Q'].fillna(0)
test_processed['Embarked_S'] = test_processed['Embarked_S'].fillna(0)

In [10]:
# Ensure train and test have same columns
missing_cols = set(X.columns) - set(test_processed.columns)
for col in missing_cols:
    test_processed[col] = 0
test_processed = test_processed[X.columns]

In [11]:
# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train multiple models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    cv_scores = cross_val_score(model, X, y, cv=5)
    
    print(f"\n{name} Results:")
    print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
    print(f"Cross-Validation Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    print(classification_report(y_val, val_pred))


Random Forest Results:
Validation Accuracy: 0.8380
Cross-Validation Score: 0.8014 ± 0.0276
              precision    recall  f1-score   support

           0       0.87      0.86      0.86       105
           1       0.80      0.81      0.81        74

    accuracy                           0.84       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179


Gradient Boosting Results:
Validation Accuracy: 0.7989
Cross-Validation Score: 0.8283 ± 0.0209
              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179


Logistic Regression Results:
Validation Accuracy: 0.8212
Cross-Validation Score: 0.8271 ± 0.0245
              precision    recall  f1-score   support

     

In [13]:
# Choose the best model
best_model = GradientBoostingClassifier(random_state=42)
best_model.fit(X, y)

In [14]:
# Make predictions on test data
test_predictions = best_model.predict(test_processed)

In [15]:
# Create submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions.astype(int)
})

# Save submission file
submission.to_csv('./submission/titanic_submission.csv', index=False)
print("\nSubmission file created successfully!")


Submission file created successfully!


In [16]:
# Feature importance
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importances')
    plt.tight_layout()
    plt.savefig('./model/feature_importance.png')
    plt.close()
    
    print("\nTop 10 important features:")
    print(feature_importance.head(10))


Top 10 important features:
         Feature  Importance
7       Sex_male    0.462577
0         Pclass    0.140652
4           Fare    0.135847
1            Age    0.074952
5     FamilySize    0.053045
14  Title_Master    0.047128
2          SibSp    0.034624
18      Title_Mr    0.015915
9     Embarked_S    0.012410
20  Title_Mulder    0.005160
