# Titanic Survival Prediction

A comprehensive machine learning solution for predicting Titanic passenger survival.

This notebook covers:
1. Data cleaning and preprocessing
2. Feature engineering
3. Exploratory data analysis (EDA)
4. Machine learning model building
5. Prediction and submission

Tools used: Python, Pandas, NumPy, Scikit-learn, Matplotlib, Seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Set plotting style: seaborn's 'whitegrid' theme plus a default figure size.
# The EDA plots further down pass their own figsize to plt.figure(), which
# takes precedence over this default for those figures.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

## Data Loading

Load the Titanic datasets: train.csv, test.csv, and gender_submission.csv.

In [None]:
def load_data():
    """Read the three Titanic CSV files from the current working directory.

    Returns:
        Tuple ``(train_df, test_df, gender_submission)`` holding the
        DataFrames read from ``train.csv``, ``test.csv`` and
        ``gender_submission.csv``, in that order.
    """
    print("Loading datasets...")

    # Unpacking the lazy map reads the files one after another, in order.
    csv_files = ('train.csv', 'test.csv', 'gender_submission.csv')
    train_df, test_df, gender_submission = map(pd.read_csv, csv_files)

    print(f"Training data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    print(f"Sample submission shape: {gender_submission.shape}")

    return train_df, test_df, gender_submission

# Load the raw frames once at module level; the cleaning and EDA steps below
# read train_df / test_df from here.
train_df, test_df, gender_submission = load_data()

## Data Cleaning and Preprocessing

Handle missing values, convert categorical variables, and prepare data for modeling.

In [None]:
def data_cleaning(train_df, test_df):
    """Clean and preprocess the training and test data.

    Both inputs are copied, so the caller's frames are left untouched.
    Every imputation statistic is computed from the training data and then
    applied to both frames, so the two sets go through identical
    preprocessing and the test set never supplies its own statistics.

    Steps:
      * Age: missing values take the training median of the passenger's
        (Pclass, Sex) group, falling back to the overall training median
        when that group has no known age in the training data.
      * Embarked: missing values become 'S' in both frames.
      * Fare: missing values take the training median fare.
      * Sex: mapped to 0 (male) / 1 (female).
      * Embarked: one-hot encoded over the ports seen in either frame, so
        both results carry the same ``Embarked_*`` columns.

    Args:
        train_df: Raw training frame with Pclass, Sex, Age, Fare and
            Embarked columns.
        test_df: Raw test frame with the same columns.

    Returns:
        Tuple ``(train_clean, test_clean)`` of cleaned copies.
    """
    print("\n=== DATA CLEANING ===")

    # Create copies
    train_clean = train_df.copy()
    test_clean = test_df.copy()
    frames = (train_clean, test_clean)

    # Handle missing values - Age
    print("Filling missing Age values...")
    group_keys = ['Pclass', 'Sex']
    group_medians = (
        train_clean.groupby(group_keys)['Age']
        .median()
        .rename('_age_median')
        .reset_index()
    )
    overall_median = train_clean['Age'].median()
    for frame in frames:
        # A left merge keeps the frame's row order; groups that are unseen
        # (or all-NaN) in training come back as NaN and use the overall
        # training median instead of staying missing.
        fallback = frame[group_keys].merge(
            group_medians, on=group_keys, how='left'
        )['_age_median'].fillna(overall_median)
        # Positional replacement avoids any dependence on the frame's index.
        frame['Age'] = frame['Age'].where(
            frame['Age'].notna(), fallback.to_numpy()
        )

    # Handle missing values - Embarked
    print("Filling missing Embarked values...")
    for frame in frames:
        frame['Embarked'] = frame['Embarked'].fillna('S')

    # Handle missing values - Fare
    print("Filling missing Fare values...")
    fare_median = train_clean['Fare'].median()
    for frame in frames:
        frame['Fare'] = frame['Fare'].fillna(fare_median)

    # Convert categorical variables
    print("Converting categorical variables...")
    for frame in frames:
        frame['Sex'] = frame['Sex'].map({'male': 0, 'female': 1})

    # One-hot encode Embarked over a shared category list so that a port
    # absent from one frame still yields an (all-zero) column there.
    ports = sorted(set(train_clean['Embarked']) | set(test_clean['Embarked']))
    encoded = []
    for frame in frames:
        frame['Embarked'] = pd.Categorical(frame['Embarked'], categories=ports)
        encoded.append(
            pd.get_dummies(frame, columns=['Embarked'], prefix='Embarked')
        )
    train_clean, test_clean = encoded

    print("Data cleaning completed!")
    return train_clean, test_clean

# Clean data: train_df / test_df are passed in and the cleaned results are
# bound to train_clean / test_clean for the steps below.
train_clean, test_clean = data_cleaning(train_df, test_df)

## Feature Engineering

Create new features from existing data to improve model performance.

In [None]:
def feature_engineering(train_clean, test_clean):
    """Create new model features from the cleaned frames.

    The inputs are copied first, so the caller's frames are not modified.
    The same transformations are applied to both frames:

      * ``FamilySize`` = SibSp + Parch + 1 and ``IsAlone`` (1 when
        FamilySize == 1, else 0).
      * ``Title_*``: the honorific is pulled out of ``Name`` (the text
        between the comma and the first following period, ignoring a
        leading "the " as in "the Countess"), grouped via the mapping below,
        and one-hot encoded. Titles that are not in the mapping, and names
        with no recognisable title, are counted as 'Rare' rather than being
        left without any title column set.
      * ``Age_*``: Age is binned into Child/Teen/Adult/Middle/Senior and
        one-hot encoded. The lowest edge is inclusive, so an age of exactly
        0 falls in 'Child'.

    Title and age-bin columns use fixed category lists, so both returned
    frames always carry the same set of dummy columns.

    Args:
        train_clean: Cleaned training frame with Name, SibSp, Parch and Age.
        test_clean: Cleaned test frame with the same columns.

    Returns:
        Tuple ``(train_clean, test_clean)`` of new frames with the added
        feature columns.
    """
    print("\n=== FEATURE ENGINEERING ===")

    # Work on copies so the caller's frames are not modified in place.
    frames = [train_clean.copy(), test_clean.copy()]

    # Create family size feature
    print("Creating family size feature...")
    for frame in frames:
        frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # Create is alone feature
    print("Creating is alone feature...")
    for frame in frames:
        frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)

    # Extract titles from names
    print("Extracting titles from names...")
    # ", <optional 'the '><title>." -- the optional prefix makes
    # "Rothes, the Countess. of ..." yield "Countess", matching the mapping.
    title_pattern = r',\s*(?:the\s+)?([^.,]+?)\s*\.'

    # Group rare titles
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
        'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
        'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
        'Capt': 'Rare', 'Sir': 'Rare',
    }
    # Sorted so the dummy columns come out in the same order get_dummies
    # would produce for plain string data.
    title_levels = ['Master', 'Miss', 'Mr', 'Mrs', 'Rare']
    for frame in frames:
        raw_titles = frame['Name'].str.extract(title_pattern, expand=False)
        grouped = raw_titles.map(title_mapping).fillna('Rare')
        frame['Title'] = pd.Categorical(grouped, categories=title_levels)

    # One-hot encode titles
    frames = [
        pd.get_dummies(frame, columns=['Title'], prefix='Title')
        for frame in frames
    ]

    # Create age bins
    print("Creating age bins...")
    age_edges = [0, 12, 18, 35, 60, 100]
    age_labels = ['Child', 'Teen', 'Adult', 'Middle', 'Senior']
    for frame in frames:
        frame['AgeBin'] = pd.cut(
            frame['Age'], bins=age_edges, labels=age_labels,
            include_lowest=True,
        )

    # One-hot encode age bins
    frames = [
        pd.get_dummies(frame, columns=['AgeBin'], prefix='Age')
        for frame in frames
    ]

    print("Feature engineering completed!")
    train_out, test_out = frames
    return train_out, test_out

# Feature engineering: rebinds train_clean / test_clean to the frames that
# carry the engineered feature columns.
train_clean, test_clean = feature_engineering(train_clean, test_clean)

## Exploratory Data Analysis (EDA)

Analyze the data and create visualizations to understand patterns and relationships.

In [None]:
def exploratory_data_analysis(train_clean, train_df):
    """Print survival summaries and save three exploratory plots.

    All statistics and plots are computed from ``train_df``, the raw
    training frame whose ``Sex`` column still holds the 'male' / 'female'
    labels. ``train_clean`` is accepted for interface compatibility but is
    not read.

    Side effects: prints the summaries, shows each figure, and writes
    ``survival_by_gender.png``, ``survival_by_class.png`` and
    ``age_distribution.png`` to the working directory.

    Args:
        train_clean: Cleaned training frame (unused).
        train_df: Raw training frame with Survived, Sex, Pclass and Age.
    """
    print("\n=== EXPLORATORY DATA ANALYSIS ===")

    # Basic survival statistics
    survival_rate = train_df['Survived'].mean()
    print(f"Overall survival rate: {survival_rate:.2%}")

    # Survival by gender
    gender_survival = train_df.groupby('Sex')['Survived'].mean()
    print(f"Female survival rate: {gender_survival['female']:.2%}")
    print(f"Male survival rate: {gender_survival['male']:.2%}")

    # Survival by passenger class
    class_survival = train_df.groupby('Pclass')['Survived'].mean()
    print("\nSurvival by class:")
    for pclass, rate in class_survival.items():
        print(f"Class {pclass}: {rate:.2%}")

    # Create visualizations
    print("\nCreating visualizations...")

    # Survival by gender plot
    plt.figure(figsize=(8, 5))
    # Pin the bar order explicitly: the tick labels set below are positional,
    # so without a fixed order they would be swapped whenever the first row
    # of the data happens to be a female passenger.
    sns.barplot(x='Sex', y='Survived', data=train_df, order=['male', 'female'])
    plt.title('Survival Rate by Gender')
    plt.xticks([0, 1], ['Male', 'Female'])
    plt.ylabel('Survival Rate')
    plt.savefig('survival_by_gender.png')
    plt.show()

    # Survival by class plot
    plt.figure(figsize=(8, 5))
    sns.barplot(x='Pclass', y='Survived', data=train_df)
    plt.title('Survival Rate by Passenger Class')
    plt.ylabel('Survival Rate')
    plt.savefig('survival_by_class.png')
    plt.show()

    # Age distribution by survival
    plt.figure(figsize=(10, 6))
    sns.histplot(data=train_df, x='Age', hue='Survived', kde=True, bins=30)
    plt.title('Age Distribution by Survival')
    plt.savefig('age_distribution.png')
    plt.show()

    print("EDA completed! Visualizations saved as PNG files.")

# EDA: the raw train_df is passed alongside train_clean because the summaries
# and plots are computed from the original (unencoded) columns.
exploratory_data_analysis(train_clean, train_df)

## Model Building

Build and evaluate machine learning models for survival prediction.

In [None]:
def build_models(train_clean):
    """Train the candidate classifiers and return the best one refit on all data.

    A stratified 80/20 split is used to score a random forest and a logistic
    regression; 5-fold cross-validation accuracy on the full data is printed
    alongside for reference. The model with the highest hold-out accuracy is
    then refit on the complete feature matrix.

    Args:
        train_clean: Feature-engineered training frame containing the
            ``Survived`` target plus the identifier/text columns that are
            dropped before modelling.

    Returns:
        Tuple ``(best_model, X)``: the selected estimator fitted on all
        rows, and the feature matrix it was fitted on.
    """
    print("\n=== MODEL BUILDING ===")

    # Everything except identifiers, free text and the target is a feature.
    non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
    X = train_clean.drop(columns=non_feature_columns)
    y = train_clean['Survived']

    print(f"Training features shape: {X.shape}")

    # Hold out a stratified validation slice.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Candidate estimators, keyed by display name.
    models = {}
    models['Random Forest'] = RandomForestClassifier(
        n_estimators=100, random_state=42
    )
    models['Logistic Regression'] = LogisticRegression(
        random_state=42, max_iter=1000
    )

    # Score each candidate on the hold-out slice and with 5-fold CV.
    results = {}
    print("\nModel Performance:")
    for name in models:
        model = models[name]
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_val, model.predict(X_val))
        results[name] = accuracy

        cv_scores = cross_val_score(model, X, y, cv=5)

        print(f"{name}:")
        print(f"  Validation Accuracy: {accuracy:.4f}")
        print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Pick the highest hold-out accuracy (first one wins on a tie) and refit
    # that estimator on every row.
    best_model_name, _ = max(results.items(), key=lambda item: item[1])
    best_model = models[best_model_name]
    best_model.fit(X, y)  # Train on full data

    print(f"\nBest model: {best_model_name}")
    return best_model, X

# Build models: keeps the selected model (refit on all training rows) and the
# training feature matrix X.
best_model, X = build_models(train_clean)

## Making Predictions

Use the best model to make predictions on the test data and create a submission file.

In [None]:
def make_predictions(model, train_clean, test_clean):
    """Predict survival for the test frame and write the submission CSV.

    The test features are aligned to the training feature columns: columns
    missing from the test frame are added as zeros, columns the training
    frame lacks are discarded, and the training column order is used.

    Args:
        model: Fitted estimator exposing ``predict``.
        train_clean: Feature-engineered training frame; only its column
            layout is used here.
        test_clean: Feature-engineered test frame, including PassengerId.

    Returns:
        DataFrame with ``PassengerId`` and ``Survived`` columns, which is
        also written to ``titanic_submission.csv`` without the index.
    """
    print("\n=== MAKING PREDICTIONS ===")

    # Column layout the model was trained on.
    non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
    feature_columns = train_clean.drop(columns=non_feature_columns).columns

    # One reindex adds absent columns as 0, drops extras and fixes the order.
    test_features = test_clean.drop(
        columns=['PassengerId', 'Name', 'Ticket', 'Cabin']
    )
    X_test = test_features.reindex(columns=feature_columns, fill_value=0)

    predicted = model.predict(X_test)

    submission = pd.DataFrame({
        'PassengerId': test_clean['PassengerId'],
        'Survived': predicted
    })
    submission.to_csv('titanic_submission.csv', index=False)

    survivors = submission['Survived'].sum()
    print("Predictions completed!")
    print(f"Number of predicted survivors: {survivors}/{len(submission)}")
    print("Submission file saved as 'titanic_submission.csv'")

    return submission

# Make predictions with the selected model; this call also writes
# titanic_submission.csv to the working directory.
submission = make_predictions(best_model, train_clean, test_clean)

## Conclusion

The analysis is complete! The notebook has:
- Loaded and cleaned the data
- Engineered new features
- Performed exploratory data analysis with visualizations
- Built and evaluated machine learning models
- Made predictions and created a submission file

Files created:
- titanic_submission.csv (predictions)
- survival_by_gender.png (visualization)
- survival_by_class.png (visualization)
- age_distribution.png (visualization)