# Titanic Survival Prediction
## Decision Tree vs Random Forest Comparison


## Data analysis and wrangling

In [29]:
import pandas as pd
import numpy as np
import random as rnd
!apt-get install graphviz -y
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder

Reading package lists... Done
Building dependency tree       
Reading state information... Done
graphviz is already the newest version (2.38.0-17).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


## Acquire data

In [30]:
# Read the train and test CSVs from the input directory, in that order.
train_df, test_df = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

# Keep both frames in one list so they can be iterated over together.
combine = [train_df, test_df]

## Analyze by describing data

In [31]:
# Split the training columns by dtype: numeric vs. everything else.
numeric_columns = train_df.select_dtypes(include='number').columns
non_numeric_columns = train_df.select_dtypes(exclude='number').columns

# Print every column name first, then each dtype group as a plain list.
print(train_df.columns.values)
for column_group in (numeric_columns, non_numeric_columns):
    print(column_group.tolist())

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [32]:
# Display the first five training rows to sanity-check the parsed columns.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
# Count missing entries per column of the raw training frame.
missing_per_column = train_df.isna().sum()

print('Missing values:')
print(missing_per_column)

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Preprocessing and Wrangling

In [34]:
# Work on copies so the frames loaded from disk are not modified by imputation.
train_processed = train_df.copy()
test_processed = test_df.copy()

# Impute missing values: Age and Fare with the column median, Embarked with
# the most frequent value. Each frame is imputed from its own statistics.
#
# The result is assigned back to the column instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form operates
# on an intermediate Series, emits a FutureWarning on pandas >= 2.1 and no
# longer updates the parent frame once Copy-on-Write is enabled.
#
# NOTE(review): the test frame is imputed from test-set statistics, whereas
# the label encoders later in the notebook are fitted on train only. Consider
# reusing the training statistics here for consistency.
for frame in (train_processed, test_processed):
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    frame['Embarked'] = frame['Embarked'].fillna(frame['Embarked'].mode()[0])
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())

print('After handling missing values:')
print(train_processed.isnull().sum())

After handling missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


## Feature Engineering

In [35]:
print('\nFEATURE ENGINEERING:')


# 1. Title: the honorific found in `Name` (a run of letters preceded by a
#    space and followed by a period, e.g. "Braund, Mr. Owen Harris" -> "Mr"),
#    normalised and encoded as a small integer.

# Raw string on purpose: '\.' inside a plain string literal is an invalid
# escape sequence and triggers a SyntaxWarning on Python 3.12+.
title_pattern = r' ([A-Za-z]+)\.'

# Uncommon honorifics collapse into a single 'Rare' bucket; spelling variants
# fold into their common equivalent.
title_replacements = {
    **dict.fromkeys(['Lady', 'Countess', 'Capt', 'Col',
                     'Don', 'Dr', 'Major', 'Rev', 'Sir',
                     'Jonkheer', 'Dona'], 'Rare'),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}

# Integer code per normalised title; anything else becomes 0 below.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for raw, processed in ((train_df, train_processed), (test_df, test_processed)):
    titles = (
        raw['Name']
        .str.extract(title_pattern, expand=False)
        .replace(title_replacements)
        .map(title_mapping)
        .fillna(0)  # no title matched, or title absent from title_mapping
    )
    # The raw frame also receives the (un-cast) Title column, exactly as
    # before, in case other cells read it from there.
    raw['Title'] = titles
    processed['Title'] = titles.astype(int)
print('Title feature created')


FEATURE ENGINEERING:
Title feature created


In [36]:
# 2. FamilySize: siblings/spouses + parents/children + the passenger.
#    SibSp and Parch are read from the raw frames.
for raw, processed in ((train_df, train_processed), (test_df, test_processed)):
    processed['FamilySize'] = raw['SibSp'] + raw['Parch'] + 1
print('FamilySize feature created')

# 3. IsAlone: 1 when FamilySize is exactly 1, otherwise 0.
for processed in (train_processed, test_processed):
    processed['IsAlone'] = processed['FamilySize'].eq(1).astype(int)
print('IsAlone feature created')

# 4. Age*Class interaction (UNIQUE FEATURE): product of Age and Pclass.
for processed in (train_processed, test_processed):
    processed['Age*Class'] = processed['Age'].mul(processed['Pclass'])
print('Age*Class interaction feature created')

FamilySize feature created
IsAlone feature created
Age*Class interaction feature created


In [37]:
# 5. CabinDeck (UNIQUE FEATURE): first character of `Cabin`, with 'U' standing
#    in for a missing cabin.
train_decks = train_df['Cabin'].str[0].fillna('U')
test_decks = test_df['Cabin'].str[0].fillna('U')

# Fit on the training decks only. 'U' is always added to the fitted classes so
# it can serve as the fallback below; when the training data already contains
# 'U' the resulting codes are unchanged.
le_cabindeck = LabelEncoder()
le_cabindeck.fit(sorted(set(train_decks) | {'U'}))
train_processed['CabinDeck'] = le_cabindeck.transform(train_decks)

# LabelEncoder.transform raises ValueError on labels it was not fitted on, so
# any deck letter that appears only in the test set is treated as unknown.
test_decks = test_decks.where(test_decks.isin(le_cabindeck.classes_), 'U')
test_processed['CabinDeck'] = le_cabindeck.transform(test_decks)
print('CabinDeck feature created')

# 6. HasCabin (UNIQUE FEATURE): 1 when a cabin value is present, else 0.
train_processed['HasCabin'] = train_df['Cabin'].notna().astype(int)
test_processed['HasCabin'] = test_df['Cabin'].notna().astype(int)
print('HasCabin feature created')

print('\nFinal features after engineering:')
print(train_processed.columns.tolist())

CabinDeck feature created
HasCabin feature created

Final features after engineering:
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Age*Class', 'CabinDeck', 'HasCabin']


## Drop Unnecessary Columns

In [38]:
# Columns removed before modelling.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']

# Rebind to a new frame rather than dropping in place, and ignore columns that
# are already absent. This keeps the cell re-runnable (the in-place version
# raised KeyError on a second run) and treats both frames the same way.
train_processed = train_processed.drop(columns=columns_to_drop, errors='ignore')
test_processed = test_processed.drop(columns=columns_to_drop, errors='ignore')

print('Columns after dropping:')
print(train_processed.columns.tolist())

Columns after dropping:
['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Age*Class', 'CabinDeck', 'HasCabin']


## Encode Categorical Variables

In [39]:
# One label encoder per categorical column. Each is fitted on the training
# column and then reused, unchanged, to transform the test column.
le_sex = LabelEncoder()
le_embarked = LabelEncoder()

for column, encoder in (('Sex', le_sex), ('Embarked', le_embarked)):
    train_processed[column] = encoder.fit_transform(train_processed[column])
    test_processed[column] = encoder.transform(test_processed[column])

print('After encoding:')
print(train_processed.head())

After encoding:
   Survived  Pclass  Sex    ...     Age*Class  CabinDeck  HasCabin
0         0       3    1    ...          66.0          8         0
1         1       1    0    ...          38.0          2         1
2         1       3    0    ...          78.0          8         0
3         1       1    0    ...          35.0          2         1
4         0       3    1    ...         105.0          8         0

[5 rows x 12 columns]


## Prepare X and y

In [40]:
# Separate the target from the predictors; the test frame is used as-is (copied).
target_column = 'Survived'
Y_train = train_processed[target_column]
X_train = train_processed.drop(columns=[target_column])
X_test = test_processed.copy()

print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)
print('X_test shape:', X_test.shape)

print('\nFeatures used:')
print(list(X_train.columns))

X_train shape: (891, 11)
Y_train shape: (891,)
X_test shape: (418, 11)

Features used:
['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Age*Class', 'CabinDeck', 'HasCabin']


## Decision Tree

In [41]:
# Fit a depth-limited decision tree on the full training set and report its
# accuracy on that same data.
dt_params = dict(max_depth=5, min_samples_split=10, min_samples_leaf=5, random_state=42)
decision_tree = DecisionTreeClassifier(**dt_params)
decision_tree.fit(X_train, Y_train)

dt_train_accuracy = decision_tree.score(X_train, Y_train)
print('Decision Tree Training Accuracy:', round(dt_train_accuracy, 4))

Decision Tree Training Accuracy: 0.8507


In [42]:
# Shuffled 5-fold splitter with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracy of the decision tree, and the mean used for the comparison.
dt_cv_scores = cross_val_score(
    decision_tree, X_train, Y_train, cv=kfold, scoring='accuracy'
)
dt_accuracy = dt_cv_scores.mean()

print('Decision Tree - 5-Fold Cross-Validation Scores:')
print('Fold scores:', dt_cv_scores)
print('Average accuracy:', round(dt_accuracy, 4))
print('Standard deviation:', round(dt_cv_scores.std(), 4))

Decision Tree - 5-Fold Cross-Validation Scores:
Fold scores: [0.81564246 0.82022472 0.80898876 0.76966292 0.83707865]
Average accuracy: 0.8103
Standard deviation: 0.0223


## Random Forest

In [43]:
# Fit a 100-tree random forest with the same depth/split/leaf limits as the
# single tree, and report its accuracy on the training data.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
random_forest = RandomForestClassifier(**rf_params)
random_forest.fit(X_train, Y_train)

rf_train_accuracy = random_forest.score(X_train, Y_train)
print('Random Forest Training Accuracy:', round(rf_train_accuracy, 4))

Random Forest Training Accuracy: 0.853


In [44]:
# Per-fold accuracy of the random forest, evaluated with the `kfold` splitter
# created in the decision-tree cross-validation cell.
rf_cv_scores = cross_val_score(
    random_forest, X_train, Y_train, cv=kfold, scoring='accuracy'
)
rf_accuracy = rf_cv_scores.mean()

print('Random Forest - 5-Fold Cross-Validation Scores:')
print('Fold scores:', rf_cv_scores)
print('Average accuracy:', round(rf_accuracy, 4))
print('Standard deviation:', round(rf_cv_scores.std(), 4))

Random Forest - 5-Fold Cross-Validation Scores:
Fold scores: [0.82122905 0.82022472 0.85955056 0.81460674 0.83707865]
Average accuracy: 0.8305
Standard deviation: 0.0163


## Comparison and Analysis

In [45]:

print('MODEL COMPARISON')

print(f'Decision Tree Accuracy: {dt_accuracy:.4f}')
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

# Signed gap between the mean cross-validated accuracies (forest minus tree).
accuracy_gap = rf_accuracy - dt_accuracy
print(f'Difference: {abs(accuracy_gap):.4f}')

# The printed gap is in percentage points (accuracy difference x 100).
if accuracy_gap > 0:
    print(f'\nRandom Forest is better by {accuracy_gap*100:.2f}%')
elif accuracy_gap < 0:
    print(f'\nDecision Tree is better by {-accuracy_gap*100:.2f}%')
else:
    # Previously a tie fell through to "Decision Tree is better by 0.00%".
    print('\nBoth models have the same cross-validated accuracy')

MODEL COMPARISON
Decision Tree Accuracy: 0.8103
Random Forest Accuracy: 0.8305
Difference: 0.0202

Random Forest is better by 2.02%


In [46]:
# Pair every predictor with the fitted forest's importance score and list
# them from most to least important.
print('\nFeature Importance (Random Forest):')

importance_table = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': random_forest.feature_importances_,
    }
)
feature_imp = importance_table.sort_values(by='Importance', ascending=False)

print(feature_imp)


Feature Importance (Random Forest):
       Feature  Importance
5        Title    0.278729
1          Sex    0.250102
3         Fare    0.091280
8    Age*Class    0.089562
0       Pclass    0.069672
6   FamilySize    0.061012
9    CabinDeck    0.050703
2          Age    0.048826
10    HasCabin    0.035952
4     Embarked    0.014709
7      IsAlone    0.009454


In [47]:

# Fixed recap of the preprocessing steps, printed one entry per line. Only the
# final feature list is read from X_train; the rest is static text.
summary_lines = (
    'PREPROCESSING SUMMARY',
    '\n1. Missing Value Handling:',
    '   - Age: Filled with median',
    '   - Embarked: Filled with mode',
    '   - Fare: Filled with median',
    '\n2. Feature Engineering (6 new features):',
    '   - Title: Extracted from Name',
    '   - FamilySize: SibSp + Parch + 1',
    '   - IsAlone: Whether passenger traveled alone',
    '   - Age*Class: Interaction between Age and Pclass',
    '   - CabinDeck: First letter of cabin (deck level)',
    '   - HasCabin: Whether cabin information exists',
    '\n3. Categorical Encoding:',
    '   - Sex: Encoded to numeric',
    '   - Embarked: Encoded to numeric',
    '   - CabinDeck: Encoded to numeric',
    '\n4. Columns Dropped:',
    '   - PassengerId, Name, Ticket, Cabin, SibSp, Parch',
    '\n5. Final Features Used:',
)
for summary_line in summary_lines:
    print(summary_line)

print('   -', ', '.join(X_train.columns.tolist()))

PREPROCESSING SUMMARY

1. Missing Value Handling:
   - Age: Filled with median
   - Embarked: Filled with mode
   - Fare: Filled with median

2. Feature Engineering (6 new features):
   - Title: Extracted from Name
   - FamilySize: SibSp + Parch + 1
   - IsAlone: Whether passenger traveled alone
   - Age*Class: Interaction between Age and Pclass
   - CabinDeck: First letter of cabin (deck level)
   - HasCabin: Whether cabin information exists

3. Categorical Encoding:
   - Sex: Encoded to numeric
   - Embarked: Encoded to numeric
   - CabinDeck: Encoded to numeric

4. Columns Dropped:
   - PassengerId, Name, Ticket, Cabin, SibSp, Parch

5. Final Features Used:
   - Pclass, Sex, Age, Fare, Embarked, Title, FamilySize, IsAlone, Age*Class, CabinDeck, HasCabin


In [48]:

print('OBSERVATIONS AND CONCLUSIONS')


# Choose the explanation that matches whichever model has the higher mean
# cross-validated accuracy; anything other than a strict forest win uses the
# decision-tree text.
forest_wins = rf_accuracy > dt_accuracy
if not forest_wins:
    explanation = [
        '\nDecision Tree performs better because:',
        '1. Simpler model with less averaging loss',
        '2. Data structure is well-captured by single tree',
        '3. More interpretable results',
        '4. Fewer hyperparameters to tune',
    ]
else:
    explanation = [
        '\nRandom Forest performs better because:',
        '1. Ensemble averaging reduces variance',
        '2. Multiple trees reduce overfitting',
        '3. Random feature selection increases diversity',
        '4. Better generalization on unseen data',
    ]

key_insights = [
    '\nKey Insights:',
    '- Feature engineering improved model performance',
    '- Title, Age*Class, and FamilySize are important predictors',
    '- Cross-validation shows model stability',
]

for message in explanation + key_insights:
    print(message)

OBSERVATIONS AND CONCLUSIONS

Random Forest performs better because:
1. Ensemble averaging reduces variance
2. Multiple trees reduce overfitting
3. Random feature selection increases diversity
4. Better generalization on unseen data

Key Insights:
- Feature engineering improved model performance
- Title, Age*Class, and FamilySize are important predictors
- Cross-validation shows model stability


In [49]:
# Export the fitted decision tree as a PNG image.
#
# The system Graphviz binaries are installed by the `!apt-get install graphviz -y`
# line in the imports cell at the top of the notebook, so the install is not
# repeated here.
from sklearn.tree import export_graphviz
import graphviz

# DOT source for the tree, labelled with the training feature names and with
# class names ordered to match the 0/1 values of `Survived`.
dot_data = export_graphviz(
    decision_tree,
    out_file=None,
    feature_names=X_train.columns.tolist(),
    class_names=['Not Survived', 'Survived'],
    filled=True,
    rounded=True
)

# `graphviz.Source` renders to PDF unless told otherwise, which previously
# produced decision_tree.pdf while the message below claimed a .png file.
graph = graphviz.Source(dot_data, format='png')

# `render` returns the path of the file it wrote; `cleanup=True` removes the
# intermediate DOT source file afterwards.
output_path = graph.render('decision_tree', cleanup=True)
print(f"Tree saved as {output_path}")

Reading package lists... Done
Building dependency tree       
Reading state information... Done
graphviz is already the newest version (2.38.0-17).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
Tree saved as decision_tree.png
