In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [33]:
# Load Titanic dataset directly from URL (no CSV needed)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

print("Dataset loaded successfully!")
print("Shape:", df.shape)
print(df.head(5))


Dataset loaded successfully!
Shape: (891, 12)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0 

In [41]:
# Map 'Sex' to numeric
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Fill missing values (safe assignment, avoids FutureWarning)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Check missing values
print("Missing values after preprocessing:")
print(df.isnull().sum())

# Show sample data
print(df.head())


Missing values after preprocessing:
PassengerId            0
Survived               0
Pclass                 0
Name                   0
Sex                  891
Age                    0
SibSp                  0
Parch                  0
Ticket                 0
Fare                   0
Cabin                687
Embarked               0
PredictedSurvived      0
dtype: int64
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris  NaN  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  NaN  38.0      1      0   
2                             Heikkinen, Miss. Laina  NaN  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  NaN  35.0      1      0   
4   

In [35]:
target = 'Survived'
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

X = df[features]
y = df[target]

# One-hot encode 'Embarked'
X = pd.get_dummies(X, columns=['Embarked'], drop_first=True)

print("Features and target prepared!")
print("Feature sample:")
print(X.head(5))


Features and target prepared!
Feature sample:
   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked_Q  Embarked_S
0       3    0  22.0      1      0   7.2500       False        True
1       1    1  38.0      1      0  71.2833       False       False
2       3    1  26.0      0      0   7.9250       False        True
3       1    1  35.0      1      0  53.1000       False        True
4       3    0  35.0      0      0   8.0500       False        True


In [36]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train-Test split done!")
print("Training samples:", X_train.shape[0])
print("Validation samples:", X_val.shape[0])


Train-Test split done!
Training samples: 712
Validation samples: 179


In [37]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

print("Model trained successfully ✅")


Model trained successfully ✅


In [38]:
val_preds = model.predict(X_val)
accuracy = accuracy_score(y_val, val_preds)

print("Validation Accuracy:", round(accuracy*100, 2), "%")


Validation Accuracy: 81.01 %


In [39]:
full_preds = model.predict(X)
df['PredictedSurvived'] = full_preds

print("Predictions on full dataset added!")
print(df[['PassengerId', 'Survived', 'PredictedSurvived']].head(10))


Predictions on full dataset added!
   PassengerId  Survived  PredictedSurvived
0            1         0                  0
1            2         1                  1
2            3         1                  1
3            4         1                  1
4            5         0                  0
5            6         0                  0
6            7         0                  0
7            8         0                  0
8            9         1                  1
9           10         1                  1
