In [1]:
import pandas as pd
import numpy as np
import joblib
import re

In [2]:
# 1) Load raw Kaggle-style test file
test = pd.read_csv("test.csv")

In [3]:
sex_map = {'male': 0, 'female': 1}
test['Sex'] = test['Sex'].map(sex_map)

In [4]:
emb_map = {'S': 0, 'C': 1, 'Q': 2}
test['Embarked'] = test['Embarked'].map(emb_map).fillna(0).astype(int)

# FamilySize = SibSp + Parch + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# IsAlone = FamilySize == 1 (0/1 bool to match Cleaned_Data.csv which uses TRUE/FALSE)
test['IsAlone'] = (test['FamilySize'] == 1)

In [5]:
def extract_title(name):
    # Typical titles pattern "Lastname, Title. Firstname"
    m = re.search(r',\s*([^\.]+)\.', str(name))
    return m.group(1).strip() if m else 'Rare'

In [6]:
def map_title_to_buckets(title):
    title = title.strip()
    if title in ['Mr']:
        return 'Mr'
    if title in ['Mrs', 'Mme', 'Lady', 'Countess']:
        return 'Mrs'
    if title in ['Miss', 'Mlle']:
        return 'Miss'
    # Everything else -> Rare (Master, Dr, Rev, Col, Major, Dona, Sir, Capt, Jonkheer, etc.)
    return 'Rare'

In [7]:
titles = test['Name'].apply(extract_title).apply(map_title_to_buckets)

# Build the four boolean indicator columns
test['Title_Miss'] = (titles == 'Miss')
test['Title_Mr']   = (titles == 'Mr')
test['Title_Mrs']  = (titles == 'Mrs')
test['Title_Rare'] = (titles == 'Rare')

In [8]:
feature_cols = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'FamilySize', 'IsAlone', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare'
]

In [9]:
for col in ['Age', 'Fare', 'SibSp', 'Parch']:
    test[col] = test[col].fillna(test[col].median())

In [11]:
import sklearn, joblib
# Now load the model
model = joblib.load('titanic_model.pkl')

In [12]:
X_test_submit = test[feature_cols]
proba = model.predict_proba(X_test_submit)[:, 1]
pred  = (proba >= 0.5).astype(int)

In [13]:
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': pred
})

submission.to_csv('submission.csv', index=False)

In [14]:
print(submission.head())

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1


In [15]:
# Step 2: Predict using the trained model
proba = model.predict_proba(test[feature_cols])[:, 1]
pred = (proba >= 0.5).astype(int)

# Step 3: Combine predictions with relevant features
results = test[['PassengerId'] + feature_cols].copy()
results['Survived_Pred'] = pred
results['Survival_Prob'] = proba

# Step 4: View or save the enriched prediction output
print(results.head())  # or
results.to_csv('detailed_predictions.csv', index=False)

   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  \
0          892       3    0  34.5      0      0   7.8292         2   
1          893       3    1  47.0      1      0   7.0000         0   
2          894       2    0  62.0      0      0   9.6875         2   
3          895       3    0  27.0      0      0   8.6625         0   
4          896       3    1  22.0      1      1  12.2875         0   

   FamilySize  IsAlone  Title_Miss  Title_Mr  Title_Mrs  Title_Rare  \
0           1     True       False      True      False       False   
1           2    False       False     False       True       False   
2           1     True       False      True      False       False   
3           1     True       False      True      False       False   
4           3    False       False     False       True       False   

   Survived_Pred  Survival_Prob  
0              0       0.112779  
1              1       0.522392  
2              0       0.113573  
3              0

In [18]:
# Full feature set used during training
full_feature_cols = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'FamilySize', 'IsAlone', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare'
]

# Ensure all columns are present and filled
for col in ['Age', 'Fare', 'SibSp', 'Parch']:
    test[col] = test[col].fillna(test[col].median())
test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).fillna(0).astype(int)

# Predict using full feature set
X_test_full = test[full_feature_cols]
proba = model.predict_proba(X_test_full)[:, 1]
pred = (proba >= 0.5).astype(int)

# Then analyze only the subset you're interested in
results = test[['PassengerId', 'Sex', 'Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone']].copy()
results['Survived_Pred'] = pred
results['Survival_Prob'] = proba
print(results.head())


   PassengerId  Sex  Pclass   Age     Fare  FamilySize  IsAlone  \
0          892    0       3  34.5   7.8292           1     True   
1          893    1       3  47.0   7.0000           2    False   
2          894    0       2  62.0   9.6875           1     True   
3          895    0       3  27.0   8.6625           1     True   
4          896    1       3  22.0  12.2875           3    False   

   Survived_Pred  Survival_Prob  
0              0       0.111829  
1              1       0.522392  
2              0       0.103232  
3              0       0.106277  
4              1       0.594651  
