In [12]:
!pip install xgboost



In [13]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [14]:
#import libraries
import pandas as pd
import numpy as np

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
#load data and preprocessing

In [17]:
# Read the train and test CSV files into DataFrames
train, test = (
    pd.read_csv('../../12_data/titanic/train.csv'),
    pd.read_csv('../../12_data/titanic/test.csv'),
)

In [18]:
#preprocessing

In [19]:
def preprocess(df):
    """Impute and integer-encode the columns used as model features.

    The frame is modified in place: 'Age', 'Embarked', 'Fare' and 'Sex' are
    overwritten on `df`, so every reference to the same frame sees the
    filled/encoded values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Pclass', 'Sex', 'Age', 'Fare' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        The columns ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked'] of `df`.
    """
    # Fixed code tables instead of fitting a new encoder per frame: every frame
    # gets the same integer for the same category even when one of the
    # categories is absent from it. The codes follow alphabetical order.
    sex_codes = {'female': 0, 'male': 1}
    embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

    # Fill missing values in 'Age' column with this frame's median age
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Fill missing values in 'Embarked' column with the value 'S'
    df['Embarked'] = df['Embarked'].fillna('S')

    # Fill missing values in 'Fare' column with this frame's median fare
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Encode 'Sex' (female -> 0, male -> 1). Values without an entry in the
    # table pass through unchanged, so re-running on an already-encoded frame
    # keeps the existing codes.
    df['Sex'] = df['Sex'].map(lambda value: sex_codes.get(value, value))

    # Encode 'Embarked' (C -> 0, Q -> 1, S -> 2) with the same pass-through rule
    df['Embarked'] = df['Embarked'].map(lambda value: embarked_codes.get(value, value))

    # Return selected features for modeling
    return df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]

# Build the model inputs: training labels, training features, test features
y = train['Survived']
X = preprocess(train)

X_test = preprocess(test)

In [20]:
#ensemble model training

In [21]:
'''
log_clf, rf_clf, and xgb_clf are three different classifiers: Logistic Regression, Random Forest, and XGBoost respectively.
These classifiers are combined into a single ensemble model called VotingClassifier.
The ensemble uses soft voting, meaning it averages the predicted probabilities from each classifier to make the final prediction.
Finally, voting_clf.fit(X, y) trains the ensemble model on the training data features X and labels y.
'''

'\nlog_clf, rf_clf, and xgb_clf are three different classifiers: Logistic Regression, Random Forest, and XGBoost respectively.\nThese classifiers are combined into a single ensemble model called VotingClassifier.\nThe ensemble uses soft voting, meaning it averages the predicted probabilities from each classifier to make the final prediction.\nFinally, voting_clf.fit(X, y) trains the ensemble model on the training data features X and labels y.\n'

In [22]:
# Initialize the three base classifiers of the ensemble
log_clf = LogisticRegression(max_iter=1000)  # allow up to 1000 solver iterations
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees, fixed seed
# `use_label_encoder` is not passed: the installed XGBoost reported
# 'Parameters: { "use_label_encoder" } are not used.' when it was supplied.
xgb_clf = XGBClassifier(eval_metric='logloss')

# Create a VotingClassifier ensemble using soft voting
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rf_clf),
        ('xgb', xgb_clf)
    ],
    voting='soft'  # Use soft voting: average predicted probabilities
)


# Train the ensemble model on training data
voting_clf.fit(X, y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [23]:
#predictions

In [24]:
# Predict the target variable (Survived) for the test features X_test with the trained voting classifier.
preds = voting_clf.predict(X_test)


# Build the submission DataFrame with two columns: the PassengerId from the test set
# (to identify each passenger) and the predicted survival outcomes preds.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': preds
})
# Save the submission DataFrame to 'voting_submission.csv'. The code previously wrote to
# 'gender_submission.csv' in the data directory, contradicting the stated file name.
# NOTE(review): 'gender_submission.csv' looks like a file shipped with the dataset - confirm it should not be overwritten.
submission.to_csv('../../12_data/titanic/voting_submission.csv', index=False)



In [25]:
#Feature Engineering

In [26]:
# Discretize Age into five labelled bands; the bin edges delimit the
# intervals 0-12, 12-20, 20-40, 40-60 and 60-80.
labels = ['Child', 'Teen', 'Adult', 'MidAge', 'Senior']
bins = [0, 12, 20, 40, 60, 80]
for frame in (train, test):
    # Same edges and labels for both splits so the categories line up
    frame['AgeGroup'] = pd.cut(frame['Age'], bins=bins, labels=labels)

In [27]:
# Count how many training rows fall into each AgeGroup
age_group_counts_train = train['AgeGroup'].value_counts()
print(age_group_counts_train)

AgeGroup
Adult     562
MidAge    128
Teen      110
Child      69
Senior     22
Name: count, dtype: int64


In [28]:
# Spot-check the binning: Age next to AgeGroup for the first 10 training rows
age_preview_train = train[['Age', 'AgeGroup']].head(10)
print(age_preview_train)

    Age AgeGroup
0  22.0    Adult
1  38.0    Adult
2  26.0    Adult
3  35.0    Adult
4  35.0    Adult
5  28.0    Adult
6  54.0   MidAge
7   2.0    Child
8  27.0    Adult
9  14.0     Teen


In [29]:
# Count how many test rows fall into each AgeGroup
age_group_counts_test = test['AgeGroup'].value_counts()
print(age_group_counts_test)

AgeGroup
Adult     272
MidAge     66
Teen       44
Child      25
Senior     11
Name: count, dtype: int64


In [30]:
# Spot-check the binning: Age next to AgeGroup for the first 10 test rows
age_preview_test = test[['Age', 'AgeGroup']].head(10)
print(age_preview_test)

    Age AgeGroup
0  34.5    Adult
1  47.0   MidAge
2  62.0   Senior
3  27.0    Adult
4  22.0    Adult
5  14.0     Teen
6  30.0    Adult
7  26.0    Adult
8  18.0     Teen
9  21.0    Adult


In [31]:
# FamilySize = siblings/spouses + parents/children + the passenger themself
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [32]:
# Count how many training rows have each FamilySize value
family_size_counts_train = train['FamilySize'].value_counts()
print(family_size_counts_train)

FamilySize
1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: count, dtype: int64


In [33]:
# Spot-check FamilySize against its inputs for the first 10 training rows
family_preview_train = train[['SibSp', 'Parch', 'FamilySize']].head(10)
print(family_preview_train)

   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1
5      0      0           1
6      0      0           1
7      3      1           5
8      0      2           3
9      1      0           2


In [34]:
# Count how many test rows have each FamilySize value
family_size_counts_test = test['FamilySize'].value_counts()
print(family_size_counts_test)

FamilySize
1     253
2      74
3      57
4      14
5       7
7       4
11      4
6       3
8       2
Name: count, dtype: int64


In [35]:
# Spot-check FamilySize against its inputs for the first 10 test rows
family_preview_test = test[['SibSp', 'Parch', 'FamilySize']].head(10)
print(family_preview_test)

   SibSp  Parch  FamilySize
0      0      0           1
1      1      0           2
2      0      0           1
3      0      0           1
4      1      1           3
5      0      0           1
6      0      0           1
7      1      1           3
8      0      0           1
9      2      0           3


In [36]:
# Extract Title from Name using regex
import re

In [37]:
# get_title extracts the title (e.g., Mr, Mrs, Miss, Dr) from a person's name.
def get_title(name):
    """Return the title embedded in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period, e.g. 'Palsson, Master. Gosta Leonard' -> 'Master'.

    Parameters
    ----------
    name : str
        Full passenger name. Non-string values (such as a missing NaN entry)
        yield "".

    Returns
    -------
    str
        The matched title without the trailing period, or "" when no title
        is found.
    """
    # A missing name has no title; re.search would raise TypeError on it.
    if not isinstance(name, str):
        return ""
    # Raw string: in a normal string literal '\.' is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Derive a Title column from Name in both splits
for frame in (train, test):
    frame['Title'] = frame['Name'].apply(get_title)

In [38]:
# Preview extracted titles: the first 10 'Master' rows of train, then the first 10 rows of test
master_rows = train[train['Title'] == 'Master']
print(master_rows[['Name', 'Title']].head(10))
print("========================================")
test_title_preview = test[['Name', 'Title']].head(10)
print(test_title_preview)

                                   Name   Title
7        Palsson, Master. Gosta Leonard  Master
16                 Rice, Master. Eugene  Master
50           Panula, Master. Juha Niilo  Master
59   Goodwin, Master. William Frederick  Master
63                Skoog, Master. Harald  Master
65             Moubarek, Master. Gerios  Master
78        Caldwell, Master. Alden Gates  Master
125        Nicola-Yarred, Master. Elias  Master
159          Sage, Master. Thomas Henry  Master
164        Panula, Master. Eino Viljami  Master
                                           Name Title
0                              Kelly, Mr. James    Mr
1              Wilkes, Mrs. James (Ellen Needs)   Mrs
2                     Myles, Mr. Thomas Francis    Mr
3                              Wirz, Mr. Albert    Mr
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   Mrs
5                    Svensson, Mr. Johan Cervin    Mr
6                          Connolly, Miss. Kate  Miss
7                  Caldwell, Mr. Albert 

In [39]:
# Show the first 10 training passengers whose extracted title is 'Master'
is_master = train['Title'] == 'Master'
print(train.loc[is_master, ['Name', 'Title']].head(10))

                                   Name   Title
7        Palsson, Master. Gosta Leonard  Master
16                 Rice, Master. Eugene  Master
50           Panula, Master. Juha Niilo  Master
59   Goodwin, Master. William Frederick  Master
63                Skoog, Master. Harald  Master
65             Moubarek, Master. Gerios  Master
78        Caldwell, Master. Alden Gates  Master
125        Nicola-Yarred, Master. Elias  Master
159          Sage, Master. Thomas Henry  Master
164        Panula, Master. Eino Viljami  Master


In [40]:
# Collapse the listed infrequent titles into a single 'Rare' category in both splits
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
for frame in (train, test):
    frame['Title'] = frame['Title'].replace(rare_titles, 'Rare')

In [41]:
# Title frequencies for train, then for test
for frame in (train, test):
    print(frame['Title'].value_counts())

Title
Mr        517
Miss      182
Mrs       125
Master     40
Rare       23
Mlle        2
Mme         1
Ms          1
Name: count, dtype: int64
Title
Mr        240
Miss       78
Mrs        72
Master     21
Rare        6
Ms          1
Name: count, dtype: int64


In [42]:
# Fold the variants 'Mlle' and 'Ms' into 'Miss' in both splits
for frame in (train, test):
    frame['Title'] = frame['Title'].replace(['Mlle', 'Ms'], 'Miss')