# In [4]:
import numpy as np
import pandas as pd
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV

# Load the train and test splits from the current working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# Store test passenger IDs for easy access.
PassengerId = test['PassengerId']

# Both frames receive exactly the same feature engineering, so they are
# processed together. The columns are added in the same order for each frame.
full_data = [train, test]

# Median fare of the training frame, computed once up front. It is the fill
# value for missing fares in BOTH frames (the test frame deliberately reuses
# the training statistic rather than its own).
train_fare_median = train['Fare'].median()

for dataset in full_data:
    # Has_Cabin: 1 when a Cabin value is present, 0 when it is missing.
    # notna() tests for missingness directly instead of inferring it from
    # the Python type of each cell.
    dataset['Has_Cabin'] = dataset['Cabin'].notna().astype(int)

    # FamilySize: SibSp + Parch + 1.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone: 1 when FamilySize is exactly 1, otherwise 0.
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

    # Missing Embarked values default to 'S'.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    # Missing Fare values take the training-set median.
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

# Fill missing age values.
# For each frame, every missing Age is replaced by a random integer drawn
# from [mean - std, mean + std) of that frame's own observed ages, and the
# column is then cast to int.
# A dedicated, seeded generator is used so the imputed ages (and everything
# derived from them downstream) are identical from one run to the next.
age_rng = np.random.default_rng(0)
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    # Compute the null mask once; it supplies both the count and the rows
    # to overwrite.
    age_is_null = dataset['Age'].isnull()
    age_null_count = int(age_is_null.sum())
    # Truncate the float bounds to integers explicitly rather than relying
    # on implicit conversion inside the random-integer call.
    age_low = int(age_avg - age_std)
    age_high = int(age_avg + age_std)
    age_null_random_list = age_rng.integers(age_low, age_high, size=age_null_count)
    dataset.loc[age_is_null, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)


def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Braund, Mr. Owen Harris"``
    yields ``"Mr"``.

    Args:
        name: The raw name value. Normally a string; non-string values
            (such as a NaN from a missing cell) are tolerated.

    Returns:
        The matched title, or ``""`` when the name is not a string or
        contains no such pattern.
    """
    # A missing name arrives as a float NaN; re.search would raise TypeError.
    if not isinstance(name, str):
        return ""
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Title normalisation table: a fixed list of titles collapses into 'Rare',
# and three variant spellings fold into 'Miss' / 'Mrs'.
title_aliases = {
    rare_title: 'Rare'
    for rare_title in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Numeric codes for the Sex column.
sex_codes = {'female': 0, 'male': 1}

# Columns removed from both frames once feature engineering is finished.
drop_columns = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'FamilySize', 'Embarked', 'Title']

for dataset in full_data:
    # Extract the title from each name, then normalise it via the table.
    raw_titles = dataset['Name'].apply(get_title)
    dataset['Title'] = raw_titles.replace(title_aliases)

    # Mapping gender to numeric values.
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

    # Drop unnecessary columns (in place, so `train` and `test` themselves
    # are modified).
    dataset.drop(columns=drop_columns, inplace=True)


# Feature matrix and target for training: every remaining column except the
# label and the passenger identifier.
X_train = train.drop(['Survived', 'PassengerId'], axis=1)
y_train = train['Survived']

# Decision tree used for the hard label predictions. random_state is pinned
# so that the fitted tree, and therefore the exported predictions, are the
# same on every run.
clf = DecisionTreeClassifier(random_state=0)

# Fit the classifier with the training data.
clf.fit(X_train, y_train)

# Sigmoid-calibrated wrapper around the tree, fitted with 5-fold
# cross-validation; used only for the probability column.
calibrated_clf = CalibratedClassifierCV(estimator=clf, method='sigmoid', cv=5)
calibrated_clf.fit(X_train, y_train)

# Predict labels and probabilities for the test set.
# X_test is built before the prediction columns are appended to `test`, so
# it contains feature columns only.
X_test = test.drop(['PassengerId'], axis=1)
# NOTE(review): labels come from the plain tree while probabilities come
# from the calibrated model, so the two columns may disagree - confirm this
# is intended.
test['Survived'] = clf.predict(X_test)  # Predicted labels
# Column 1 of predict_proba is taken as the survival probability; this
# assumes the second class is Survived == 1 - TODO confirm.
test['Survival_Probability'] = calibrated_clf.predict_proba(X_test)[:, 1]

# Create submission DataFrame.
submission = test[['PassengerId', 'Survived', 'Survival_Probability']]

# Save the DataFrame to a new CSV file.
# NOTE(review): the filename says "true_labels", but the Survived column
# written here holds model predictions; the name is kept unchanged in case
# other tooling expects it.
submission.to_csv('new_submission_with_true_labels.csv', index=False)

