In [1]:
import numpy as np
import pandas as pd
import re
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
# Read in titanic training data
titanic_df = pd.read_csv("../_Data/train.csv")
titanic_testing_df = pd.read_csv("../_Data/test.csv")

In [3]:
# Append training and test data
titanic_combined_df = titanic_df.append(titanic_testing_df)

In [4]:
# Populate the title feature for each person
titanic_combined_df["Title"] = ""

def extract_name(name):
    if "Miss." in name:
        return "1"
    elif "Master." in name:
        return "2"
    elif any(x in name for x in ['Mrs.', 'Ms.', 'Mme.', 'Lady', 'Mlle', 'Countess', 'Dona']):
        return "3"
    elif any(x in name for x in ['Mr.', 'Don.', 'Dr.', 'Rev.', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer']):
        return "4"
    else:
        return ""

titanic_combined_df['Title'] = titanic_combined_df.apply(lambda row: extract_name(row['Name']), axis=1)

In [5]:
# Convert Genders to numbers
titanic_combined_df = titanic_combined_df.replace('female', 0)
titanic_combined_df = titanic_combined_df.replace('male', 1)

In [6]:
# Create a cleaned dataset to work from
titanic_cleaned_df = titanic_combined_df
titanic_cleaned_df = titanic_cleaned_df.drop(['Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'SibSp', 'Survived', 'Ticket'], axis=1)

In [7]:
# Split the dataset into the set without labels, and set with labels for training/validating/testing
titanic_no_age_labels = titanic_cleaned_df[titanic_cleaned_df['Age'].isna()]
titanic_with_age_labels = titanic_cleaned_df[titanic_cleaned_df['Age'] > 0]
titanic_with_age_labels = titanic_with_age_labels.drop(['PassengerId'], axis=1)

In [8]:
# Split the labelled dataset into features and labels
titanic_features = titanic_with_age_labels[['Pclass', 'Sex', 'Title']]
titanic_labels = titanic_with_age_labels[['Age']]

In [29]:
# Split features and labels into training and testing sets
features_train, features_test, labels_train, labels_test = train_test_split(titanic_features, titanic_labels, test_size=0.10, random_state=10)

In [30]:
# Grid Search for Decision Tree Regressor
dtr_grid_search = tree.DecisionTreeRegressor()
param_grid = {'criterion': ['mse', 'friedman_mse'], 'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}
dtr_grid_search = GridSearchCV(dtr_grid_search, param_grid, verbose=0, n_jobs=-1)
dtr_grid_search.fit(features_train, labels_train)
dtr_grid_search.best_params_
print(dtr_grid_search.best_params_)
print(f'Best score: {dtr_grid_search.best_score_}')

{'criterion': 'friedman_mse', 'min_samples_split': 8}
Best score: 0.38656024543987605


In [31]:
# Grid Search for Linear Regression
lr_grid_search = tree.DecisionTreeRegressor()
param_grid = {}
lr_grid_search = GridSearchCV(lr_grid_search, param_grid, verbose=0, n_jobs=-1)
lr_grid_search.fit(features_train, labels_train)
lr_grid_search.best_params_
print(lr_grid_search.best_params_)
print(f'Best score: {lr_grid_search.best_score_}')

{}
Best score: 0.3864970636073869


In [32]:
# Decision Tree Regressor for testing data
dtr = tree.DecisionTreeRegressor(min_samples_split=8)
dtr = dtr.fit(features_train, labels_train)
labels_predict = dtr.predict(features_test)
regression_score_testing_date = dtr.score(features_test, labels_test)
print(f"The score (r^2) for the testing dataset: {regression_score_testing_date:.3f}")

The score (r^2) for the testing dataset: 0.440


In [33]:
# Predict the dataset that doesn't have ages
titanic_no_age_labels_prepared = titanic_no_age_labels.drop(['PassengerId', 'Age'], axis=1)
labels_no_age_predict = dtr.predict(titanic_no_age_labels_prepared)

In [34]:
# Create a new dataframe out of the age label predictions
predicted_age_labels = pd.DataFrame(labels_no_age_predict, index=None, columns=['Age_Pred'])

In [35]:
# Reset the dataframe indexes so that age transers appear in correct order
titanic_no_age_labels = titanic_no_age_labels.reset_index(drop=True)

In [36]:
# Copy the predicted age labels to the actual dataset without age labels
titanic_no_age_labels['Age_Pred'] = predicted_age_labels['Age_Pred']

In [37]:
# Drop the unnecessary columns in preparation for the join
titanic_no_age_labels = titanic_no_age_labels.drop(['Pclass', 'Sex', 'Title', 'Age'], axis=1)

In [38]:
# Copy the predicted ages over to the original imported datasets
titanic_df_working = titanic_df
titanic_df_working = titanic_df_working.set_index('PassengerId').join(titanic_no_age_labels.set_index('PassengerId'))
titanic_testing_df_working = titanic_testing_df
titanic_testing_df_working = titanic_testing_df_working.set_index('PassengerId').join(titanic_no_age_labels.set_index('PassengerId'))

In [39]:
# Add predicted ages into the 'Age' columns with rounding to 3 decimal points (this distinguishes the age predictions vs what was provided)
def predicted_ages(age, age_pred):
    if age_pred > 0:
        return round(age_pred, 3)
    else:
        return age

titanic_df_working['Age'] = titanic_df_working.apply(lambda row: predicted_ages(row['Age'], row['Age_Pred']), axis=1)
titanic_testing_df_working['Age'] = titanic_testing_df_working.apply(lambda row: predicted_ages(row['Age'], row['Age_Pred']), axis=1)

In [None]:
# Drop the 'Age_Pred' column and set to final variables
titanic_df_final = titanic_df_working.drop(['Age_Pred'], axis=1)
titanic_testing_df_final = titanic_testing_df_working.drop(['Age_Pred'], axis=1)

In [None]:
# Write to CSV files
titanic_df_final.to_csv('train_with_ages.csv')
titanic_testing_df_final.to_csv('test_with_ages.csv')