In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

%matplotlib inline

In [None]:
# Location of the Titanic CSV, relative to this notebook.
titanic_filepath = '../data/titanic.csv'

# Read the comma-separated file into a data frame.
raw_data = pd.read_csv(titanic_filepath, delimiter=",")

# Preview the first few rows.
raw_data.head()

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
raw_data.info()

In [None]:
# Count the missing entries in every column.
raw_data.isnull().sum()

Age is the only column with any missing data. Will age be a significant determiner of the "Survived" class? Possibly, so it shouldn't necessarily be dropped. The missing data accounts for approximately 20% of the rows, which means it could have a significant impact on our model performance. We may want to revisit how we cleaned this data later.

In [None]:
# As a first approximation, replace the missing ages with the median age.
# Only the "Age" column is filled: a frame-wide fillna(value) would also write
# the median age into any other column that happens to contain missing values.
age_median = raw_data["Age"].median()
clean_data = raw_data.fillna({"Age": age_median})

# Plot the density of the imputed ages to inspect the shape of the distribution
# (the original analysis noted that it looks roughly symmetric).
clean_data["Age"].plot.density();

We know that the names and passenger IDs are going to be unique to each row, and therefore need to be removed.

In [None]:
# Drop the per-row identifier columns before modelling.
identifier_columns = ["Name", "PassengerId"]
basic_feature_data = clean_data.drop(columns=identifier_columns)

# Preview the remaining columns.
basic_feature_data.head()

Let us look at the target class distribution of the data set.

In [None]:
# Relative frequency of each target class, drawn as a bar chart.
survival_share = basic_feature_data["Survived"].value_counts(normalize=True)
survival_share.plot(
    kind="bar",
    color=["navy", "gold"],
    title="Survival class distribution",
);

# Absolute number of rows in each class.
survival_counts = basic_feature_data["Survived"].value_counts()
print(survival_counts)



This distribution may reduce the performance of our model, so we are going to resample our data so that both classes have the same number of samples. This will be achieved by undersampling the majority class ("No"/0). We have a reasonable number of minority-class samples (342), so although we will lose some predictive power for "No", our model should generalise better.

In [None]:
# Separate majority ("No"/0) and minority ("Yes"/1) classes
df_majority = basic_feature_data[basic_feature_data["Survived"]==0]
df_minority = basic_feature_data[basic_feature_data["Survived"]==1]

# Undersample the majority class down to the size of the minority class.
# The target size is derived from the data instead of being hard-coded, so the
# classes stay balanced if the input file or the cleaning steps change.
df_majority_downsampled = resample(df_majority,
                                 replace=False,               # sample without replacement
                                 n_samples=len(df_minority),  # to match minority class
                                 random_state=123)            # reproducible results

# Stack the two classes back together; sort=True orders the columns alphabetically.
basic_feature_data = pd.concat([df_majority_downsampled, df_minority], axis=0, sort=True)

# Rebuild a clean 0..n-1 index so later column-wise concatenation aligns row by row.
basic_feature_data = basic_feature_data.reset_index(drop=True)

# Display new class counts: both classes should now have len(df_minority) rows.
basic_feature_data["Survived"].value_counts()


Our "Sex" attribute is categorical, and should be encoded to numerical data.

In [None]:
# Initialise the encoder
one_hot_encoder = OneHotEncoder()

# Encode the "Sex" column and convert the encoder output to a dense array.
sex_data_array = one_hot_encoder.fit_transform(basic_feature_data[["Sex"]]).toarray()

# Store the names of the generated columns.
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out and only fall back on older installations.
if hasattr(one_hot_encoder, "get_feature_names_out"):
    column_names = one_hot_encoder.get_feature_names_out(["Sex"])
else:
    column_names = one_hot_encoder.get_feature_names(["Sex"])

# Create a new data frame with the encoded sex data.
sex_encoded_data = pd.DataFrame(data=sex_data_array, columns=column_names)

sex_encoded_data.head()

Let's separate our target variable from our features and add our new "Sex" columns.

In [None]:
# Keep the target column on its own.
survived_labels = basic_feature_data["Survived"]

# Features without the target and without the raw categorical "Sex" column.
numeric_features = basic_feature_data.drop(columns=["Survived", "Sex"])

# Append the one-hot encoded "Sex" columns side by side.
basic_feature_data = pd.concat([numeric_features, sex_encoded_data], axis=1)

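We now need to scale our data. Because we are using a model which relies on a distance metric, it is important to use normalisation scaling so that all features are comparable. Note that scikit-learn's `Normalizer` rescales each sample (row) to unit norm, rather than scaling each feature (column) independently.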

In [None]:
# Initialise normalizer
# NOTE(review): Normalizer rescales each row (sample) to unit norm; it does not
# put the individual features on a common scale. If per-feature scaling is the
# intent, a column-wise scaler would be needed instead - confirm before changing,
# since it alters the reported scores.
normal_scaler = Normalizer()

# Fit and transform the data with the normalizer.
# Pass the column Index directly: wrapping it in a list makes pandas build a
# one-level MultiIndex, so every column label would become a 1-tuple.
normalised_feature_data = pd.DataFrame(normal_scaler.fit_transform(X=basic_feature_data),
                                       columns=basic_feature_data.columns)

normalised_feature_data.head()

Let's assume that we will get the best performance from our model using all the variables. We now have some features ready to use, so we need to make a training and test split.

In [None]:
# Split the data set into training and test sets.
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
feature_matrix = normalised_feature_data.to_numpy()
split_arrays = train_test_split(
    feature_matrix,
    survived_labels,
    test_size=0.2,
    random_state=123,
)
X_train, X_test, y_train, y_test = split_arrays

Let's first train and evaluate a model using the default arguments, where $K=5$.

In [None]:
# Initialise the classifier object
neighbour_initial_model = KNeighborsClassifier()

# Fit the model to the training data.
neighbour_initial_model.fit(X_train, y_train)

# Predict values on the test set using the trained model.
init_y_pred = neighbour_initial_model.predict(X_test)

In [None]:
# Set the names for the classification report to produce.
target_names = ["No", "Yes"]

# Generate the report using the target test and prediction values.
classif_report = classification_report(y_test, init_y_pred, target_names=target_names)

print(classif_report)

This is a good first attempt, but we could probably improve the F1 score by selecting a better $K$.

In [None]:
# Define the parameters and the values we want to search.
parameters = {"n_neighbors":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]}

# Select the model type we have chosen.
neighbour_improved_model = KNeighborsClassifier()

# Set the number of folds we want to have to get 80:20 train/test split.
n_cv = 5

# Define our grid search model to find optimal parameters.
opt_model = GridSearchCV(estimator=neighbour_improved_model, param_grid=parameters, scoring="f1", cv=n_cv)

# Fit our parameter search model.
opt_model.fit(X_train, y_train)

print("\nThe best parameters found are: \n\n", opt_model.best_params_)

# Predict target values based on best model found.
better_y_pred = opt_model.best_estimator_.predict(X_test)

# Generate the report using the target test and predicted values.
classif_report_new = classification_report(y_test, better_y_pred, target_names=target_names)

print(classif_report_new)

We have managed to increase the F1 score by 4% by just changing $K$, but the nearer we get to 100% the harder it is to increase our score as we get closer to the limit of reducible error. Maybe a more flexible model would be useful here or there are better ways to prepare our data. How would you improve this workflow? What is the highest score you can achieve?