In [1]:
!pip install seaborn --upgrade

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Added these..
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Signal that the imports and plotting setup above ran without errors.
print("Setup Complete")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the input directory, then print them
# one per line in walk order.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Read in data

In [1]:
# Load the training and test splits...
input_dir = '/kaggle/input/titanic/'
train = pd.read_csv(input_dir + 'train.csv')
test = pd.read_csv(input_dir + 'test.csv')

# Stack train on top of test with a fresh 0..n-1 index so that feature
# processing can be applied to both at once...
data = pd.concat([train, test], axis=0, ignore_index=True)

print(f'Exemplars: {data.shape[0]}, features: {data.shape[1]}')
data.head()

## 2. Handle missing data 

In [1]:
# Count the missing entries in every column...
missing_per_column = data.isnull().sum()
missing_per_column

* Note: the missing `Survived` values correspond to the test set.

### 2.1 Cabin data

In [1]:
# Keep only the letter component of each cabin code: it likely refers to the
# area of the ship the passenger stayed in, which is likely to relate to class...

# Sanity check: warn once if any recorded cabin does not start with a letter...
recorded_cabins = data['Cabin'].dropna().unique()
if any(not code[0].isalpha() for code in recorded_cabins):
    print('Not all cabins begin with letter.')

# Reduce every known cabin to its first character; missing values stay missing...
data["Cabin"] = data['Cabin'].map(lambda code: code[0], na_action='ignore')

# Visualise cabin-letter frequency for each passenger class and record, per
# class, the relative frequency of each letter (used later for imputation)...
freq_dict = {}
fig = plt.figure(figsize = (25,10))
pclasses = np.unique(data['Pclass'])
for position, c in enumerate(pclasses, start=1):
    class_rows = data[data['Pclass'] == c]
    known_cabins = class_rows['Cabin'].dropna()
    # Sorted letters observed in this class; fixes both the bar order and the
    # order of the stored frequencies.
    letters = np.unique(known_cabins)

    ax = fig.add_subplot(1, len(pclasses), position)
    sns.countplot(x='Cabin', data=class_rows, ax=ax, order=letters);
    ax.tick_params(axis='both', labelsize=25)
    ax.set_xlabel('Cabin', fontsize=25)
    ax.set_ylabel('Count', fontsize=25)
    ax.set_title(f'Class {c}', fontsize=30)

    # Compute the distribution from the data itself rather than reading bar
    # heights back from the plot, so it cannot be affected by how the plotting
    # library builds its patches. Entries are aligned with `letters` and sum to 1.
    freq_dict[c] = known_cabins.value_counts(normalize=True).reindex(letters).to_numpy()
    

In [1]:
# Fill in missing cabins with letters drawn from the distribution of cabins by class...
np.random.seed(1)
for c in np.unique(data['Pclass'].dropna()):
    # Sorted letters seen in this class; freq_dict[c] holds their probabilities
    # in the same order.
    choices = np.unique(data.loc[(data.Pclass == c),'Cabin'].dropna())
    # Only rows of this class that still lack a cabin need a draw; sampling them
    # in one vectorised call avoids generating values for rows that would be
    # thrown away.
    needs_cabin = (data['Pclass'] == c) & data['Cabin'].isna()
    n_missing = int(needs_cabin.sum())
    if n_missing:
        data.loc[needs_cabin, 'Cabin'] = np.random.choice(choices, size=n_missing, p=freq_dict[c])
sns.countplot(x='Cabin', data=data).set_title('Cabin distribution with updated values');


### 2.2 Age missing data

In [1]:
# One age histogram per passenger class, side by side...
fig = plt.figure(figsize = (25,10))
for c in np.unique(data['Pclass']):
    passengers_in_class = data.loc[data['Pclass'] == c]
    ax = fig.add_subplot(1, 3, c)
    sns.histplot(data=passengers_in_class, x='Age', ax=ax);
    # Enlarge the tick labels on both axes of this panel.
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), fontsize=25)
    ax.set_title(f'Class {c}', fontsize=30)
    ax.set_ylabel('Count', fontsize=25)
    ax.set_xlabel('Age', fontsize=25)

In [1]:
# Fill missing age values with the median age of the passenger's class...
# transform('median') returns, for every row, the median age of that row's
# Pclass group (missing ages are ignored when computing it), so each class
# median is computed once instead of once per missing row. No randomness is
# involved, so no seed is needed.
class_median_age = data.groupby('Pclass')['Age'].transform('median')
data['Age'] = data['Age'].fillna(class_median_age)

### 2.3 Fare missing data

In [1]:
# One fare histogram per passenger class, side by side...
fig = plt.figure(figsize = (25,10))
for c in np.unique(data['Pclass']):
    passengers_in_class = data.loc[data['Pclass'] == c]
    ax = fig.add_subplot(1, 3, c)
    sns.histplot(data=passengers_in_class, x='Fare', ax=ax);
    # Enlarge the tick labels on both axes of this panel.
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), fontsize=25)
    ax.set_title(f'Class {c}', fontsize=30)
    ax.set_ylabel('Count', fontsize=25)
    ax.set_xlabel('Fare', fontsize=25)

In [1]:
# Fill missing fare values with the mean fare of the passenger's class...
# transform('mean') returns, for every row, the mean fare of that row's Pclass
# group (missing fares are ignored when computing it), so each class mean is
# computed once instead of once per missing row. No randomness is involved, so
# no seed is needed.
class_mean_fare = data.groupby('Pclass')['Fare'].transform('mean')
data['Fare'] = data['Fare'].fillna(class_mean_fare)

### 2.4 Embarked missing data

In [1]:
# Replace missing embarkation ports with the most frequent port...
most_common_port = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_common_port)

In [1]:
# Re-count missing entries per column to confirm the imputation above...
remaining_missing = data.isnull().sum()
remaining_missing

## 3. Convert categorical features

### 3.1 Encoding ticket data

In [1]:
# Peek at the first five raw ticket strings...
data['Ticket'].head(5)

# I assume the letters preceding the ticket numbers relate to the booking company/method, so I extract
# these to use as a feature...

In [1]:
def _ticket_prefix(ticket):
    """Return the leading token of a ticket string, or 'Z' for all-digit tickets.

    Dots and slashes are treated as separators, so only the text before the
    first separator or space is kept.
    """
    if ticket.isdigit():
        return 'Z'
    separated = ticket.replace('.', ' ').replace('/', ' ')
    # split() without arguments already ignores leading/trailing whitespace.
    return separated.split()[0]


data["Ticket"] = data['Ticket'].map(_ticket_prefix)

In [1]:
# One-hot encode the ticket prefix (columns are named 'Ticket_<prefix>') and
# replace the original column with the indicator columns.
ticket_dummies = pd.get_dummies(data[['Ticket']])
data = data.drop(columns='Ticket').join(ticket_dummies)

### 3.2 Encoding name feature

In [1]:
# Extract titles from names: take the text before the first '.', then keep the
# last space-separated word of it...
data['Title'] = [
    full_name.partition('.')[0].rpartition(' ')[2]
    for full_name in data['Name']
]
data['Title'].value_counts()

In [1]:
# Collapse rare titles into a single upper class ('UC') group and merge Ms into Mrs...
uc_titles = ['Lady', 'Countess', 'Capt', 'Mme', 'Col', 'Don', 'Dr', 'Major',
             'Rev', 'Sir', 'Jonkheer', 'Dona', 'Mlle']
title_groups = {title: 'UC' for title in uc_titles}
title_groups['Ms'] = 'Mrs'

# Titles that are not keys of the mapping are left unchanged.
data["Title"] = data["Title"].replace(title_groups)
data['Title'].value_counts()

In [1]:
# Bar chart of mean 'Survived' per title group...
title_grid = sns.catplot(data=data, x="Title", y="Survived", kind="bar")
title_grid.set_ylabels("Survival probability");

In [1]:
# One-hot encode the grouped title (columns named 'Title_<group>'), then discard
# both the title and the raw name columns.
title_dummies = pd.get_dummies(data[['Title']])
data = data.join(title_dummies).drop(columns=['Title', 'Name'])

### 3.3 Encoding sex feature

In [1]:
# Encode sex as an integer flag: 'male' -> 0, anything else -> 1...
is_not_male = data['Sex'].ne('male')
data["Sex"] = is_not_male.astype(int)

### 3.4 Encoding embarked feature

In [1]:
# One-hot encode the embarkation port (columns named 'Embarked_<port>') and
# replace the original column with the indicator columns.
embrkd_dummies = pd.get_dummies(data[['Embarked']])
data = data.drop(columns='Embarked').join(embrkd_dummies)

### 3.5 Combine and encode siblings/spouse (SibSp) and parents/children (Parch) features

In [1]:
# Family size = siblings/spouses + parents/children travelling with the passenger...
data["fam_size"] = data["SibSp"].add(data["Parch"])

# Binary indicators: travelling alone, with any sibling/spouse, with any parent/child.
data['Single'] = data['fam_size'].eq(0).astype(int)
data['Sib/Sp'] = data['SibSp'].ge(1).astype(int)
data['Par/Ch'] = data['Parch'].ge(1).astype(int)

# The raw counts are no longer needed once the derived features exist.
data = data.drop(columns=['SibSp', 'Parch'])

### 3.6 Encode cabin feature

In [1]:
# One-hot encode the cabin letter (columns named 'Cabin_<letter>') and replace
# the original column with the indicator columns.
cabin_dummies = pd.get_dummies(data[['Cabin']])
data = data.join(cabin_dummies).drop(columns='Cabin')

### 3.7 Encode class feature

In [1]:
# One-hot encode passenger class. Pclass is numeric, and get_dummies applied to
# a DataFrame only converts object/string/category columns by default, so the
# column is passed as a Series with an explicit prefix to force the encoding
# (producing one 'Pclass_<value>' column per class).
pclass_dummies = pd.get_dummies(data['Pclass'], prefix='Pclass')
data = pd.concat(objs=[data, pclass_dummies], axis=1)
data = data.drop(['Pclass'], axis=1)

### 3.8 Transform fare to log space

In [1]:
# Inspect the fare distribution; the legend reports its sample skewness
# (a positive value means a long tail towards high fares)...
fare_skew_label = "Skewness : %.2f" % data["Fare"].skew()
sns.displot(data["Fare"], color="m", label=fare_skew_label);
plt.legend();

In [1]:
# Move fares to log space; log1p computes log(1 + x), so zero fares map to 0...
log_fare = np.log1p(data['Fare'])
data['Fare'] = log_fare

# Re-plot with the skewness of the transformed values in the legend.
log_fare_skew_label = "Skewness : %.2f" % data["Fare"].skew()
sns.displot(data["Fare"], color="m", label=log_fare_skew_label);
plt.legend();

### 3.9 Standardise fare and age data 

In [1]:
# Z-score each continuous feature: subtract the column mean and divide by the
# column (sample) standard deviation, both taken over the combined data.
for column in ('Fare', 'Age'):
    centred = data[column] - data[column].mean()
    data[column] = centred / data[column].std()

## 4. Evaluate model

In [1]:
# PassengerId isn't a useful feature, so remove it from the data...
# ...but keep the ids of the test rows for the submission file.
n_train = train.shape[0]  # the first n_train rows of `data` came from train.csv
test_indicies = data['PassengerId'][n_train:]
data = data.drop(['PassengerId'], axis=1)

# Recover the processed train and test partitions by row position.
train = data[:n_train]
test = data[n_train:]

# Split training data in train and validate (first 85% / last 15% of rows,
# no shuffling).
split = int(n_train * 0.85)
# Request float explicitly: with boolean indicator columns in the frame a plain
# np.array(...) conversion would produce an object-dtype array.
x = train.loc[:, train.columns != 'Survived'].to_numpy(dtype=float)
y = train['Survived'].to_numpy()
x_train, x_val = x[:split], x[split:]
y_train, y_val = y[:split], y[split:]
x_test = test.loc[:, test.columns != 'Survived'].to_numpy(dtype=float)

In [1]:
from scipy import spatial, stats

# Define k_means algorithm...
def k_means(
        x_train,
        y_train,
        x_val,
        y_val,
        x_test,
        k,
        steps=10,
        seed=1):
    """Cluster the training data with k-means and classify by cluster majority.

    Centroids are initialised from `k` distinct training rows and refined for
    at most `steps` iterations (stopping early once assignments no longer
    change). Each cluster predicts the most common training label among its
    members; every sample is given the prediction of its nearest centroid.

    Parameters
    ----------
    x_train, x_val, x_test : array-like, 2-D (samples x features)
        Numeric feature matrices for the training, validation and test data.
    y_train, y_val : array-like, 1-D
        Numeric labels for the training and validation samples.
    k : int
        Number of initial clusters, 1 <= k <= number of training samples.
        Clusters that lose all members during fitting are discarded.
    steps : int, optional
        Maximum number of centroid updates.
    seed : int or None, optional
        Seed passed to ``np.random.seed`` before choosing initial centroids.

    Returns
    -------
    dict
        'Train accuracy' and 'Validation accuracy' (fraction of exact label
        matches; NaN when the corresponding set is empty) and 'test_preds'
        (float array with one prediction per test sample).

    Raises
    ------
    ValueError
        If `k` is smaller than 1.
    AssertionError
        If `k` exceeds the number of training samples.
    """
    x_train = np.asarray(x_train, dtype=float)
    x_val = np.asarray(x_val, dtype=float)
    x_test = np.asarray(x_test, dtype=float)
    y_train = np.asarray(y_train)
    y_val = np.asarray(y_val)

    # Force k to be an integer...
    k = int(k)
    if k < 1:
        raise ValueError("K must be at least 1.")
    assert k <= x_train.shape[0], "K must not exceed the number of training data."

    # Choose k random training points as the initial centroids...
    np.random.seed(seed)
    index = np.random.choice(x_train.shape[0], k, replace=False)
    centroids = x_train[index, :]

    # P[j] is the row of `centroids` closest to training sample j...
    P = _nearest_centroid(x_train, centroids)

    for _ in range(steps):
        # Move each occupied cluster's centroid to the mean of its members.
        # Empty clusters are dropped, which renumbers the remaining ones.
        centroids = np.vstack([x_train[P == i].mean(axis=0) for i in np.unique(P)])

        temp_index = _nearest_centroid(x_train, centroids)

        # If no data changes centroid, stop...
        if np.array_equal(P, temp_index):
            break

        P = temp_index

    # At this point P holds the assignment of the training data to the final
    # centroids. Each cluster predicts its most frequent training label; ties
    # go to the smallest label and clusters without members predict 0.
    cluster_preds = np.zeros(centroids.shape[0])
    for i in range(centroids.shape[0]):
        members = y_train[P == i]
        if members.size:
            labels, counts = np.unique(members, return_counts=True)
            cluster_preds[i] = labels[np.argmax(counts)]

    # Every sample inherits the prediction of its nearest final centroid...
    pred_train = cluster_preds[P]
    pred_val = cluster_preds[_nearest_centroid(x_val, centroids)]
    pred_test = cluster_preds[_nearest_centroid(x_test, centroids)]

    results = {'Train accuracy': _fraction_correct(pred_train, y_train),
               'Validation accuracy': _fraction_correct(pred_val, y_val),
               'test_preds': pred_test}

    return results


def _nearest_centroid(points, centroids):
    """Return, for each row of `points`, the index of the closest centroid (Euclidean)."""
    return np.argmin(spatial.distance.cdist(points, centroids, 'euclidean'), axis=1)


def _fraction_correct(predictions, labels):
    """Return the share of predictions equal to their label, or NaN for an empty set."""
    if labels.shape[0] == 0:
        return float('nan')
    return int(np.count_nonzero(predictions == labels)) / labels.shape[0]

In [1]:
# Exhaustively try every k from 1 to (number of training rows - 1) and keep the
# test predictions of the first k that reaches the highest validation accuracy...
ks = np.arange(1,x_train.shape[0])
res = np.zeros((len(ks),2))  # column 0: train accuracy, column 1: validation accuracy
best_accuracy, opt_preds, opt_k = 0, None, None
for row, k in enumerate(ks):
    outcome = k_means(x_train, y_train, x_val, y_val, x_test, k)
    val_acc = outcome['Validation accuracy']
    res[row] = (outcome['Train accuracy'], val_acc)
    # Strict comparison: a later k only wins if it is strictly better.
    if val_acc > best_accuracy:
        best_accuracy, opt_k, opt_preds = val_acc, k, outcome['test_preds']
print(f"Best accuracy of {best_accuracy*100:.02f} of validation data using a K of {opt_k}")

In [1]:
# Plot train and validation accuracy against k (columns 0 and 1 of `res`)...
for column, curve_label in enumerate(('Train', 'Validate')):
    plt.plot(ks, res[:, column], label=curve_label);
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.legend();

In [1]:
# Build the submission table: test passenger ids (re-indexed from 0) next to the
# integer survival predictions of the selected k...
out = test_indicies.reset_index(drop=True).to_frame()
out['Survived'] = opt_preds.astype(int)
out.head()

In [1]:
# Write the submission to the working directory without the row index.
submission_file = "Titanic survival submission (kmeans).csv"
out.to_csv(submission_file, index=False)