# Mean shift clustering

Look for groupings of Titanic passengers with similar characteristics

In [8]:
import math
import pandas as pd
import numpy as np

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

%pylab inline
pylab.rcParams['figure.figsize'] = (15, 6)

# Do not use normal form (scietific notation) when printing numbers, exponents can make it harder to compare values
pd.set_option('float_format', '{:f}'.format)

Populating the interactive namespace from numpy and matplotlib


In [3]:
titanic_data = pd.read_csv("../datasets/kaggle/titanic/train.csv", quotechar='"')

## Explore

In [4]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [11]:
titanic_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## Prepare

Remove features that are too specific to individual passengers to be useful when looking for patterns

In [5]:
titanic_data.drop(["PassengerId", "Name", "Ticket", "Cabin"], "columns", inplace=True)

In [7]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Convert Sex to numeric

In [13]:
le = preprocessing.LabelEncoder()
titanic_data["Sex"] = le.fit_transform(titanic_data["Sex"].astype(str))
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


One hot encode the _Embarked_ feature

In [14]:
titanic_data = pd.get_dummies(titanic_data, columns=["Embarked"])
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


Look for any null values

In [15]:
titanic_data[titanic_data.isnull().any(axis=1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
5,0,3,1,,0,0,8.458300,0,1,0
17,1,2,1,,0,0,13.000000,0,0,1
19,1,3,0,,0,0,7.225000,1,0,0
26,0,3,1,,0,0,7.225000,1,0,0
28,1,3,0,,0,0,7.879200,0,1,0
29,0,3,1,,0,0,7.895800,0,0,1
31,1,1,0,,1,0,146.520800,1,0,0
32,1,3,0,,0,0,7.750000,0,1,0
36,1,3,1,,0,0,7.229200,1,0,0
42,0,3,1,,0,0,7.895800,1,0,0


Drop any rows with null values

In [16]:
titanic_data = titanic_data.dropna()

## Train

In [29]:
from sklearn.cluster import MeanShift

analyser = MeanShift(bandwidth=30)
analyser.fit(titanic_data)

MeanShift(bandwidth=30, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

Estimate a good value for the bandwidth based on the data, this is called under the hood if no bandwidth is specified

In [30]:
from sklearn.cluster import estimate_bandwidth

estimate_bandwidth(titanic_data)

30.446971266796215

In [31]:
labels = analyser.labels_

See how many clusters the data was distributed into.  
A bandwidth of **50** produces **3** clusters - every point is assigned to one of these clusters.  
A bandwidth of **30** produces **5** clusters - every point is assigned to one of these clusters.  
Each of these groups will contain passengers with similar characteristics.

In [32]:
np.unique(labels)

array([0, 1, 2, 3, 4], dtype=int64)

Add a cluster group column

In [33]:
titanic_data["cluster_group"] = np.nan

data_length = len(titanic_data)

for i in range(data_length):
    titanic_data.iloc[i, titanic_data.columns.get_loc("cluster_group")] = labels[i]

In [40]:
len(titanic_data)

714

In [34]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
0,0,3,1,22.0,1,0,7.25,0,0,1,0.0
1,1,1,0,38.0,1,0,71.2833,1,0,0,1.0
2,1,3,0,26.0,0,0,7.925,0,0,1,0.0
3,1,1,0,35.0,1,0,53.1,0,0,1,1.0
4,0,3,1,35.0,0,0,8.05,0,0,1,0.0


## Evaluate

Group passengers by cluster and see how similar the clusters are

In [35]:
titanic_cluster_data = titanic_data.groupby(["cluster_group"]).mean()
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.336918,2.52509,0.679211,28.25672,0.439068,0.370968,15.434139,0.121864,0.046595,0.831541
1.0,0.611111,1.296296,0.527778,36.148148,0.814815,0.5,65.622688,0.333333,0.018519,0.62963
2.0,0.733333,1.0,0.366667,32.430667,0.6,0.866667,131.183883,0.5,0.0,0.5
3.0,0.733333,1.0,0.266667,30.333333,1.0,1.333333,239.99194,0.533333,0.0,0.466667
4.0,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0


View the number of samples in each cluster

In [36]:
titanic_cluster_data["Counts"] = titanic_data.groupby(["cluster_group"]).size()
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0.336918,2.52509,0.679211,28.25672,0.439068,0.370968,15.434139,0.121864,0.046595,0.831541,558
1.0,0.611111,1.296296,0.527778,36.148148,0.814815,0.5,65.622688,0.333333,0.018519,0.62963,108
2.0,0.733333,1.0,0.366667,32.430667,0.6,0.866667,131.183883,0.5,0.0,0.5,30
3.0,0.733333,1.0,0.266667,30.333333,1.0,1.333333,239.99194,0.533333,0.0,0.466667,15
4.0,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0,3


Look at more detailed information on a single cluster

In [37]:
titanic_data[titanic_data["cluster_group"] == 1].describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
count,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0,108.0
mean,0.611111,1.296296,0.527778,36.148148,0.814815,0.5,65.622688,0.333333,0.018519,0.62963,1.0
std,0.489771,0.645028,0.501555,14.919607,1.086434,0.971558,15.634315,0.473602,0.135445,0.485155,0.0
min,0.0,1.0,0.0,1.0,0.0,0.0,34.6542,0.0,0.0,0.0,1.0
25%,0.0,1.0,0.0,24.0,0.0,0.0,52.5542,0.0,0.0,0.0,1.0
50%,1.0,1.0,1.0,35.0,1.0,0.0,65.0,0.0,0.0,1.0,1.0
75%,1.0,1.0,1.0,48.0,1.0,1.0,78.9375,1.0,0.0,1.0,1.0
max,1.0,3.0,1.0,71.0,5.0,6.0,93.5,1.0,1.0,1.0,1.0


View all the passengers in this cluster

In [38]:
titanic_data[titanic_data["cluster_group"] == 1]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
1,1,1,0,38.000000,1,0,71.283300,1,0,0,1.000000
3,1,1,0,35.000000,1,0,53.100000,0,0,1,1.000000
6,0,1,1,54.000000,0,0,51.862500,0,0,1,1.000000
34,0,1,1,28.000000,1,0,82.170800,1,0,0,1.000000
35,0,1,1,42.000000,1,0,52.000000,0,0,1,1.000000
52,1,1,0,49.000000,1,0,76.729200,1,0,0,1.000000
54,0,1,1,65.000000,0,1,61.979200,1,0,0,1.000000
59,0,3,1,11.000000,5,2,46.900000,0,0,1,1.000000
61,1,1,0,38.000000,0,0,80.000000,0,0,0,1.000000
62,0,1,1,45.000000,1,0,83.475000,0,0,1,1.000000
