In [None]:
import pandas as pd 
import numpy as np
# Load the Titanic training set; the CSV path is relative to the working directory.
titanic_data = pd.read_csv('titanic_train.csv')
# Preview the first rows to sanity-check the load.
titanic_data.head()

In [None]:
# Drop identifier / free-text columns that are not used as features.
# The columns are named via the `columns=` keyword: passing the axis as a second
# positional argument was deprecated in pandas 1.3 and is rejected (TypeError)
# by pandas 2.0+. The result is rebound to `titanic_data` instead of using
# inplace=True.
titanic_data = titanic_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
titanic_data.head()

In [None]:
from sklearn import preprocessing
# Replace the textual 'Sex' column with integer class codes.
# Values are cast to str before the encoder is fitted.
label_enc = preprocessing.LabelEncoder()
sex_as_text = titanic_data['Sex'].astype(str)
titanic_data['Sex'] = label_enc.fit_transform(sex_as_text)
titanic_data.head()

In [None]:
# One-hot encode 'Embarked' with pd.get_dummies: the column is replaced by one
# indicator column per distinct value.
titanic_data = pd.get_dummies(data=titanic_data, columns=['Embarked'])
titanic_data.head()

In [None]:

# Find the rows that contain missing values, keep them aside for inspection,
# then drop them from the working frame.
print('rows before drop n/a',len(titanic_data))
bool_matrix = titanic_data.isnull()  # same shape as titanic_data: True where a cell is missing
only_null_filter = bool_matrix.any(axis=1)  # one flag per row (same index as titanic_data): True if any column is missing
missing = titanic_data[only_null_filter]  # the rows that have one or more missing values
titanic_data = titanic_data.dropna()
print('rows after',len(titanic_data))

# Lift the row-display limit so every row is rendered (the same option exists
# for columns). NOTE: this is a global pandas option, so it also affects the
# output of later cells.
pd.options.display.max_rows = None
# Only the last expression of a cell is rendered, so the frame is shown once
# here; the bare mid-cell expressions that produced no output were removed.
titanic_data

In [None]:
# Per-row missing-value mask, computed before the incomplete rows were dropped
# (True = the row had at least one missing value).
only_null_filter

In [None]:
# What is the best bandwidth to use for our dataset?
# Smaller bandwidth values result in tall, skinny kernels; larger values result in short, fat kernels.
from sklearn.cluster import estimate_bandwidth
# Data-driven bandwidth estimate; the value is only displayed here, not stored.
estimate_bandwidth(titanic_data)

In [None]:

from sklearn.cluster import MeanShift
# Mean-shift clustering of the prepared frame with a fixed bandwidth of 30.
analyzer = MeanShift(bandwidth=30)
# Left as a bare expression so the fitted estimator is shown as the cell output.
analyzer.fit(X=titanic_data)

In [None]:
# Cluster label of every row, as produced by the fitted model.
labels = analyzer.labels_
print(labels)
# Distinct cluster ids (the original note for this run records 5 clusters).
cluster_ids = np.unique(labels)
print('\n\n',cluster_ids)

In [None]:

# Add a column recording which cluster each row was assigned to.
# `labels` comes from the model fitted on `titanic_data`, so it lines up with
# the rows positionally and can be assigned in one vectorized step instead of
# writing cell-by-cell through .iloc in a Python loop. If the lengths ever
# disagree (e.g. stale labels after re-running earlier cells) the assignment
# raises ValueError up front rather than failing part-way through.
# The values are stored as floats, matching the column the original
# NaN-initialised loop produced.
titanic_data['cluster_group'] = np.asarray(labels, dtype=float)

titanic_data.head()

In [None]:

# Per-cluster summary: the mean of every column within each cluster, plus the
# number of passengers that fell into that cluster.
by_cluster = titanic_data.groupby(['cluster_group'])
titanic_cluster_data = by_cluster.mean()
# size() is indexed by cluster_group, so it aligns with the summary rows.
titanic_cluster_data['Counts'] = by_cluster.size()
titanic_cluster_data