In [1]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, cross_validation
import pandas as pd
import matplotlib.pyplot as plt



In [2]:
# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_excel('titanic.xls')

original_df = pd.DataFrame.copy(df)
df.drop(['body','name'], 1, inplace=True)
df.fillna(0,inplace=True)

In [3]:
def handle_non_numerical_data(df):
    
    # handling non-numerical data: must convert.
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        #print(column,df[column].dtype)
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            
            column_contents = df[column].values.tolist()
            #finding just the uniques
            unique_elements = set(column_contents)
            # great, found them. 
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # now we map the new "id" vlaue
            # to replace the string. 
            df[column] = list(map(convert_to_int,df[column]))

    return df

In [4]:
df = handle_non_numerical_data(df)
df.drop(['ticket','home.dest'], 1, inplace=True)

In [5]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,boat
0,1,1,1,29.0,0,0,211.3375,168,3,1
1,1,1,0,0.9167,1,2,151.55,39,3,6
2,1,0,1,2.0,1,2,151.55,39,3,0
3,1,0,0,30.0,1,2,151.55,39,3,0
4,1,0,1,25.0,1,2,151.55,39,3,0


In [6]:
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [9]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_

# print(len(set(labels)), cluster_centers)

original_df['cluster_group']=np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    #print(temp_df.head())

    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]

    survival_rate = len(survival_cluster) / len(temp_df)
    #print(i,survival_rate)
    survival_rates[i] = survival_rate
    
print(survival_rates)

{0: 0.36910569105691055, 1: 0.6140350877192983, 2: 1.0, 3: 0.6666666666666666, 4: 0.1}


In [21]:
# print(original_df[ (original_df['cluster_group']==0) ].describe())
# print(original_df[ (original_df['cluster_group']==1) ].describe())
# print(original_df[ (original_df['cluster_group']==4) ].describe())
print(original_df[ (original_df['cluster_group']==2) ].describe())

       pclass  survived        age  sibsp     parch        fare  body  \
count     6.0       6.0   6.000000    6.0  6.000000    6.000000   0.0   
mean      1.0       1.0  40.833333    0.0  0.333333  429.011133   NaN   
std       0.0       0.0   9.239408    0.0  0.516398  129.075794   NaN   
min       1.0       1.0  35.000000    0.0  0.000000  262.375000   NaN   
25%       1.0       1.0  35.250000    0.0  0.000000  324.863550   NaN   
50%       1.0       1.0  36.000000    0.0  0.000000  512.329200   NaN   
75%       1.0       1.0  42.750000    0.0  0.750000  512.329200   NaN   
max       1.0       1.0  58.000000    0.0  1.000000  512.329200   NaN   

       cluster_group  
count            6.0  
mean             2.0  
std              0.0  
min              2.0  
25%              2.0  
50%              2.0  
75%              2.0  
max              2.0  
