### Importing Libraries

In [5]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt


### Importing titanic dataset and setting it as data frame

In [7]:
df = pd.read_excel('titanic.xls')


### making a copy of data frame, So that we can use to test it

In [9]:
original_df = pd.DataFrame.copy(df)


### Let's see the dataset 

In [11]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Lets drop name and body from the data frame

In [12]:
df.drop(['body','name'], 1, inplace=True)

### preprocessing the data

In [13]:
df.fillna(0,inplace=True)


### Creating function to handle non numeric data , as non-numberic data won't be able plot or process in algortihm. It would not give us any good insights

In [15]:
def handle_non_numerical_data(df):
    
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            
            column_contents = df[column].values.tolist()
            # set is use to get unique data
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
            df[column] = list(map(convert_to_int,df[column]))

    return df

In [16]:
df = handle_non_numerical_data(df)


In [17]:
X = np.array(df.drop(['survived'], 1).astype(float))
y = np.array(df['survived'])


### Preprocessing is use to scale the sparse data

In [19]:
X = preprocessing.scale(X)

### Declaring Mean Shift object and create a fitment

In [20]:
clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=None, seeds=None)

### Let looks what all we got from Mean Shift


In [21]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_

### Now we will use copy of dataframe which we had created before, and would add cluster group to it

In [23]:
original_df['cluster_group']=np.nan

In [24]:
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


### lets check survival rate of each group of cluster we find out

In [27]:
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    #print(temp_df.head())

    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]

    survival_rate = len(survival_cluster) / len(temp_df)
    #print(i,survival_rate)
    survival_rates[i] = survival_rate
    
print(survival_rates)

{0: 0.3762685402029664, 1: 0.8947368421052632, 2: 0.1111111111111111}


### By above information we can say that people in group 0 has 37% of survival rate that in group 1 is 89% and group 2 is 11%

### lets see how data is been classify in this group

first let's check group having highest percent

In [34]:
print(original_df[ (original_df['cluster_group']==1) ])

     pclass  survived                                               name  \
11        1         1  Astor, Mrs. John Jacob (Madeleine Talmadge Force)   
17        1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
35        1         1                           Bowen, Miss. Grace Scott   
49        1         1                 Cardeza, Mr. Thomas Drake Martinez   
50        1         1  Cardeza, Mrs. James Warburton Martinez (Charlo...   
66        1         1                        Chaudanson, Miss. Victorine   
103       1         1                      Endres, Miss. Caroline Louise   
111       1         1                     Fortune, Miss. Alice Elizabeth   
112       1         1                         Fortune, Miss. Ethel Flora   
113       1         1                         Fortune, Miss. Mabel Helen   
115       1         0                                  Fortune, Mr. Mark   
116       1         1                Fortune, Mrs. Mark (Mary McDougald)   
183       1 

In [37]:
print(original_df[ (original_df['cluster_group']==0) ])

      pclass  survived                                             name  \
0          1         1                    Allen, Miss. Elisabeth Walton   
1          1         1                   Allison, Master. Hudson Trevor   
2          1         0                     Allison, Miss. Helen Loraine   
3          1         0             Allison, Mr. Hudson Joshua Creighton   
4          1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
...      ...       ...                                              ...   
1304       3         0                             Zabour, Miss. Hileni   
1305       3         0                            Zabour, Miss. Thamine   
1306       3         0                        Zakarian, Mr. Mapriededer   
1307       3         0                              Zakarian, Mr. Ortin   
1308       3         0                               Zimmerman, Mr. Leo   

         sex      age  sibsp  parch  ticket      fare    cabin embarked boat  \
0     female  29.00

In [38]:
print(original_df[ (original_df['cluster_group']==2) ])

      pclass  survived                                               name  \
629        3         0                        Andersson, Mr. Anders Johan   
632        3         0  Andersson, Mrs. Anders Johan (Alfrida Konstant...   
646        3         1  Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...   
831        3         0                     Goodwin, Mr. Charles Frederick   
832        3         0            Goodwin, Mrs. Frederick (Augusta Tyler)   
1106       3         0             Panula, Mrs. Juha (Maria Emilia Ojala)   
1146       3         0               Rice, Mrs. William (Margaret Norton)   
1179       3         0                              Sage, Mr. John George   
1180       3         0                     Sage, Mrs. John (Annie Bullen)   

         sex   age  sibsp  parch    ticket     fare cabin embarked boat  \
629     male  39.0      1      5    347082  31.2750   NaN        S  NaN   
632   female  39.0      1      5    347082  31.2750   NaN        S  NaN   
646 

### From above data we can find following insights

### From Cluster 1, We find that survival rate for Pclass = 1 has higher survival rate where as from cluster 2 we can say that Pclass = 3 lower survival rate
