In [None]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
'''
pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survived Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination

'''

In [None]:
# Load the Titanic passenger spreadsheet into a DataFrame.
# NOTE(review): this is an absolute path on one specific Windows machine, so
# the notebook will not run anywhere else as-is — consider a path relative to
# the notebook or a configurable data directory.
df = pd.read_excel(r"C:\Users\samer\Desktop\Machine Learning\Complementary Stuff\titanic.xls")

In [None]:
# Keep an untouched copy of the loaded frame; the cluster labels are attached
# to this copy later so the clusters can be inspected with readable values.
original_df = df.copy()

# `body` and `name` are not used as clustering features.
# The columns are selected with the `columns=` keyword: passing the axis
# positionally (`df.drop(labels, 1)`) was deprecated in pandas 1.3 and raises
# TypeError from pandas 2.0 onwards.
df = df.drop(columns=['body', 'name'])

# Replace every missing value with 0 so no NaN remains in the feature matrix.
df = df.fillna(0)

In [None]:
def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer ids.

    Each distinct value found in a non-numeric column is given an integer
    id, numbered from 0 in order of first appearance, and the column is
    overwritten with those ids. Numeric columns are left untouched.

    The ids are deterministic: they depend only on the order of the values
    in the column, not on ``set`` iteration order (which, for strings,
    changes from one interpreter run to the next).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after encoding.

    Notes
    -----
    * Any numeric dtype (including bool) counts as numeric and is skipped,
      not only ``int64`` / ``float64``.
    * Missing values (NaN / None) in a non-numeric column receive the id
      ``-1``, which is how ``pandas.factorize`` marks them.
    * The ids are arbitrary labels; their numeric order carries no meaning.
    """
    # Snapshot the column labels up front, since columns are reassigned below.
    for column in list(df.columns):
        if pd.api.types.is_numeric_dtype(df[column]):
            continue

        # factorize returns (codes, uniques); only the per-row codes are kept.
        codes, _ = pd.factorize(df[column])
        df[column] = codes

    return df

In [None]:
# Encode the remaining text columns as integer ids.
df = handle_non_numerical_data(df)

# `ticket` and `home.dest` are left out of the feature set.
# `columns=` is used because a positional axis (`df.drop(labels, 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns=['ticket', 'home.dest'])

In [None]:
# Feature matrix: every column except the `survived` outcome, cast to float.
# `columns=` replaces the positional axis argument, which pandas 2.0+ rejects.
X = np.array(df.drop(columns=['survived']).astype(float))
# Standardise the features so no single column dominates the distances.
X = preprocessing.scale(X)
# Survival outcome per passenger; the clustering itself is fit on X only.
y = np.array(df['survived'])

In [None]:
# Cluster the scaled feature matrix with MeanShift using its default settings.
# No cluster count is passed in; the number of clusters is read back from the
# fitted labels further down.
clf = MeanShift()
clf.fit(X)

In [None]:
# With the model fitted, read back two of its attributes: the cluster label
# assigned to each row of X, and the coordinates of each cluster centre.
labels, cluster_centers = clf.labels_, clf.cluster_centers_

In [None]:
# Next, add a new `cluster_group` column to the untouched copy of the data.
# It starts out as NaN for every row and is filled with cluster labels below.
original_df['cluster_group']=np.nan

In [None]:
# Now write each row's cluster label into the `cluster_group` column.
# The whole column is assigned in one step rather than element by element via
# `original_df['cluster_group'].iloc[i] = ...`: that chained assignment
# triggers SettingWithCopyWarning and has no effect at all when pandas
# Copy-on-Write is enabled. The labels are stored as floats so the column keeps
# the float dtype it had as a NaN placeholder. pandas raises ValueError if the
# number of labels does not match the number of rows.
original_df['cluster_group'] = np.asarray(labels, dtype=float)

In [27]:
# Survival rate per discovered cluster: the share of each cluster's members
# whose `survived` flag equals 1, keyed by the integer cluster id.
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for cluster_id in range(n_clusters_):
    members = original_df.loc[original_df['cluster_group'] == float(cluster_id)]
    survivor_count = int((members['survived'] == 1).sum())
    survival_rates[cluster_id] = survivor_count / len(members)

print(survival_rates)

{0: 0.36998394863563405, 1: 0.6938775510204082, 2: 1.0, 3: 0.1}


In [None]:
# Again, you may get a different number of groups. This run produced four,
# and I've personally got up to six groups on this same dataset. Right away,
# we see that group 0 has a 37% survival rate, group 1 has a 69% survival
# rate, group 2 has a 100% survival rate, and group 3 has a 10% survival
# rate. This is somewhat curious as we know there were three actual
# "passenger classes" on the ship. I immediately wonder whether the groups
# line up with those classes, e.g. whether the high-survival groups are
# first-class and the low-survival group is 3rd class. The classes on the
# ship were ordered with 3rd class on the bottom, and first class on the top.
# The bottom flooded first, and the top is where the life-boats were. I can
# look deeper by doing:

In [16]:
# Show every passenger that was assigned to cluster 0.
print(original_df.loc[original_df['cluster_group'].eq(0)])

      pclass  survived                                           name     sex  \
0          1         1                  Allen, Miss. Elisabeth Walton  female   
5          1         1                            Anderson, Mr. Harry    male   
6          1         1              Andrews, Miss. Kornelia Theodosia  female   
7          1         0                         Andrews, Mr. Thomas Jr    male   
8          1         1  Appleton, Mrs. Edward Dale (Charlotte Lamson)  female   
...      ...       ...                                            ...     ...   
1304       3         0                           Zabour, Miss. Hileni  female   
1305       3         0                          Zabour, Miss. Thamine  female   
1306       3         0                      Zakarian, Mr. Mapriededer    male   
1307       3         0                            Zakarian, Mr. Ortin    male   
1308       3         0                             Zimmerman, Mr. Leo    male   

       age  sibsp  parch  t

In [17]:
# Show every passenger that was assigned to cluster 1.
print(original_df.loc[original_df['cluster_group'].eq(1)])

     pclass  survived                                               name  \
1         1         1                     Allison, Master. Hudson Trevor   
2         1         0                       Allison, Miss. Helen Loraine   
3         1         0               Allison, Mr. Hudson Joshua Creighton   
4         1         0    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
10        1         0                             Astor, Col. John Jacob   
11        1         1  Astor, Mrs. John Jacob (Madeleine Talmadge Force)   
16        1         0                           Baxter, Mr. Quigg Edmond   
17        1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
23        1         1                              Bidois, Miss. Rosalie   
24        1         1                                  Bird, Miss. Ellen   
35        1         1                           Bowen, Miss. Grace Scott   
57        1         1          Carter, Mrs. William Ernest (Lucile Polk)   
66        1 

In [18]:
# Show every passenger that was assigned to cluster 2.
print(original_df.loc[original_df['cluster_group'].eq(2)])

     pclass  survived                                               name  \
49        1         1                 Cardeza, Mr. Thomas Drake Martinez   
50        1         1  Cardeza, Mrs. James Warburton Martinez (Charlo...   
183       1         1                             Lesurer, Mr. Gustave J   
302       1         1                                   Ward, Miss. Anna   

        sex   age  sibsp  parch    ticket      fare        cabin embarked  \
49     male  36.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
50   female  58.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
183    male  35.0      0      0  PC 17755  512.3292         B101        C   
302  female  35.0      0      0  PC 17755  512.3292          NaN        C   

    boat  body                                       home.dest  cluster_group  
49     3   NaN  Austria-Hungary / Germantown, Philadelphia, PA            2.0  
50     3   NaN                    Germantown, Philadelphia, PA           

In [19]:
# Show every passenger that was assigned to cluster 3.
print(original_df.loc[original_df['cluster_group'].eq(3)])

      pclass  survived                                               name  \
629        3         0                        Andersson, Mr. Anders Johan   
632        3         0  Andersson, Mrs. Anders Johan (Alfrida Konstant...   
644        3         0         Asplund, Mr. Carl Oscar Vilhelm Gustafsson   
646        3         1  Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...   
831        3         0                     Goodwin, Mr. Charles Frederick   
832        3         0            Goodwin, Mrs. Frederick (Augusta Tyler)   
1106       3         0             Panula, Mrs. Juha (Maria Emilia Ojala)   
1146       3         0               Rice, Mrs. William (Margaret Norton)   
1179       3         0                              Sage, Mr. John George   
1180       3         0                     Sage, Mrs. John (Annie Bullen)   

         sex   age  sibsp  parch    ticket     fare cabin embarked boat  \
629     male  39.0      1      5    347082  31.2750   NaN        S  NaN   
63

In [20]:
# Summary statistics for the members of cluster 0.
print(original_df.loc[original_df['cluster_group'].eq(0)].describe())

            pclass     survived         age        sibsp        parch  \
count  1246.000000  1246.000000  985.000000  1246.000000  1246.000000   
mean      2.344302     0.369984   29.375888     0.484751     0.302568   
std       0.812203     0.482994   14.164671     1.050229     0.656712   
min       1.000000     0.000000    0.166700     0.000000     0.000000   
25%       2.000000     0.000000   21.000000     0.000000     0.000000   
50%       3.000000     0.000000   28.000000     0.000000     0.000000   
75%       3.000000     1.000000   37.000000     1.000000     0.000000   
max       3.000000     1.000000   80.000000     8.000000     4.000000   

              fare        body  cluster_group  
count  1245.000000  114.000000         1246.0  
mean     25.189116  161.991228            0.0  
std      28.336017   98.488173            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   70.500000            0.0  
50%      13.416700  165.500000            0.0  
75%   

In [21]:
# Summary statistics for the members of cluster 1.
print(original_df.loc[original_df['cluster_group'].eq(1)].describe())

       pclass   survived        age      sibsp      parch        fare  \
count    49.0  49.000000  49.000000  49.000000  49.000000   49.000000   
mean      1.0   0.693878  37.498300   0.836735   1.326531  198.238435   
std       0.0   0.465657  17.606277   0.874332   1.028505   58.926892   
min       1.0   0.000000   0.916700   0.000000   0.000000   79.200000   
25%       1.0   0.000000  24.000000   0.000000   1.000000  151.550000   
50%       1.0   1.000000  38.000000   1.000000   1.000000  211.500000   
75%       1.0   1.000000  50.000000   1.000000   2.000000  262.375000   
max       1.0   1.000000  67.000000   3.000000   4.000000  263.000000   

             body  cluster_group  
count    5.000000           49.0  
mean   104.400000            1.0  
std     36.156604            0.0  
min     45.000000            1.0  
25%     96.000000            1.0  
50%    122.000000            1.0  
75%    124.000000            1.0  
max    135.000000            1.0  


In [23]:
# Summary statistics for the members of cluster 2.
print(original_df.loc[original_df['cluster_group'].eq(2)].describe())

       pclass  survived        age  sibsp    parch      fare  body  \
count     4.0       4.0   4.000000    4.0  4.00000    4.0000   0.0   
mean      1.0       1.0  41.000000    0.0  0.50000  512.3292   NaN   
std       0.0       0.0  11.343133    0.0  0.57735    0.0000   NaN   
min       1.0       1.0  35.000000    0.0  0.00000  512.3292   NaN   
25%       1.0       1.0  35.000000    0.0  0.00000  512.3292   NaN   
50%       1.0       1.0  35.500000    0.0  0.50000  512.3292   NaN   
75%       1.0       1.0  41.500000    0.0  1.00000  512.3292   NaN   
max       1.0       1.0  58.000000    0.0  1.00000  512.3292   NaN   

       cluster_group  
count            4.0  
mean             2.0  
std              0.0  
min              2.0  
25%              2.0  
50%              2.0  
75%              2.0  
max              2.0  


In [24]:
# Summary statistics for the members of cluster 3.
print(original_df.loc[original_df['cluster_group'].eq(3)].describe())

       pclass   survived        age      sibsp      parch       fare  \
count    10.0  10.000000   8.000000  10.000000  10.000000  10.000000   
mean      3.0   0.100000  39.875000   0.800000   6.000000  42.703750   
std       0.0   0.316228   1.552648   0.421637   1.632993  15.590194   
min       3.0   0.000000  38.000000   0.000000   5.000000  29.125000   
25%       3.0   0.000000  39.000000   1.000000   5.000000  31.303125   
50%       3.0   0.000000  39.500000   1.000000   5.000000  35.537500   
75%       3.0   0.000000  40.250000   1.000000   6.000000  46.900000   
max       3.0   1.000000  43.000000   1.000000   9.000000  69.550000   

             body  cluster_group  
count    2.000000           10.0  
mean   234.500000            3.0  
std    130.814755            0.0  
min    142.000000            3.0  
25%    188.250000            3.0  
50%    234.500000            3.0  
75%    280.750000            3.0  
max    327.000000            3.0  


In [25]:
# Narrow cluster 0 down to its first-class passengers (pclass == 1) and
# summarise just that subset.
cluster_0 = original_df.loc[original_df['cluster_group'].eq(0)]
cluster_0_fc = cluster_0.loc[cluster_0['pclass'].eq(1)]
print(cluster_0_fc.describe())

       pclass    survived         age       sibsp       parch        fare  \
count   270.0  270.000000  231.000000  270.000000  270.000000  270.000000   
mean      1.0    0.600000   39.480519    0.370370    0.188889   61.120016   
std       0.0    0.490808   13.903106    0.520805    0.469781   38.874905   
min       1.0    0.000000    4.000000    0.000000    0.000000    0.000000   
25%       1.0    0.000000   29.000000    0.000000    0.000000   29.700000   
50%       1.0    1.000000   39.000000    0.000000    0.000000   53.100000   
75%       1.0    1.000000   49.000000    1.000000    0.000000   79.650000   
max       1.0    1.000000   80.000000    2.000000    2.000000  227.525000   

             body  cluster_group  
count   30.000000          270.0  
mean   172.566667            0.0  
std     84.511449            0.0  
min     16.000000            0.0  
25%    114.000000            0.0  
50%    173.500000            0.0  
75%    242.250000            0.0  
max    307.000000         

In [26]:
# Here 60% of class 1 individuals in cluster 0 survived

In [29]:
# List the members of cluster 2 once more to look at them individually.
print(original_df.loc[original_df['cluster_group'].eq(2)])

     pclass  survived                                               name  \
49        1         1                 Cardeza, Mr. Thomas Drake Martinez   
50        1         1  Cardeza, Mrs. James Warburton Martinez (Charlo...   
183       1         1                             Lesurer, Mr. Gustave J   
302       1         1                                   Ward, Miss. Anna   

        sex   age  sibsp  parch    ticket      fare        cabin embarked  \
49     male  36.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
50   female  58.0      0      1  PC 17755  512.3292  B51 B53 B55        C   
183    male  35.0      0      0  PC 17755  512.3292         B101        C   
302  female  35.0      0      0  PC 17755  512.3292          NaN        C   

    boat  body                                       home.dest  cluster_group  
49     3   NaN  Austria-Hungary / Germantown, Philadelphia, PA            2.0  
50     3   NaN                    Germantown, Philadelphia, PA           

In [None]:
# All the people of this cluster survived (100% survival)