In [12]:
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import pandas as pd

In [33]:
style.use('ggplot')

'''
Here is some details on the attributes of the dataset which we are using.

Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''

df = pd.read_excel('/titanic.xls')

In [19]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [20]:
# As you can see there is a lot of non numerical data in here and our algo is already doing unsupervised learning so these non numerical data will only create problems.
# But we can't drop all the non numerical data as some of them like the gender or the cabin columns are important.
# So lets try to convert the significant data to digits.

In [34]:
df.drop(['body','name'], 1, inplace=True)

In [35]:
df.apply(pd.to_numeric, errors='ignore')
df.fillna(0, inplace=True)

In [36]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

            df[column] = list(map(convert_to_int, df[column]))

    return df

In [37]:
# what we are doing above is first cycling through the columns in the dataframe and for the data that is not no. taking set of those values to get the unique values of that column and then indexing those unique values.

In [38]:
df = handle_non_numerical_data(df)
print(df.head())

   pclass  survived  sex      age  ...  cabin  embarked  boat  home.dest
0       1         1    0  29.0000  ...      2         3     2        301
1       1         1    1   0.9167  ...    109         3    12         42
2       1         0    0   2.0000  ...    109         3     0         42
3       1         0    1  30.0000  ...    109         3     0         42
4       1         0    0  25.0000  ...    109         3     0         42

[5 rows x 12 columns]


In [39]:
# Pre Processing our data  and dividing the features and labels respectively.

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

In [40]:
X

array([[-1.54609786, -1.34499549,  0.29131302, ...,  0.62364835,
        -0.43413751,  1.50503864],
       [-1.54609786,  0.74349692, -1.30576934, ...,  0.62364835,
         0.69051319, -0.54992951],
       [-1.54609786, -1.34499549, -1.24416265, ...,  0.62364835,
        -0.65906765, -0.54992951],
       ...,
       [ 0.84191642,  0.74349692,  0.14913935, ..., -1.81687688,
        -0.65906765, -0.88316758],
       [ 0.84191642,  0.74349692,  0.17757408, ..., -1.81687688,
        -0.65906765, -0.88316758],
       [ 0.84191642,  0.74349692,  0.29131302, ...,  0.62364835,
        -0.65906765, -0.88316758]])

In [41]:
clf = KMeans(n_clusters=2)
clf.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [42]:
correct = 0
for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == y[i]:
        correct += 1

print(correct/len(X))

0.7005347593582888


In [None]:
# 