# In [None]:  (notebook cell marker left over from the export)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the raw CSV into the working DataFrame `df`.
# 'file-name' is the path given in the original; replace it with the real file.
df = pd.read_csv(filepath_or_buffer=r'file-name')
df.head()  # notebook preview of the first rows

# Initial exploration of the raw data.
# Median fare for each survival outcome (1 = survived, 0 = did not survive).
survived_rows = df['survived'] == 1
not_survived_rows = df['survived'] == 0
print('Median fare of survived', df.loc[survived_rows, 'fare'].median())
print('Median fare of not survived', df.loc[not_survived_rows, 'fare'].median())

# Distinct-value counts for three columns. dropna=False keeps NaN counted as
# one value, exactly as len(Series.unique()) does.
for description, column in (
    ('Number of Unique home destinations=', 'home.dest'),
    ('Number of Unique tickets=', 'ticket'),
    ('Number of Unique cabins=', 'cabin'),
):
    print(description, df[column].nunique(dropna=False))

# Encode the categorical 'sex' column as integers (male -> 0, female -> 1).
# This overwrites the column on the original frame `df`.
sex_codes = {'male': 0, 'female': 1}
df['sex'] = df['sex'].replace(sex_codes)

# Build the modelling frame `df1` without the columns that are not used as
# features; `df` itself keeps them.
unused_columns = ['boat', 'name', 'ticket', 'cabin', 'home.dest']
df1 = df.drop(columns=unused_columns)
df1.head()  # notebook preview

# Count missing values per column before imputation (notebook display).
df1.isnull().sum()

# Impute missing values: median for 'age', most frequent value for 'embarked'.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df1[col]: the in-place form mutates an intermediate
# object and leaves df1 unchanged when pandas Copy-on-Write is active, which
# would let NaNs flow into the scaler and the model.
df1['age'] = df1['age'].fillna(df1['age'].median())
df1['embarked'] = df1['embarked'].fillna(df1['embarked'].mode()[0])

# Re-count missing values to confirm the imputation took effect.
df1.isnull().sum()

# Encode the remaining categorical column, 'embarked', as integers
# (C -> 0, S -> 1, Q -> 2).
embarked_codes = {'C': 0, 'S': 1, 'Q': 2}
df1['embarked'] = df1['embarked'].replace(embarked_codes)

# Final inspection of the cleaned frame: dtypes and non-null counts, followed
# by a preview of the first rows.
df1.info()
df1.head()

# Put the selected features on a common scale so that columns with larger
# numeric ranges do not dominate the distance computation in the model.
from sklearn import preprocessing

# Features are picked by column position (1 through 5) in df1.
# NOTE(review): these positions depend on the CSV's column order, which is not
# visible here -- confirm that the target column 'survived' is not one of them,
# otherwise the label leaks into the features.
feature_positions = list(range(1, 6))
x1 = df1.iloc[:, feature_positions].to_numpy()

X = preprocessing.scale(x1)

# Fit a two-cluster KMeans model on the scaled feature matrix X.
from sklearn.cluster import KMeans

# Ground-truth labels; they are not passed to the model, only used afterwards
# to evaluate the clustering.
y = np.array(df['survived'])

# n_init and random_state are pinned so that repeated runs (and different
# scikit-learn versions, whose n_init default differs) yield the same clusters
# and therefore the same reported numbers.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(X)

# Measure how often the cluster id assigned to a row equals its 'survived'
# label.
# The prediction is made on the scaled matrix X, the same representation the
# model was fitted on; feeding the unscaled x1 rows would compare points from a
# different feature space against the learned centroids.
# NOTE: KMeans cluster ids are arbitrary, so this agreement rate can come out
# as (1 - accuracy) when cluster 0 happens to correspond to label 1.
cluster_ids = kmeans.predict(X)
correct = int(np.count_nonzero(cluster_ids == y))

print(correct/len(X))

# Cluster assignment for every row of the scaled feature matrix.
pred = kmeans.predict(X)

# Scatter the first two scaled feature columns, one colour per cluster.
# NOTE(review): the 'dead' / 'survived' legend names assume cluster 0 / 1 line
# up with survived == 0 / 1; KMeans ids are arbitrary, so confirm per run.
for cluster_id, colour, legend_name in ((0, 'red', 'dead'), (1, 'blue', 'survived')):
    members = X[pred == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=30, c=colour, label=legend_name)

# Overlay the fitted cluster centres on the same two axes.
centres = kmeans.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=30, c='yellow', label='Centroids')
plt.legend()
plt.show()

# Compare the class sizes in the data with the cluster sizes from the model.
# value_counts() orders its result by frequency, not by label, so the counts
# are re-indexed on labels 0 and 1: row k of the table is then always
# "survived == k" next to "cluster k". fill_value=0 keeps both rows present
# even if one label never occurs.
a = df['survived'].value_counts().reindex([0, 1], fill_value=0).values
b = [int((pred == label).sum()) for label in (0, 1)]
check = pd.DataFrame({'Actual': a, 'Predicted': b}, columns=['Actual', 'Predicted'])
check  # notebook display of the comparison table

# Report the agreement rate computed earlier (`correct`) as a percentage.
print('Model accuracy is: %.2f'%((correct/len(X))*100),'%')