# Titanic Data Analysis with KMEAN


Import required modules, read csv

In [111]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans
from pprint import pprint

TITANIC_TRAIN = "train.csv"
TITANIC_TEST = "test.csv"

titanic_dataframe = pd.read_csv(TITANIC_TRAIN)
test_dataframe = pd.read_csv(TITANIC_TEST)

In [112]:
titanic_dataframe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Drop unnecessary columns (Columns that are categorical and qualitative)

In [113]:
ids = titanic_dataframe.PassengerId.values
survived = titanic_dataframe.Survived.values
test_ids = test_dataframe.PassengerId.values

titanic_dataframe.drop(["Name", "Sex", "Ticket", "Cabin", "Embarked"], axis=1, inplace=True)
test_dataframe.drop(["Name", "Sex", "Ticket", "Cabin", "Embarked"], axis=1, inplace=True)


Fill empty cells in Age column.

In [114]:
average_age = np.mean(titanic_dataframe.Age)
test_age = np.mean(test_dataframe.Age)

titanic_dataframe.Age.fillna(average_age, inplace=True)
test_dataframe.Age.fillna(test_age, inplace=True)


titanic_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


Create new table with Age

In [115]:
age_table = titanic_dataframe.filter(["Age"])
test_age_table = test_dataframe.filter(["Age"])


Create new table with Fare

In [116]:
fare_table = titanic_dataframe.filter(["Fare"])
test_fare_table = test_dataframe.filter(["Fare"])

Basing initial Alg on Fare and Age

In [117]:
# n_clusters=2, max_iter=300 (default), n_int=10 (default)


Calculate KMeans with x-axis as age and y-axis as fare. Set clusters == 2

In [118]:
k = 2
x_axis = age_table.values
y_axis = fare_table.values

kmeans = KMeans(n_clusters=k)
results = kmeans.fit_predict(x_axis, y_axis)


test_x_axis = test_age_table.values
test_y_axis = test_fare_table.values
test_kmeans = KMeans(n_clusters=k)
test_results = test_kmeans.fit_predict(test_x_axis, test_y_axis)


In [119]:
true_positives = 0
false_positives = 0
true_negatives = 0
false_negatives = 0

for pass_id in ids:
    condition = titanic_dataframe[titanic_dataframe.PassengerId == pass_id].Survived.values[0]
    predicted = results[pass_id - 1]
    if not predicted:
        if not condition:
            true_positives += 1
        else:
            false_positives += 1
    else:
        if not condition:
            false_negatives += 1
        else:
            true_negatives += 1

print(true_positives, false_positives, true_negatives, false_negatives)
print(results)            
    

415 259 83 134
[0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 1 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0
 1 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1
 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0
 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 1 1 0 0

In [120]:
p = true_positives/(true_positives + false_positives)

r = true_positives/(true_positives + false_negatives)

F1 = 2*((p*r)/(p+r))
print(F1)

0.6786590351594439


In [121]:
import csv

output = [int(not bool(x)) for x in test_results]
predictions_file = open("kmean.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(test_ids, output))
predictions_file.close()