## Titanic Survivor Prediction Challenge

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the Kaggle Titanic training and test splits from the shared datasets folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../Datasets/titanic/train.csv", "../Datasets/titanic/test.csv")
)

In [3]:
# Preview the first rows of each frame.
# NOTE(review): only the last expression of a cell is rendered, so the table shown
# below is test_data.head(); the train_data.head() result is discarded.
train_data.head()
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Column dtypes and non-null counts of the training frame
# (Age, Cabin and Embarked contain missing values).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Columns removed from both frames before modelling.
columns_to_drop = ["PassengerId","Name","Ticket","Cabin","Embarked"]

# drop() returns new frames, so train_data / test_data stay untouched.
train_clean_data, test_clean_data = (
    frame.drop(columns=columns_to_drop) for frame in (train_data, test_data)
)

In [7]:
# Inspect the training frame after the unused columns were dropped.
train_clean_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.2500
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.9250
3,1,1,female,35.0,1,0,53.1000
4,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000
887,1,1,female,19.0,0,0,30.0000
888,0,3,female,,1,2,23.4500
889,1,1,male,26.0,0,0,30.0000


In [8]:
# Shared encoder instance; later cells use it to turn string labels into integer codes.
le = LabelEncoder()

In [9]:
# Learn the string -> integer mapping from the training column only, then apply that
# same mapping to both frames. Re-fitting on the test column would build an independent
# mapping that merely happens to agree when both splits contain the same categories.
le.fit(train_clean_data['Sex'])
train_clean_data['Sex'] = le.transform(train_clean_data['Sex'])
test_clean_data['Sex'] = le.transform(test_clean_data['Sex'])
train_clean_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.2500
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.9250
3,1,1,0,35.0,1,0,53.1000
4,0,3,1,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000
887,1,1,0,19.0,0,0,30.0000
888,0,3,0,,1,2,23.4500
889,1,1,1,26.0,0,0,30.0000


In [11]:
# Check which test columns still contain missing values (Age and Fare do).
test_clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
Pclass    418 non-null int64
Sex       418 non-null int32
Age       332 non-null float64
SibSp     418 non-null int64
Parch     418 non-null int64
Fare      417 non-null float64
dtypes: float64(2), int32(1), int64(3)
memory usage: 18.1 KB


In [12]:
# Impute missing ages with the TRAINING mean in both frames: preprocessing statistics
# should come from the data the model is fitted on, not from the test set.
# Filling with the mean leaves the mean unchanged, so re-running this cell is harmless.
train_age_mean = train_clean_data['Age'].mean()
train_clean_data['Age'] = train_clean_data['Age'].fillna(train_age_mean)
test_clean_data['Age'] = test_clean_data['Age'].fillna(train_age_mean)

In [14]:
# Re-check the test frame after filling Age; Fare still has one missing value.
test_clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
Pclass    418 non-null int64
Sex       418 non-null int32
Age       418 non-null float64
SibSp     418 non-null int64
Parch     418 non-null int64
Fare      417 non-null float64
dtypes: float64(2), int32(1), int64(3)
memory usage: 18.1 KB


In [15]:
# Fill the missing test fare with the TRAINING mean, so the test set is imputed with
# statistics from the data the model is fitted on (the training Fare column has no gaps).
test_clean_data['Fare'] = test_clean_data['Fare'].fillna(train_clean_data['Fare'].mean())

In [16]:
# Feature columns fed to the model and the label column it should predict.
input_cols = ["Pclass","Sex","Age","SibSp","Parch","Fare"]
output_cols = ["Survived"]

# Selecting with a list keeps both results as DataFrames (Y has a single column).
X, Y = train_clean_data[input_cols], train_clean_data[output_cols]

In [17]:
# Sanity check: (rows, feature columns) of the feature frame.
X.shape

(891, 6)

In [18]:
# Sanity check: the label frame has the same number of rows and one column.
Y.shape

(891, 1)

In [19]:
def entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Parameters
    ----------
    col : array-like exposing ``.shape`` (NumPy array or pandas Series)
        The labels whose distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of ``col``;
        ``0.0`` when ``col`` is empty or holds a single distinct value.
    """
    _, frequencies = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    bits = 0.0
    for frequency in frequencies:
        prob = frequency / total
        # Every distinct value occurs at least once, so prob > 0 and log2 is finite.
        bits -= prob * np.log2(prob)

    return bits

In [20]:
def divide_data(x_data,fkey,fval):
    """Split ``x_data`` into two frames on the threshold ``fval`` of column ``fkey``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to split; may carry any index.
    fkey : str
        Name of the column to compare.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``x_data[fkey] > fval``; ``x_left`` holds
        every other row (including rows whose value is NaN). Both keep the original
        columns, dtypes and index labels.
    """
    # A single vectorised comparison replaces the row-by-row DataFrame.append loop,
    # which was quadratic, needed a 0..n-1 index, and no longer exists in pandas >= 2.0.
    goes_right = x_data[fkey] > fval

    # NaN compares as False, so negating the mask sends missing values to the left.
    x_left = x_data[~goes_right]
    x_right = x_data[goes_right]

    return x_left,x_right

In [21]:
def information_gain(x_data,fkey,fval,target="Survived"):
    """Information gain obtained by splitting ``x_data`` on ``fkey`` at ``fval``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to evaluate; must contain the ``fkey`` and ``target`` columns.
    fkey : str
        Feature column used for the split.
    fval : scalar
        Threshold; rows with ``x_data[fkey] > fval`` go to the right partition.
    target : str, default "Survived"
        Label column whose entropy is measured.

    Returns
    -------
    float
        Entropy of the parent minus the size-weighted entropy of both partitions,
        or the sentinel ``-1000000`` when the split is degenerate (the input is
        empty or every row falls on one side).
    """
    n_rows = x_data.shape[0]

    # Guard first: an empty frame would otherwise divide by zero below.
    if n_rows == 0:
        return -1000000

    left,right = divide_data(x_data,fkey,fval)

    # A split that leaves one side empty separates nothing.
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -1000000

    # Fraction of rows landing in each partition.
    l = float(left.shape[0])/n_rows
    r = float(right.shape[0])/n_rows

    i_g = entropy(x_data[target]) - (l * entropy(left[target]) + r * entropy(right[target]))

    return i_g

In [22]:
# Information gain of every feature when the split threshold is its column mean.
for fx in X.columns:
    print(fx)
    threshold = train_clean_data[fx].mean()
    print(information_gain(train_clean_data, fx, threshold))

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.0008836151229467681
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464


In [34]:
class DecisionTree:
    """Binary decision tree that predicts "Survive" / "Dead".

    Every node stores the majority label of the rows it was trained on in
    ``target``. Internal nodes additionally store the split feature (``fkey``),
    its threshold (``fval``, the feature mean at that node) and two children;
    leaves keep ``left`` and ``right`` as ``None``, and ``fkey`` / ``fval`` are
    only meaningful on internal nodes.
    """

    # Candidate split features examined at every node.
    FEATURES = ('Pclass','Sex','Age','SibSp','Parch','Fare')

    def __init__(self,depth=0,max_depth=5):
        """Create an untrained node.

        depth     -- depth of this node (the root is 0).
        max_depth -- nodes at this depth become leaves and are not split.
        """
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None

    @staticmethod
    def _majority_label(X_train):
        """Return "Survive" when at least half of the rows survived, else "Dead"."""
        return "Survive" if X_train.Survived.mean() >= 0.5 else "Dead"

    def train(self,X_train):
        """Fit this node, and recursively its children, on ``X_train``.

        ``X_train`` is a DataFrame holding the FEATURES columns and a 0/1
        ``Survived`` column. Nothing is returned; the node is updated in place.
        """
        # Every node, leaf or internal, records the majority label of its rows.
        self.target = self._majority_label(X_train)

        # Depth limit reached: this node is a leaf, so skip the split search.
        if self.depth >= self.max_depth:
            return

        # Score each feature when split at its mean and keep the best one.
        info_gains = [
            information_gain(X_train,feature,X_train[feature].mean())
            for feature in self.FEATURES
        ]
        best_key = self.FEATURES[int(np.argmax(info_gains))]
        best_val = X_train[best_key].mean()

        # Split data; children get a fresh 0..n-1 index.
        data_left,data_right = divide_data(X_train,best_key,best_val)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        # Even the best split leaves one side empty: nothing to separate, stay a leaf.
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            return

        self.fkey = best_key
        self.fval = best_val

        # Recursive case
        self.left = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.train(data_left)

        self.right = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.train(data_right)

        return

    def predict(self,test):
        """Return "Survive" or "Dead" for one sample.

        ``test`` is any mapping from feature name to value (for example a
        DataFrame row). Values greater than the node threshold follow the
        right child, all others (including NaN) the left child.
        """
        # Leaf: no split to evaluate.
        if self.left is None and self.right is None:
            return self.target

        # Check the child that is actually about to be followed; the previous code
        # tested self.right before descending into self.left.
        child = self.right if test[self.fkey] > self.fval else self.left
        if child is None:
            return self.target

        return child.predict(test)
            
    
        

### Train on the Full Training Set and Predict on the Test Set

In [35]:
# Sanity check: rows and columns of the cleaned training frame.
train_clean_data.shape

(891, 7)

In [36]:
# Build a tree with the default depth limit (max_depth=5) and fit it on the
# whole cleaned training frame, which includes the Survived column.
dt = DecisionTree()
dt.train(train_clean_data)

In [41]:
# PassengerId was dropped from test_clean_data, so take it from the raw test frame
# for use in the submission file.
test_passenger_id = test_data['PassengerId'].values

In [43]:
# Sanity check: number of passenger ids.
test_passenger_id.shape

(418,)

In [44]:
# Sanity check: the cleaned test frame has one row per passenger id.
test_clean_data.shape

(418, 6)

In [45]:
# One prediction per test row; rows are fetched by label, which relies on
# test_clean_data carrying its 0..n-1 RangeIndex.
y_pred = [
    dt.predict(test_clean_data.loc[row_label])
    for row_label in range(test_clean_data.shape[0])
]

In [49]:
# Map the tree's string labels onto the 0/1 codes expected in the 'Survived' column.
# An explicit mapping is used instead of re-fitting `le`: a fitted LabelEncoder numbers
# only the labels it happens to see, so predictions that are all "Survive" would be
# encoded as 0, and re-fitting would also overwrite the encoder fitted on 'Sex'.
label_to_code = {"Dead": 0, "Survive": 1}
# .get(label, label) passes already-encoded values through, so re-running the cell is safe.
y_pred = np.array([label_to_code.get(label, label) for label in y_pred])

In [53]:
# Sanity check before writing: number of passenger ids ...
test_passenger_id.shape

(418,)

In [55]:
# ... must equal the number of predictions.
len(y_pred)

418

In [56]:
import csv

In [63]:
# Write the submission in a single cell and inside a context manager, so the file is
# flushed and closed even if writing a row fails. Previously the handle was opened in
# one cell and closed several cells later, leaking it whenever a cell in between
# errored or was skipped.
# newline='' lets the csv module control line endings itself.
with open('submission.csv','w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['PassengerId','Survived'])
    # One row per test passenger: id followed by the 0/1 prediction.
    writer.writerows(zip(test_passenger_id, y_pred))