In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder as LE

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../datasets/train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Columns kept for modelling: the Survived target plus five candidate features.
selected_cols = ["Pclass", "Survived", "Sex", "Age", "SibSp", "Parch"]

In [6]:
# TODO(review): scratch cell - the trailing `?` is IPython help syntax, and `data` is
# rebound to the real column selection (df.loc[:, selected_cols]) a few cells below.
# Delete this cell before sharing the notebook.
data = df.iloc?

In [None]:
# TODO(review): scratch cell - this binds `data` to the `.iloc` indexer itself, not to
# any rows; it is overwritten by df.loc[:, selected_cols] below. Delete before sharing.
data = df.iloc

In [7]:
# TODO(review): scratch cell - this binds `data` to the `.iloc` indexer itself, not to
# any rows; it is overwritten by df.loc[:, selected_cols] in the next cell. Delete
# before sharing.
data = df.iloc

In [8]:
# Take an explicit copy: later cells assign to columns of `data`, and working on a
# copy guarantees those writes land on `data` itself (no SettingWithCopyWarning and
# no accidental link back to `df`).
data = df.loc[:, selected_cols].copy()

In [9]:
# Preview the first rows only instead of rendering the whole frame.
data.head(10)

Unnamed: 0,Pclass,Survived,Sex,Age,SibSp,Parch
0,3,0,male,22.0,1,0
1,1,1,female,38.0,1,0
2,3,1,female,26.0,0,0
3,1,1,female,35.0,1,0
4,3,0,male,35.0,0,0
5,3,0,male,,0,0
6,1,0,male,54.0,0,0
7,3,0,male,2.0,3,1
8,3,1,female,27.0,0,2
9,2,1,female,14.0,1,0


In [10]:
le = LE()

In [11]:
# Replace the Sex strings with integer codes. Per `le.classes_` shown further down
# (['female', 'male']) and the encoded preview of X, female -> 0 and male -> 1.
data["Sex"] = le.fit_transform(data.Sex)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Pclass      891 non-null int64
Survived    891 non-null int64
Sex         891 non-null int32
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int32(1), int64(4)
memory usage: 38.4 KB


In [13]:
# TODO(review): leftover IPython help lookup (`?` suffix); remove before sharing.
data.Age.fillna?

In [14]:
# Impute missing ages with the mean age. The result is assigned back to the column
# rather than calling fillna(inplace=True) on `data.Age`: that chained in-place call
# operates on an intermediate Series and does not reliably update `data` (it is a
# no-op under pandas Copy-on-Write).
data["Age"] = data["Age"].fillna(data["Age"].mean())

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Pclass      891 non-null int64
Survived    891 non-null int64
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int32(1), int64(4)
memory usage: 38.4 KB


In [16]:
le.classes_

array(['female', 'male'], dtype=object)

In [17]:
def entropy(column):
    """Shannon entropy, in bits, of the values in ``column``.

    Every distinct value contributes ``p * log2(p)``, where ``p`` is its
    relative frequency; the negated total is returned.
    """
    n_items = len(column)
    frequencies = np.unique(column, return_counts=True)[1]
    weighted_logs = (
        (freq / n_items) * np.log2(freq / n_items) for freq in frequencies
    )
    return -sum(weighted_logs)

In [18]:
entropy(data.Survived)

0.9607079018756469

In [19]:
def info_gain(X,y,label):
    """Information gain from splitting ``y`` at the mean of ``X[label]``.

    Rows whose ``X[label]`` is below the mean form the left group and rows at
    or above it form the right group. Returns the entropy of ``y`` minus the
    size-weighted entropies of the two groups, or the sentinel ``-1000`` when
    either group is empty.
    """
    feature = X[label]
    threshold = np.mean(feature)

    below = y[feature < threshold]
    at_or_above = y[feature >= threshold]

    n_below, n_above = len(below), len(at_or_above)
    if n_below == 0 or n_above == 0:
        return -1000

    n_total = len(y)
    parent_entropy = entropy(y)
    left_term = (n_below / n_total) * entropy(below)
    right_term = (n_above / n_total) * entropy(at_or_above)
    return parent_entropy - left_term - right_term
    

In [20]:
# Feature frame (five candidate columns) and the Survived target series.
X = data[["Pclass", "Sex", "Age", "SibSp", "Parch"]]
y = data.loc[:, "Survived"]

In [21]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch
0,3,1,22.0,1,0
1,1,0,38.0,1,0
2,3,0,26.0,0,0
3,1,0,35.0,1,0
4,3,1,35.0,0,0


In [22]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [23]:
# Information gain of a mean-threshold split on each feature column.
for label in X.columns:
    print(label,info_gain(X,y,label))

Pclass 0.07579362743608165
Sex 0.2176601066606143
Age 0.001158644038169343
SibSp 0.009584541813400127
Parch 0.015380754493137666


In [24]:
class Node:
    """One node of the decision tree.

    An internal node stores the feature name in ``label`` and the split
    threshold in ``value``. A leaf keeps ``label`` as ``None`` and stores its
    prediction in ``result``.
    """

    def __init__(self,label=None,value=None,result=None):
        self.label=label
        self.value=value
        self.result=result
        # Children are attached by the tree builder after construction.
        # Initialising them here means a leaf exposes left/right as None
        # instead of raising AttributeError when they are accessed.
        self.left=None
        self.right=None

In [40]:
class DecisionTree:
    """Binary decision tree that splits each chosen feature at its mean.

    At every internal node the feature with the highest ``info_gain`` is
    selected; rows below that feature's mean go left and the rest go right.
    Every leaf stores the mean of the targets that reached it.
    """

    def __init__(self,max_depth = 5):
        self.max_depth = max_depth
        # Set by fit(); None means the tree has not been trained yet.
        self.root = None

    def fit(self,X,y):
        """Build the tree from feature frame ``X`` and target series ``y``."""
        self.root = self.generate(X,y,self.max_depth)

    def generate(self,X,y,depth):
        """Recursively build and return the subtree for the rows in ``X``/``y``.

        ``depth`` is the number of levels still allowed, counting the node
        being created; at 1 or less a leaf is returned.
        """
        # `<=` rather than `==` so a non-positive max_depth produces a single
        # leaf instead of skipping the depth limit altogether.
        if depth <= 1:
            return Node(result=np.mean(y))

        # Tuples compare on gain first, so max() picks the highest gain and
        # breaks ties on the column name.
        selected_label = max(
            (info_gain(X,y,label), label) for label in X.columns
        )[1]

        column = X[selected_label]
        pivot = np.mean(column)
        goes_left = column < pivot
        goes_right = column >= pivot

        left_y = y[goes_left]
        right_y = y[goes_right]

        # Even the best feature cannot separate these rows (e.g. it is
        # constant), so stop here. The leaf predicts the mean target like every
        # other leaf; the pivot is a feature value, not a prediction.
        if (len(left_y)==0) or (len(right_y)==0):
            return Node(result=np.mean(y))

        node = Node(selected_label,pivot)
        node.left = self.generate(X[goes_left],left_y,depth-1)
        node.right = self.generate(X[goes_right],right_y,depth-1)
        return node

    def display(self,node,indent=""):
        """Print the subtree under ``node``, one node per line, tab-indented by depth."""
        if node.label is None:
            # Leaf: name the outcome by comparing the stored value with 0.5.
            print(indent, "DIED" if node.result < 0.5 else "Survived")
            return

        print(indent,node.label,node.value)
        self.display(node.left,indent+"\t")
        self.display(node.right,indent+"\t")

    def predict_one(self,item,node):
        """Return the leaf value reached by ``item`` when walking down from ``node``.

        ``item`` must support ``item[feature_name]`` lookups (for example a
        row Series or a dict).
        """
        # Values below the node's threshold go left, everything else right.
        while node.label is not None:
            node = node.left if item[node.label] < node.value else node.right
        return node.result

    def predict(self,X,threshold=None):
        """Predict for every row of ``X``.

        Returns a NumPy float array holding one leaf value per row, in row
        order. If ``threshold`` is given, returns an int array of 0/1 instead,
        where 1 means the leaf value is ``>= threshold``.

        Raises ``RuntimeError`` if called before ``fit``.
        """
        if self.root is None:
            raise RuntimeError("DecisionTree.predict called before fit")
        scores = np.array(
            [self.predict_one(row,self.root) for _, row in X.iterrows()],
            dtype=float,
        )
        if threshold is None:
            return scores
        return (scores >= threshold).astype(int)
        
        

In [45]:
model = DecisionTree(max_depth=6)

In [46]:
model.fit(X,y)

In [47]:
model.display(model.root)

 Sex 0.6475869809203143
	 Pclass 2.159235668789809
		 Pclass 1.4470588235294117
			 Parch 0.4574468085106383
				 Age 35.46238970588235
					 Survived
					 Survived
				 Parch 1.4333333333333333
					 Survived
					 Survived
			 Parch 0.6052631578947368
				 Age 31.953294117647054
					 Survived
					 Survived
				 Age 24.096774193548388
					 Survived
					 Survived
		 Parch 0.7986111111111112
			 Age 26.090266435986152
				 SibSp 0.3076923076923077
					 Survived
					 DIED
				 Age 31.742883631713553
					 Survived
					 DIED
			 SibSp 1.694915254237288
				 Age 25.553934817170113
					 Survived
					 DIED
				 Parch 1.6363636363636365
					 DIED
					 DIED
	 Pclass 2.389948006932409
		 Pclass 1.4695652173913043
			 Age 39.28771697203473
				 Age 28.805818414322243
					 Survived
					 DIED
				 Age 52.93396226415094
					 DIED
					 DIED
			 Parch 0.2222222222222222
				 SibSp 0.2247191011235955
					 DIED
					 DIED
				 Age 18.57
					 Survived
					 DIED
		 Parch 0.224783861671

In [49]:
# Walk the first ten passengers (row labels 0-9) through the fitted tree; each printed
# value is whatever the reached leaf stored in `result`.
for i in range(10):
    print(model.predict_one(X.loc[i],model.root))

0.11267605633802817
0.9583333333333334
0.6071428571428571
1.0
0.06382978723404255
0.11864406779661017
0.13636363636363635
0.0
0.3333333333333333
0.9230769230769231


In [50]:
y[:10]

0    0
1    1
2    1
3    1
4    0
5    0
6    0
7    0
8    1
9    1
Name: Survived, dtype: int64