# Data Preprocessing

In [123]:
import pandas as pd
import numpy as np

In [2]:
# Load the training CSV from the notebook's working directory.
data=pd.read_csv("train.csv")

In [3]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Dtypes and non-null counts per column; Age, Cabin and Embarked have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.2+ KB


In [5]:
# Columns removed before modelling; none of them is used as a feature below.
columns_to_drop=["PassengerId","Name","Ticket","Cabin","Embarked"]

In [6]:
# Build the working frame without the unused columns; `data` itself is left untouched.
data_clean=data.drop(columns=columns_to_drop)

In [7]:
# Preview the frame after dropping the unused columns.
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [8]:
# Age is the only remaining column with missing values; Sex is still a string column.
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 45.3+ KB


In [10]:
# Convert the string Sex column to integer codes so it can be used as a numeric feature.
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

# In the preview that follows, female rows show 0 and male rows show 1.
data_clean["Sex"]=le.fit_transform(data_clean["Sex"])

In [11]:
# Fill the missing Age values with the mean age. The fill is restricted to the
# Age column so a NaN in any other column is never silently replaced by an age.
data_clean=data_clean.fillna({"Age":data_clean["Age"].mean()})

In [12]:
# Preview after encoding Sex and filling Age.
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [13]:
# Confirm that every column is now numeric and fully populated.
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.3 KB


In [17]:
# Inspect the row with index label 2; it is displayed as a single float64 Series.
data_clean.loc[2]

Survived     1.000
Pclass       3.000
Sex          0.000
Age         26.000
SibSp        0.000
Parch        0.000
Fare         7.925
Name: 2, dtype: float64

In [18]:
# Feature columns fed to the models, and the single label column.
input_col=["Pclass","Sex","Age","SibSp","Parch","Fare"]
output_col=["Survived"]

# Selecting with a list keeps both X and Y as two-dimensional DataFrames.
X=data_clean[input_col]
Y=data_clean[output_col]

In [19]:
# Check that features and labels have the same number of rows.
print(X.shape,Y.shape)

(891, 6) (891, 1)


# Define entropy and information gain

In [20]:
def entropy(col):
    """Return the Shannon entropy (in bits) of the values in ``col``.

    ``col`` must expose ``shape[0]`` (a NumPy array or pandas Series). Each
    distinct value contributes ``-p * log2(p)``, where ``p`` is its relative
    frequency. An empty input gives ``0.0``.
    """
    _,frequencies=np.unique(col,return_counts=True)
    total=float(col.shape[0])

    result=0.0
    for freq in frequencies:
        prob=freq/total
        result-=prob*np.log2(prob)

    return result

In [21]:
# Sanity check on a small binary array (five ones, two zeros).
col=np.array([1,0,1,1,1,0,1])
entropy(col)

0.863120568566631

In [23]:
def divide_data(x_data,fkey,fval):
    """Split a DataFrame into two parts by thresholding one column.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to split.
    fkey : str
        Name of the column to compare against the threshold.
    fval : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``x_data[fkey] > fval``; ``x_left``
        holds every other row (including rows whose value is NaN). Both keep
        the original index labels, row order and column dtypes.
    """
    # One boolean mask replaces the per-row DataFrame.append loop: append was
    # removed in pandas 2.0, it copied the whole frame on every call, and the
    # .loc[ix] lookup only worked when the index was exactly 0..n-1.
    goes_right=x_data[fkey]>fval
    x_left=x_data[~goes_right]
    x_right=x_data[goes_right]

    return x_left,x_right

In [26]:
def information_gain(x_data,fkey,fval,target="Survived"):
    """Return the information gain of splitting ``x_data`` on ``fkey`` at ``fval``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to evaluate; must contain the ``fkey`` and ``target`` columns.
    fkey : str
        Feature column used for the split.
    fval : scalar
        Threshold passed to ``divide_data``.
    target : str, optional
        Label column whose entropy is measured. Defaults to ``"Survived"``.

    Returns
    -------
    float or int
        Parent entropy minus the size-weighted entropy of the two children,
        or the sentinel ``-1000000`` when either side of the split is empty.
    """
    left,right=divide_data(x_data,fkey,fval)

    # Reject degenerate splits before dividing by the row count, so an empty
    # x_data yields the sentinel instead of raising ZeroDivisionError.
    if left.shape[0]==0 or right.shape[0]==0:
        return -1000000

    total=float(x_data.shape[0])
    l=left.shape[0]/total
    r=right.shape[0]/total

    i_gain=entropy(x_data[target])-(l*entropy(left[target])+r*entropy(right[target]))
    return i_gain

In [27]:
# Information gain of every feature when the whole frame is split at that feature's mean.
for fx in X.columns:
    print(fx)
    print(information_gain(data_clean,fx,data_clean[fx].mean()))

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.0008836151229467681
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464


## Implementing Decision Tree

In [55]:
class DecisionTree:
    """Binary decision tree whose nodes split one feature at its mean.

    Each node picks the feature with the highest information gain, using the
    mean of that feature over the rows reaching the node as the threshold.
    Rows with a value strictly greater than the threshold go to the right
    child, all other rows go to the left child.

    Attributes
    ----------
    left, right : DecisionTree or None
        Child nodes; both stay ``None`` on a leaf.
    fkey : str or None
        Feature this node splits on (set by ``train``).
    fval : float or None
        Threshold for ``fkey`` (set by ``train``).
    depth : int
        Depth of this node; the root is 0.
    max_depth : int
        Nodes at this depth are not split any further.
    target : str or None
        Majority label of the rows seen by this node: "Survived" or "Dead".
    """

    def __init__(self,depth=0,max_depth=5):
        """Create an untrained node at ``depth`` with the given depth limit."""
        self.left=None
        self.right=None
        self.fkey=None
        self.fval=None
        self.depth=depth
        self.max_depth=max_depth
        self.target=None

    @staticmethod
    def _majority_label(xtrain):
        """Return "Survived" if at least half of the rows survived, else "Dead"."""
        if xtrain.Survived.mean()>=0.5:
            return "Survived"
        return "Dead"

    def train(self,xtrain):
        """Fit this node, and recursively its children, on ``xtrain``.

        ``xtrain`` is a DataFrame holding the six feature columns listed
        below plus the ``Survived`` label column.
        """
        features=["Pclass","Sex","Age","SibSp","Parch","Fare"]

        # Score every candidate feature, always thresholding at its mean.
        info_gains=[information_gain(xtrain,ix,xtrain[ix].mean()) for ix in features]

        self.fkey=features[np.argmax(info_gains)]
        self.fval=xtrain[self.fkey].mean()
        print("making tree at feature",self.fkey)

        # Every node, internal or leaf, stores the majority label of its rows;
        # predict() only reads it on nodes that have no children.
        self.target=self._majority_label(xtrain)

        # split data
        ldata,rdata=divide_data(xtrain,self.fkey,self.fval)
        ldata=ldata.reset_index(drop=True)
        rdata=rdata.reset_index(drop=True)

        # truly a leaf node: the best split leaves one side empty
        if ldata.shape[0]==0 or rdata.shape[0]==0:
            return

        # max depth reached
        if self.depth>=self.max_depth:
            return

        # recursive case
        self.left=DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(ldata)

        self.right=DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(rdata)
        return

    def predict(self,test):
        """Return the predicted label ("Survived" or "Dead") for one row.

        ``test`` is a single row that can be indexed by feature name, such as
        a pandas Series taken from a DataFrame.
        """
        # Route with the same strict ">" rule that divide_data applies during
        # training; using ">=" here would send a value equal to the threshold
        # down the opposite branch from the one it was trained on.
        if test[self.fkey]>self.fval:
            child=self.right
        else:
            child=self.left

        if child is None:
            return self.target
        return child.predict(test)
            
            
        
        

## Train-Test Split

In [56]:
# Positional 70/30 split: the first 70% of rows are used for training and the
# remaining rows for testing. The rows are not shuffled.
split=int(0.7*data_clean.shape[0])
train_data=data_clean[:split]
test_data=data_clean[split:]
# Renumber the test rows from 0 so they can be looked up by label with .loc[ix] later.
test_data=test_data.reset_index(drop=True)

In [57]:
# Row and column counts of the two splits.
print(train_data.shape,test_data.shape)

(623, 7) (268, 7)


In [58]:
# Fit the hand-written tree with its default max_depth of 5; train() prints one line per node.
dt=DecisionTree()
dt.train(train_data)

making tree at feature Sex
making tree at feature Pclass
making tree at feature Age
making tree at feature SibSp
making tree at feature Pclass
making tree at feature Age
making tree at feature Age
making tree at feature SibSp
making tree at feature Parch
making tree at feature Pclass
making tree at feature SibSp
making tree at feature Fare
making tree at feature Parch
making tree at feature Age
making tree at feature Pclass
making tree at feature Age
making tree at feature Age
making tree at feature Parch
making tree at feature SibSp
making tree at feature Fare
making tree at feature Age
making tree at feature Age
making tree at feature Fare
making tree at feature Age
making tree at feature Age
making tree at feature Fare
making tree at feature Age
making tree at feature Parch
making tree at feature Fare
making tree at feature Fare
making tree at feature Fare
making tree at feature Age
making tree at feature Fare
making tree at feature Parch
making tree at feature Fare
making tree at f

In [59]:
# Inspect the root split (feature and threshold) and the features chosen by its two children.
print(dt.fkey)
print(dt.fval)
print(dt.left.fkey)
print(dt.right.fkey)

Sex
0.6292134831460674
Pclass
Fare


In [67]:
# Predict a label for every test row, looking each row up by its 0..n-1 index label.
y_pred=[dt.predict(test_data.loc[row]) for row in range(test_data.shape[0])]

In [68]:
# Display the predicted string labels.
y_pred

['Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Survived',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Dead',
 'Survived',
 'Survived',
 'Dead',
 'Dead',
 'Survived',
 'Dead',
 'Dead',


In [69]:
# True labels of the test rows, kept as a single-column DataFrame.
y_actual=test_data[output_col]

In [70]:
# Create a new LabelEncoder; this rebinds the name `le` that was used earlier for the Sex column.
le=LabelEncoder()

In [71]:
# Map the string predictions to integers with a fixed vocabulary: fitting on both
# class names guarantees Dead -> 0 and Survived -> 1 even when the predictions
# happen to contain only one of the two labels.
y_pred=le.fit(["Dead","Survived"]).transform(y_pred)

In [72]:
# Turn the predictions into a column vector so their shape matches y_actual.
y_pred=np.array(y_pred).reshape((-1,1))

In [73]:
# Both shapes must be identical for the element-wise comparison in the accuracy cell.
print(y_pred.shape)
print(y_actual.shape)

(268, 1)
(268, 1)


In [74]:
# Accuracy of the hand-written tree: number of matching labels divided by the number
# of test rows. Both arrays are (n, 1) here, so the comparison is element-wise.
acc=np.sum(np.array(y_pred)==np.array(y_actual))/y_pred.shape[0]

In [75]:
# Test accuracy of the hand-written tree.
print(acc)

0.8171641791044776


## Decision Tree using sklearn

In [76]:
from sklearn.tree import DecisionTreeClassifier

In [103]:
# Same criterion and depth limit as the hand-written tree. random_state pins
# scikit-learn's internal tie-breaking so repeated runs build the same tree.
sk_tree=DecisionTreeClassifier(criterion="entropy",max_depth=5,random_state=42)

In [104]:
# Fit on the same training split that the hand-written tree used.
sk_tree.fit(train_data[input_col],train_data[output_col])

DecisionTreeClassifier(criterion='entropy', max_depth=5)

In [105]:
# Predicted labels (0/1) for the test rows.
sk_tree.predict(test_data[input_col])

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0], dtype=int64)

In [106]:
# Score of the scikit-learn tree on the test split, for comparison with the hand-written tree.
sk_tree.score(test_data[input_col],test_data[output_col])

0.8283582089552238

## Visualise Decision Tree

In [120]:
import pydotplus
# sklearn.externals.six is no longer shipped with scikit-learn; the standard
# library StringIO provides the same in-memory text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz

In [121]:
# Write the fitted tree as GraphViz DOT text into an in-memory buffer.
dot_data=StringIO()
export_graphviz(sk_tree,out_file=dot_data,filled=True,rounded=True)

In [122]:
# Render the DOT text to a PNG and display it inline.
# NOTE(review): the recorded output for this cell is "GraphViz's executables not found",
# so the GraphViz binaries presumably have to be installed and discoverable - confirm.
graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

InvocationException: GraphViz's executables not found

## Random Forests

In [124]:
# Feature frames for both splits.
x_train=train_data[input_col]
x_test=test_data[input_col]
# Flatten the single-column label frames into one-dimensional arrays.
y_train=np.ravel(np.array(train_data[output_col]))
y_test=np.ravel(np.array(test_data[output_col]))

In [128]:
from sklearn.ensemble import RandomForestClassifier

In [130]:
# Ten entropy-based trees of depth at most 5. random_state fixes the random
# sampling done while growing the forest so the scores below are repeatable.
rf=RandomForestClassifier(n_estimators=10,criterion="entropy",max_depth=5,random_state=42)

In [131]:
# Fit the forest on the training split.
rf.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=10)

In [132]:
# Score on the data the forest was trained on.
rf.score(x_train,y_train)

0.8394863563402889

In [133]:
# Score on the held-out test split.
rf.score(x_test,y_test)

0.8395522388059702

In [134]:
from sklearn.model_selection import cross_val_score

In [135]:
# Cross-validated scores of the same forest configuration on the training split.
# random_state makes every fold's forest, and therefore the scores, repeatable.
acc=cross_val_score(RandomForestClassifier(n_estimators=10,criterion="entropy",max_depth=5,random_state=42),x_train,y_train)

In [137]:
# Collapse the per-fold scores into their mean; this rebinds `acc` from an array to a scalar.
acc=acc.mean()

In [138]:
# Mean cross-validated score of the random forest.
print(acc)

0.8074064516129033
