In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training CSV from the working directory, report its
# (rows, columns) shape, then preview the first ten records.
data = pd.read_csv("train.csv")
print(data.shape)
data.head(n=10)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Summarise each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


We can also see that some columns contain NaN (missing) entries: 'Age', 'Cabin', and 'Embarked'. We therefore need a preprocessing step that either drops these features or fills in their missing values.

### Data Preprocessing

 - Dropping not useful columns/features
 - Filling the missing values (Data Imputation)

In [4]:
# Columns excluded from the feature set before modelling.
drop_columns = [
    "PassengerId",
    "Name",
    "Ticket",
    "Cabin",
    "Embarked",
]

In [5]:
# Build the modelling frame by removing the unwanted columns; the result is
# bound to a new name, so `data` still refers to the frame as loaded.
cleaned_data = data.drop(columns=drop_columns)
cleaned_data.shape

(891, 7)

cleaned_data.head(10)

Now we are left with fewer features, which should be more relevant for making predictions.
Also, the 'Sex' column contains string values, which we need to convert into numerical data. We will use LabelEncoder to encode them as numbers.

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# Encoder used below to turn the string categories in 'Sex' into integer codes.
le = LabelEncoder()

In [8]:
# Fit the encoder on the 'Sex' strings and overwrite the column with the
# resulting integer codes (the preview that follows shows female -> 0, male -> 1).
sex_codes = le.fit_transform(cleaned_data["Sex"])
cleaned_data["Sex"] = sex_codes

In [9]:
# Preview the first ten rows to confirm 'Sex' is now numeric.
cleaned_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05
5,0,3,1,,0,0,8.4583
6,0,1,1,54.0,0,0,51.8625
7,0,3,1,2.0,3,1,21.075
8,1,3,0,27.0,0,2,11.1333
9,1,2,0,14.0,1,0,30.0708


Although we have converted the strings into a numerical type, the 'Age' column still has a lot of NaN values, so we need to fill those entries with the mean value of the 'Age' column.

In [10]:
# Impute missing ages with the column mean. The fill value is keyed by column
# so that only 'Age' is touched; a bare scalar fillna would also write the mean
# age into any other column that happened to contain NaN.
cleaned_data = cleaned_data.fillna({"Age": cleaned_data["Age"].mean()})
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


We can see that the 'Age' column now has 891 non-null values as well.

###### Dividing our dataset into X (features) and Y (labels)

In [11]:
# Separate the target from the predictors by column name rather than by
# position, so the split stays correct if the column order ever changes.
X = cleaned_data.drop(columns="Survived")
Y = cleaned_data["Survived"]
print(X.shape)
print(Y.shape)

(891, 6)
(891,)


Next, we split the data into training and validation sets so the model can be evaluated on rows it was not trained on.

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for validation. random_state pins the shuffle so
# the split, and every metric computed from it, is identical on each
# Restart & Run All; without it the reported accuracy changes from run to run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [44]:
# Entropy-criterion decision tree limited to depth 4.
# random_state is fixed so that fitting is deterministic and the tree (and
# its score) can be reproduced on a fresh kernel.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)

In [45]:
# Train the tree on the training split only.
dt.fit(X_train,Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [48]:
# Predicted labels for the held-out validation rows.
# NOTE(review): `predictions` is not used by any later cell shown here.
predictions = dt.predict(X_val)

In [49]:
# Score the fitted tree on the held-out validation split.
dt.score(X_val,Y_val)

0.7932960893854749

### Visualising the Decision Tree Using Graphviz Library

In [58]:
import pydotplus
from six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz

In [59]:
# Serialise the fitted tree to Graphviz DOT text held in an in-memory buffer.
# feature_names / class_names label each node with the real column name and
# class value instead of generic placeholders, so the rendered tree is readable.
dot_data = StringIO()
export_graphviz(
    dt,
    out_file=dot_data,
    feature_names=list(X.columns),
    class_names=[str(label) for label in dt.classes_],
    filled=True,
    rounded=True,
)

In [60]:
# Parse the DOT text into a pydotplus graph, render it to PNG bytes and show
# the image inline. create_png() invokes an external Graphviz program, so
# Graphviz must be installed and runnable; the error recorded below this cell
# shows that invocation failing on a command path beginning 'C:\Users\SRISHTI'
# (NOTE(review): likely an install path containing a space; confirm).
dot_source = dot_data.getvalue()
graph = pydotplus.graph_from_dot_data(dot_source)
Image(graph.create_png())

InvocationException: Program terminated with status: 1. stderr follows: 'C:\Users\SRISHTI' is not recognized as an internal or external command,
operable program or batch file.
