In [1]:
import pandas as pd

In [2]:
# Read the Titanic passenger records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Preprocessing

We will predict whether a passenger survived based on passenger class (Pclass), sex, age and fare.
So we take only those four columns as inputs and the Survived column as the output.

In [3]:
# Predictor columns: passenger class, sex, age and fare. The explicit copy
# gives `inputs` its own data, so later column edits do not touch `df`.
inputs = df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare']].copy()
# Label to predict, taken from the Survived column.
target = df.loc[:, 'Survived']

In [4]:
# Preview the first five rows of the selected feature columns.
inputs.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


The 'Sex' attribute is categorical (text), which the model cannot use directly, so we convert it to numerical form using a Label Encoder.

In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct Sex labels first, then map every row to its integer code.
# The preview below shows the resulting mapping: female -> 0, male -> 1.
le_sex = LabelEncoder()
le_sex.fit(inputs['Sex'])
inputs['sex_n'] = le_sex.transform(inputs['Sex'])
# Preview the frame with the new sex_n column next to the original Sex text.
inputs.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,sex_n
0,3,male,22.0,7.25,1
1,1,female,38.0,71.2833,0
2,3,female,26.0,7.925,0
3,1,female,35.0,53.1,0
4,3,male,35.0,8.05,1


In [6]:
# Remove the text column; the numeric sex_n column is what the model will use.
# Rebinding `inputs` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError for the already-removed 'Sex' column.
inputs = inputs.drop(columns=['Sex'], errors='ignore')

In [7]:
# Preview the first five rows after the Sex column has been removed.
inputs.head(n=5)

Unnamed: 0,Pclass,Age,Fare,sex_n
0,3,22.0,7.25,1
1,1,38.0,71.2833,0
2,3,26.0,7.925,0
3,1,35.0,53.1,0
4,3,35.0,8.05,1


In [8]:
# True if at least one Pclass value is missing.
pd.isnull(inputs['Pclass']).values.any()

False

In [9]:
# True if at least one Age value is missing.
pd.isnull(inputs['Age']).values.any()

True

In [10]:
# True if at least one Fare value is missing.
pd.isnull(inputs['Fare']).values.any()

False

In [11]:
# True if at least one encoded sex value is missing.
pd.isnull(inputs['sex_n']).values.any()

False

Some rows have no recorded Age, so the value there is NaN, which will not be accepted by the model.
We therefore compute the mean of the known ages and fill the NaN entries with it.

In [12]:
# Mean of the recorded ages; Series.mean() skips NaN entries by default.
# NOTE(review): this mean is computed over all rows before the train/test
# split further down, so test-set ages influence the imputed value.
mean_age = inputs['Age'].mean()
# Assign the filled column back to the frame. The previous form,
# `inputs.Age.fillna(mean_age, inplace=True)`, is chained in-place assignment:
# pandas 2.x warns about it, and with Copy-on-Write enabled it only modifies a
# temporary Series, leaving the NaN values in `inputs` untouched.
inputs['Age'] = inputs['Age'].fillna(mean_age)

In [13]:
# Display the feature frame after the Age fill; the rendered output elides the
# middle rows (shown as "...").
inputs

Unnamed: 0,Pclass,Age,Fare,sex_n
0,3,22.000000,7.2500,1
1,1,38.000000,71.2833,0
2,3,26.000000,7.9250,0
3,1,35.000000,53.1000,0
4,3,35.000000,8.0500,1
...,...,...,...,...
886,2,27.000000,13.0000,1
887,1,19.000000,30.0000,0
888,3,29.699118,23.4500,0
889,1,26.000000,30.0000,1


In [14]:
# Re-check Age for missing values now that the fill step has run.
pd.isnull(inputs['Age']).values.any()

False

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    train_size=0.8,
    random_state=18,
)

### Choosing Decision Tree Classifier Model

In [16]:
from sklearn.tree import DecisionTreeClassifier
# Fix random_state so the fitted tree, and therefore the reported score, is
# reproducible. Without it the tree can differ between runs because features
# are randomly permuted at each split and ties between equally good splits are
# broken at random. The seed matches the one used for the train/test split.
# Recorded outputs from earlier unseeded runs may differ slightly.
model = DecisionTreeClassifier(random_state=18)

In [17]:
# Train the decision tree on the training split.
model.fit(X=x_train, y=y_train)

DecisionTreeClassifier()

In [18]:
# Mean accuracy of the fitted tree on the held-out test split.
model.score(X=x_test, y=y_test)

0.8100558659217877