# Decision Tree algorithm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# Load the salaries dataset (expects salaries.csv in the working directory)
# and preview the first five rows.
df = pd.read_csv("salaries.csv")
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [3]:
# Feature frame: every column of df except the target label.
X = df.drop(columns="salary_more_then_100k")
X

Unnamed: 0,company,job,degree
0,google,sales executive,bachelors
1,google,sales executive,masters
2,google,business manager,bachelors
3,google,business manager,masters
4,google,computer programmer,bachelors
5,google,computer programmer,masters
6,abc pharma,sales executive,masters
7,abc pharma,computer programmer,bachelors
8,abc pharma,business manager,bachelors
9,abc pharma,business manager,masters


In [4]:
# Target vector: the 0/1 salary label column.
y = df["salary_more_then_100k"]
y

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     0
8     0
9     1
10    1
11    1
12    1
13    1
14    1
15    1
Name: salary_more_then_100k, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder


In [6]:
# One independent encoder per categorical column, so each keeps its own
# label -> integer mapping.
le_company, le_job, le_degree = (LabelEncoder() for _ in range(3))

In [7]:
# Add an integer-coded copy of each text column next to the original.
# Each encoder is fitted on its own column, so the codes are only meaningful
# per column (e.g. in the preview below google -> 2 in company_n and
# sales executive -> 2 in job_n).
X['company_n'] = le_company.fit_transform(X['company'])
X['job_n'] = le_job.fit_transform(X['job'])
X['degree_n'] = le_degree.fit_transform(X['degree'])
X.head()

Unnamed: 0,company,job,degree,company_n,job_n,degree_n
0,google,sales executive,bachelors,2,2,0
1,google,sales executive,masters,2,2,1
2,google,business manager,bachelors,2,0,0
3,google,business manager,masters,2,0,1
4,google,computer programmer,bachelors,2,1,0


In [8]:
# Keep only the numeric *_n columns; the original text columns cannot be fed
# to the classifier.
X_final = X.drop(columns=['company', 'job', 'degree'])
X_final

Unnamed: 0,company_n,job_n,degree_n
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0
5,2,1,1
6,0,2,1
7,0,1,0
8,0,0,0
9,0,0,1


In [9]:
from sklearn import tree

In [10]:
# Seed the estimator: scikit-learn permutes the candidate features at every
# split, so without a fixed random_state two fits on the same data may break
# ties differently and produce different trees.
model = tree.DecisionTreeClassifier(random_state=42)

In [11]:
# Fit the tree on the encoded features (company_n, job_n, degree_n) and the
# salary label.
model.fit(X_final, y)

DecisionTreeClassifier()

In [12]:
# Accuracy on the very rows the tree was fitted on, so this measures how well
# the tree reproduces its training data, not how it generalises.
model.score(X_final, y)

1.0

In [13]:
# Feature order is [company_n, job_n, degree_n]. Per the encoded table above,
# [2, 2, 0] is google / sales executive / bachelors.
model.predict([[2,2,0]])

array([0], dtype=int64)

In [14]:
# Feature order is [company_n, job_n, degree_n]. Per the encoded table above,
# [0, 0, 1] is abc pharma / business manager / masters.
model.predict([[0,0,1]])

array([1], dtype=int64)

# Exercise - Titanic passenger survival

In [15]:
# Load the Titanic passenger dataset (expects titanic.csv in the working
# directory) and display it.
df_titanic = pd.read_csv("titanic.csv")
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [16]:
# PassengerId, Name, SibSp, Parch, Ticket, Cabin and Embarked are not used in
# this study; Survived is the target (dependent variable) and is split off
# separately. What remains - Pclass, Sex, Age, Fare - are the independent
# variables fed to the model.
Input = df_titanic.drop(
    columns=['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
)
Input

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.2500
1,1,female,38.0,71.2833
2,3,female,26.0,7.9250
3,1,female,35.0,53.1000
4,3,male,35.0,8.0500
...,...,...,...,...
886,2,male,27.0,13.0000
887,1,female,19.0,30.0000
888,3,female,,23.4500
889,1,male,26.0,30.0000


In [17]:
# Count missing values per column.
Input.isnull().sum()

# Age is the only column with missing values: 177 of the 891 rows. That is too
# large a share to discard, so the NaNs are replaced with the mean age
# instead of dropping those rows.

Pclass      0
Sex         0
Age       177
Fare        0
dtype: int64

In [18]:
# Impute the missing ages rather than dropping rows: fill NaN in "Age" with
# the column mean truncated to a whole number of years.
# - Input["Age"].mean() is a scalar, so int() is applied to a float; the
#   previous Input[["Age"]].mean() returned a one-element Series, and calling
#   int() on a Series is deprecated in recent pandas.
# - The dict form restricts the fill to the Age column, so a NaN appearing in
#   any other column can never be silently replaced by an age value.
Input = Input.fillna({"Age": int(Input["Age"].mean())})
Input



Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.2500
1,1,female,38.0,71.2833
2,3,female,26.0,7.9250
3,1,female,35.0,53.1000
4,3,male,35.0,8.0500
...,...,...,...,...
886,2,male,27.0,13.0000
887,1,female,19.0,30.0000
888,3,female,29.0,23.4500
889,1,male,26.0,30.0000


In [19]:
# Confirm that no column has missing values left after the imputation.
Input.isna().sum()

Pclass    0
Sex       0
Age       0
Fare      0
dtype: int64

In [20]:
# (rows, columns) of the feature frame; the row count should be unchanged,
# since missing ages were filled in rather than dropped.
Input.shape

(891, 4)

In [21]:
# Target vector: the 0/1 Survived column of the raw frame.
Target = df_titanic["Survived"]
Target

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [22]:
# Number of missing labels in the target.
Target.isna().sum()

0

In [23]:
# Encoder used to turn the text column Sex into integer codes.
le_sex = LabelEncoder()

In [24]:
# Add an integer-coded copy of Sex. In the preview that follows, female is
# encoded as 0 and male as 1.
Input['Sex_n'] = le_sex.fit_transform(Input['Sex'])

In [25]:
# Preview the frame with the new Sex_n column alongside the original Sex.
Input.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Sex_n
0,3,male,22.0,7.25,1
1,1,female,38.0,71.2833,0
2,3,female,26.0,7.925,0
3,1,female,35.0,53.1,0
4,3,male,35.0,8.05,1


In [26]:
# Final Titanic feature matrix: drop the text column now that Sex_n carries
# the same information numerically.
# Note: this rebinds X_final, which earlier held the salaries feature matrix.
X_final = Input.drop(columns=['Sex'])
X_final

Unnamed: 0,Pclass,Age,Fare,Sex_n
0,3,22.0,7.2500,1
1,1,38.0,71.2833,0
2,3,26.0,7.9250,0
3,1,35.0,53.1000,0
4,3,35.0,8.0500,1
...,...,...,...,...
886,2,27.0,13.0000,1
887,1,19.0,30.0000,0
888,3,29.0,23.4500,0
889,1,26.0,30.0000,1


In [27]:
# Hold out 20% of the rows for evaluation (test_size=0.2).
# random_state pins the shuffle: without it every run draws a different
# split, so the fitted tree, its score and its predictions cannot be
# reproduced after a kernel restart.
from sklearn.model_selection import train_test_split
X_final_train, X_final_test, y_train, y_test = train_test_split(
    X_final, Target, test_size=0.2, random_state=42
)

In [28]:
# Number of rows in the training split.
len(X_final_train)

712

In [29]:
# Number of rows in the held-out test split.
len(X_final_test)

179

In [30]:
from sklearn import tree
# Seed the estimator: scikit-learn permutes the candidate features at every
# split, so without a fixed random_state two fits on the same data may break
# ties differently and produce different trees.
model_titanic = tree.DecisionTreeClassifier(random_state=42)

In [32]:
# Fit the tree on the training split only; the test split stays unseen.
model_titanic.fit(X_final_train, y_train)

DecisionTreeClassifier()

In [33]:
# NOTE(review): this is accuracy on the training split the tree was fitted on,
# so it is an optimistic figure. The held-out split is never scored in this
# notebook; use model_titanic.score(X_final_test, y_test) for an estimate of
# how the model performs on unseen passengers.
model_titanic.score(X_final_train, y_train)

0.976123595505618

In [34]:
# Predicted Survived label (0 or 1) for each row of the held-out test split.
model_titanic.predict(X_final_test)

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0], dtype=int64)

In [35]:
# Per-row class probabilities for the test split. Judging by the predictions
# shown for the previous cell, column 0 is P(Survived=0) and column 1 is
# P(Survived=1). The rows visible in the output are all exactly 0 or 1.
model_titanic.predict_proba(X_final_test)

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.