# Decision Tree

In [1]:
import pandas as pd
df = pd.read_csv('salaries.csv')
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [2]:
inputs = df.drop('salary_more_then_100k', axis='columns')
inputs.head()

Unnamed: 0,company,job,degree
0,google,sales executive,bachelors
1,google,sales executive,masters
2,google,business manager,bachelors
3,google,business manager,masters
4,google,computer programmer,bachelors


In [3]:
target = df.salary_more_then_100k
target

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     0
8     0
9     1
10    1
11    1
12    1
13    1
14    1
15    1
Name: salary_more_then_100k, dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
le_company = LabelEncoder()
le_job = LabelEncoder()
le_degree = LabelEncoder()

In [6]:
inputs['company_n'] = le_company.fit_transform(df['company'])
inputs['job_n'] = le_job.fit_transform(df['job'])
inputs['degree_n'] = le_degree.fit_transform(df['degree'])
inputs.head()

Unnamed: 0,company,job,degree,company_n,job_n,degree_n
0,google,sales executive,bachelors,2,2,0
1,google,sales executive,masters,2,2,1
2,google,business manager,bachelors,2,0,0
3,google,business manager,masters,2,0,1
4,google,computer programmer,bachelors,2,1,0


In [7]:
inputs_n = inputs.drop(['company', 'job', 'degree'], axis='columns')
inputs_n

Unnamed: 0,company_n,job_n,degree_n
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0
5,2,1,1
6,0,2,1
7,0,1,0
8,0,0,0
9,0,0,1


#### Training the Classifier

In [8]:
from sklearn import tree

In [9]:
model = tree.DecisionTreeClassifier()
model.fit(inputs_n, target)

In [10]:
model.score(inputs_n, target)

1.0

In [11]:
model.predict([[2,1,0]])



array([0], dtype=int64)

## Now using the another dataset i.e. titanic dataset

In this file using following columns build a model to predict if person would survive or not,
- Pclass
- Sex
- Age
- Fare

Calculate score of the model

In [12]:
import pandas as pd
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
data.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'], axis='columns', inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [14]:
inputs = data.drop('Survived', axis='columns')
target = data.Survived
inputs.head()
# target

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [15]:
# Data Preprocessing
# inputs.Sex = inputs.Sex.map({'male': 'm', 'female':'f'})

In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
inputs.Sex = le.fit_transform(inputs['Sex'])
inputs.Sex.head()

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int32

In [17]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,1,22.0,7.25
1,1,0,38.0,71.2833
2,3,0,26.0,7.925
3,1,0,35.0,53.1
4,3,1,35.0,8.05


In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2) 

In [19]:
x_train.head()
len(x_train)

712

In [20]:
x_test.head()
len(x_test)

179

In [21]:
from sklearn import tree
titanic_model = tree.DecisionTreeClassifier()

In [22]:
titanic_model.fit(x_train, y_train)

In [23]:
# Accuracy of the model
titanic_model.score(x_test, y_test)

0.7653631284916201

In [24]:
titanic_model.predict(x_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0], dtype=int64)

In [25]:
# This shows the accuracy of the model
y_test

833    0
662    0
529    0
330    1
464    0
      ..
585    1
647    1
615    1
104    0
468    0
Name: Survived, Length: 179, dtype: int64