# Decison Trees

First we'll load some fake data on past hires I made up. Note how we use pandas to convert a csv file into a DataFrame:

In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
import re

input_file = "C:/datacamp/train.csv"
df = pd.read_csv(input_file, header = 0)

dictionary = {'A':1, 'B': 2, 'C':3, 'D':4, 'E':5, 'F':6, 'G': 7, 'T':8}

def num_ticket(string):
    for k,v in dictionary.items():
        if k in string:
            return v

def master_age(string):
    if "Master" in string:
        return 1

def getTicketPrefix(ticket):
    match = re.compile("([a-zA-Z./]+)").search(ticket)
    if match:
        return match.group()
    else:
        return 'U'

In [2]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df['TicketPrefix'] = df['Ticket'].map( lambda x : getTicketPrefix(x.upper()))
df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('[.?/?]', '', x) )
df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('STON', 'SOTON', x) )

deduped_ticket_prefix = list(set(df['TicketPrefix']))
ticket_prefix_id = [i for i in range(len(deduped_ticket_prefix))]
ticket_prefix_mapped_to_id = dict(zip(deduped_ticket_prefix,ticket_prefix_id))
df['ticket_prefix_id'] = df['TicketPrefix'].map(ticket_prefix_mapped_to_id)
df = df.drop('TicketPrefix',1)

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_prefix_id
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,14
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,26
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,15
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,13
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,13


In [5]:
family_name, given_name = df['Name'].str.split(',', 1).str
df['family_name'] = family_name
deduped_fam_name = list(set(family_name))
fam_id = [i for i in range(len(deduped_fam_name))]
fam_name_mapped_to_id = dict(zip(deduped_fam_name,fam_id))
gender_num = {'male': 1, 'female': 0}
df['Embarked'] = df['Embarked'].fillna('S')
embarked_num = {'C': 1, 'Q':2,'S':3}
df['family_name'] = df['family_name'].map(fam_name_mapped_to_id)
df['Sex'] = df['Sex'].map(gender_num)
df['Embarked'] = df['Embarked'].map(embarked_num)
df['Cabin'] = df['Cabin'].astype(str)
df['Cabin'] = df['Cabin'].apply(num_ticket)
df['Cabin'] = df['Cabin'].fillna('9')
deduped_ticket_num = list(set(df['Ticket']))
ticket_group_id = [i for i in range(len(deduped_ticket_num))]
ticket_mapped_to_group_id = dict(zip(deduped_ticket_num,ticket_group_id))
df['ticket_group_id'] = df['Ticket'].map(ticket_mapped_to_group_id)
df = df.drop('Ticket',1)
df['Age'] = np.where(df['Name'].str.contains('Master'), 8, df['Age'])
df = df.drop('Name',1)
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Fare'] = df['Fare'].fillna(np.floor(df['Fare']).median())

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,ticket_prefix_id,family_name,ticket_group_id
0,1,0,3,1,22.0,1,0,7.25,9,3,14,112,547
1,2,1,1,0,38.0,1,0,71.2833,3,1,26,124,220
2,3,1,3,0,26.0,0,0,7.925,9,3,15,636,354
3,4,1,1,0,35.0,1,0,53.1,3,3,13,555,246
4,5,0,3,1,35.0,0,0,8.05,9,3,13,181,567


In [7]:
df.to_csv('prepped_train_data.csv')

scikit-learn needs everything to be numerical for decision trees to work. So, we'll map Y,N to 1,0 and levels of education to some scale of 0-2. In the real world, you'd need to think about how to deal with unexpected or missing data! By using map(), we know we'll get NaN for unexpected values.

Next we need to separate the features from the target column that we're trying to bulid a decision tree for.

In [8]:
features = list(df.columns[2:13])
features

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Cabin',
 'Embarked',
 'ticket_prefix_id',
 'family_name',
 'ticket_group_id']

Now actually construct the decision tree:

In [9]:
y = df["Survived"]
X = df[features]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X,y)

... and display it. Note you need to have pydotplus installed for this to work. (!pip install pydotplus)

To read this decision tree, each condition branches left for "true" and right for "false". When you end up at a value, the value array represents how many samples exist in each target value. So value = [0. 5.] mean there are 0 "no hires" and 5 "hires" by the tim we get to that point. value = [3. 0.] means 3 no-hires and 0 hires.

In [10]:
from IPython.display import Image  
from sklearn.externals.six import StringIO  
import pydot 

dot_data = StringIO()  
tree.export_graphviz(clf, out_file=dot_data,  
                         feature_names=features)  
graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())  

ModuleNotFoundError: No module named 'pydot'

## Ensemble learning: using a random forest

We'll use a random forest of 10 decision trees to predict employment of specific candidate profiles:

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50)
clf = clf.fit(X, y)

input_file = "C:/datacamp/test.csv"
df_test = pd.read_csv(input_file, header = 0)

df_test.head()

#Predict employment of an employed 10-year veteran
#print (clf.predict([[10, 1, 4, 0, 0, 0]]))
#...and an unemployed 10-year veteran
#print (clf.predict([[10, 0, 4, 0, 0, 0]]))

In [None]:
df_test['TicketPrefix'] = df_test['Ticket'].map( lambda x : getTicketPrefix(x.upper()))
df_test['TicketPrefix'] = df_test['TicketPrefix'].map( lambda x: re.sub('[.?/?]', '', x) )
df_test['TicketPrefix'] = df_test['TicketPrefix'].map( lambda x: re.sub('STON', 'SOTON', x) )

deduped_ticket_prefix = list(set(df_test['TicketPrefix']))
ticket_prefix_id = [i for i in range(len(deduped_ticket_prefix))]
ticket_prefix_mapped_to_id = dict(zip(deduped_ticket_prefix,ticket_prefix_id))
df_test['ticket_prefix_id'] = df_test['TicketPrefix'].map(ticket_prefix_mapped_to_id)
df_test = df_test.drop('TicketPrefix',1)

In [None]:
df_test.head()

In [None]:
family_name, given_name = df_test['Name'].str.split(',', 1).str
df_test['family_name'] = family_name
deduped_fam_name = list(set(family_name))
fam_id = [i for i in range(len(deduped_fam_name))]
fam_name_mapped_to_id = dict(zip(deduped_fam_name,fam_id))
gender_num = {'male': 1, 'female': 0}
embarked_num = {'C': 1, 'Q':2,'S':3}
df_test['family_name'] = df_test['family_name'].map(fam_name_mapped_to_id)
df_test['Sex'] = df_test['Sex'].map(gender_num)
df_test['Embarked'] = df_test['Embarked'].map(embarked_num)
df_test['Cabin'] = df_test['Cabin'].astype(str)
df_test['Cabin'] = df_test['Cabin'].apply(num_ticket)
df_test['Cabin'] = df_test['Cabin'].fillna('9')
deduped_ticket_num = list(set(df_test['Ticket']))
ticket_group_id = [i for i in range(len(deduped_ticket_num))]
ticket_mapped_to_group_id = dict(zip(deduped_ticket_num,ticket_group_id))
df_test['ticket_group_id'] = df_test['Ticket'].map(ticket_mapped_to_group_id)
df_test = df_test.drop('Ticket',1)
df_test['Age'] = np.where(df_test['Name'].str.contains('Master'), 8, df_test['Age'])
df_test = df_test.drop('Name',1)
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())
df_test['Fare'] = df_test['Fare'].fillna(np.floor(df['Fare']).median())

In [None]:
df_test.head()

In [None]:
test_features = list(df_test.columns[1:12])
df_test_input = df_test[test_features]
df_test_input.to_csv('test_input.csv')
df_test_input.head()

## Activity

Modify the test data to create an alternate universe where everyone I hire everyone I normally wouldn't have, and vice versa. Compare the resulting decision tree to the one from the original data.

In [None]:
df_test['Survived'] = clf.predict(df_test_input)

In [None]:
df_test.head(10)

In [None]:
df_test.to_csv('results.csv')