# Using ID3 

In [48]:
# import the panda lib
import pandas as pd;

In [49]:
# read the data from files (training data)
df = pd.read_csv("../data/train.csv")

In [50]:
# check the col name
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
#check if null values are there
df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

## Age, Cabin, Embarked are having null values

In [52]:
#Fill null values as mean for age
df.Age = df.Age.fillna(df.Age.mean())

In [53]:
#Fill cabin with 1 if not null, otherwise 0
df.Cabin[df.Cabin.notnull()] = 1  #non null are 1 and nulls are 0
df.Cabin = df.Cabin.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [54]:
#Divide age into categories
#Grouping the data
bins = [0, 18, 25, 60, 100]
group_names = [1, 2, 3, 4]  #can take string but then we will again convert it into int 

categories = pd.cut(df['Age'], bins, labels=group_names)
df['categories'] = categories
df['AgeBinned'] = pd.cut(df['Age'], bins)  #having the range

In [55]:
#Convert all categories into integer
## Create map
Embarked_map = {'C': 1, 'S':2, 'Q':3}
Sex_map = {'male': 1, 'female':2, 'other':3}

In [56]:
##COnvert
df['Embarked'] = df['Embarked'].map(Embarked_map)
df['Sex'] = df['Sex'].map(Sex_map)

In [64]:
#df.head()
# Fill embark NaN values
df.Embarked = df.Embarked.fillna(df.Embarked.mean())

In [88]:
##Choose the columns to create the model
feature_col_names = ['PassengerId', 'Pclass', 'Sex', 'categories', 'SibSp', 'Parch', 'Embarked']
output_col_name = 'Survived'
X = df[feature_col_names].values
Y = df[output_col_name].values

In [89]:
##Divide the training data 
from sklearn.cross_validation import train_test_split
split_test_size = 0.32

[train_in, test_in, train_out, test_out]= train_test_split(X, Y, test_size=split_test_size, random_state=62)


In [101]:
#Import the ID3
from sklearn.tree import DecisionTreeClassifier

Dtree = DecisionTreeClassifier(min_samples_split=32, random_state=62)

In [102]:
Dtree.fit(train_in, train_out)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=32,
            min_weight_fraction_leaf=0.0, presort=False, random_state=62,
            splitter='best')

In [103]:
# predict from divided testing set
result = Dtree.predict(test_in)

In [104]:
# check the accuracy
from sklearn import metrics

#Check accuracy
print("Accuracy: {0:4f}".format(metrics.accuracy_score(test_out, result)))

Accuracy: 0.825175


In [94]:
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(test_out, result)))

print("Classficiation report")
print(metrics.classification_report(test_out, result))

Confusion Matrix
[[168  16]
 [ 34  68]]
Classficiation report
             precision    recall  f1-score   support

          0       0.83      0.91      0.87       184
          1       0.81      0.67      0.73       102

avg / total       0.82      0.83      0.82       286



In [105]:
## export graphviz for graph
from sklearn.tree import export_graphviz
export_graphviz(Dtree, out_file="tree.dot", feature_names=feature_col_names)

In [96]:
command = ["dot", "-Tpng", "tree.dot", "-o", "tree.png"]

In [87]:
##subprocess.check_call(command) run in command line

NameError: name 'subprocess' is not defined