In [2]:
import pandas as pd
from sklearn import tree
from treeviz import tree_print
from sklearn.metrics import accuracy_score

In [3]:
# Read the leaf-temperature measurements into a DataFrame and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="leaftemp.all.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,glasshouse,CO2level,day,light,CO2,tempDiff,BtempDiff,airTemp,vapPress
0,1,A,high,0,1143.06,127.112222,1.835037,1.496457,26.047167,2.563877
1,2,B,low,0,1143.06,39.854833,1.542604,1.355216,24.10275,1.878973
2,3,C,medium,0,1143.06,70.938111,1.962119,1.94278,26.711556,2.37594
3,4,A,high,1,465.810556,128.404444,0.928792,0.849522,25.135167,2.546759
4,5,B,low,1,465.810556,40.996556,0.680829,0.59922,24.899306,2.198131


In [7]:
# Feature matrix: everything except the two categorical columns
# (the glasshouse id and the CO2level target).
# NOTE(review): the 'Unnamed: 0' column visible in the preview looks like a
# row number carried over from the CSV and stays among the features --
# confirm whether it should be dropped as well.
features_df = df.drop(columns=['glasshouse', 'CO2level'])
features_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,day,light,CO2,tempDiff,BtempDiff,airTemp,vapPress
0,1,0,1143.06,127.112222,1.835037,1.496457,26.047167,2.563877
1,2,0,1143.06,39.854833,1.542604,1.355216,24.10275,1.878973
2,3,0,1143.06,70.938111,1.962119,1.94278,26.711556,2.37594
3,4,1,465.810556,128.404444,0.928792,0.849522,25.135167,2.546759
4,5,1,465.810556,40.996556,0.680829,0.59922,24.899306,2.198131


In [8]:
# Target labels as a one-column DataFrame (column name 'CO2level',
# same row index as df).
target_df = df['CO2level'].to_frame()
target_df.head(n=5)

Unnamed: 0,CO2level
0,high
1,low
2,medium
3,high
4,low


Setting up the model.

In [11]:
# Decision tree classifier with no depth limit; split quality is
# measured with entropy.
dtree = tree.DecisionTreeClassifier(
    criterion='entropy',
)

Build the model:

In [12]:
# Train on the full data set; the fitted estimator is the cell's output.
dtree.fit(X=features_df, y=target_df)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Show the actual model. Decision trees are transparent models so we can just look at them:

In [13]:
# Print the fitted tree as nested if/then/else rules; features_df is passed
# alongside the model (presumably for the column names used in the rules).
tree_print(dtree, features_df)

if CO2 =< 90.73361206054688: 
  |then if CO2 =< 52.25672149658203: 
  |  |then low
  |  |else medium
  |else high
<---->
Tree Depth:  2


A model is correct on the training data when its predicted labels equal the original training labels.

In [19]:
# Label every training row with the fitted tree, then wrap the labels in a
# one-column DataFrame named like the target so the two frames can be
# compared directly.
predict_array = dtree.predict(features_df)
predicted_labels = pd.DataFrame(predict_array, columns=['CO2level'])

In [20]:
# Preview the first five predicted labels.
predicted_labels.head(n=5)

Unnamed: 0,CO2level
0,high
1,low
2,medium
3,high
4,low


In [21]:
# Preview the first five true labels for a side-by-side look.
target_df.head(n=5)

Unnamed: 0,CO2level
0,high
1,low
2,medium
3,high
4,low


In [22]:
# True only if the predicted frame matches the target frame exactly.
predicted_labels.equals(other=target_df)

True

Model Accuracy

In [23]:
# accuracy_score is already imported in the notebook's import cell at the
# top, so it is not re-imported here.
# Report the fraction of training rows whose predicted label equals the
# true label.
print("Our model accuracy is: {}".format(accuracy_score(target_df, predicted_labels)))

Our model accuracy is: 1.0


Another model, where we restrict the complexity by limiting the tree to a maximum depth of 1:

In [27]:
# Same entropy-based classifier, but capped at a single level of splits.
dtree2 = tree.DecisionTreeClassifier(max_depth=1, criterion='entropy')
# Train on the same features and labels; the fitted estimator is the output.
dtree2.fit(X=features_df, y=target_df)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [28]:
# Print the depth-limited tree as if/then/else rules.
tree_print(dtree2, features_df)

if CO2 =< 90.73361206054688: 
  |then medium
  |else high
<->
Tree Depth:  1


In [29]:
# Predict with the depth-limited tree on the training features and wrap the
# result in a DataFrame whose single column carries the target's name.
predict_array2 = dtree2.predict(features_df)      # produces an array of labels
predicted_labels2 = pd.DataFrame(predict_array2)  # turn it into a DF
predicted_labels2.columns = ['CO2level']          # name the column - same name as in target!

# Fraction of training rows the restricted model labels correctly.
print("Our model accuracy is: {}".format(accuracy_score(target_df, predicted_labels2)))

Our model accuracy is: 0.6774193548387096


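Observation: by restricting the complexity of the model we often obtain very readable and understandable models without sacrificing a lot of accuracy! In this particular example, however, the cost is substantial: the depth-1 tree can only predict two of the three CO2 levels (medium and high), so the training accuracy drops from 1.0 to about 0.68.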