## Decision Tree

* Decision tree is a tree structure algorithm.
* It is also known as CART(Classification and regression tree).

* Uses Decisions( Rules ) to classify data.
* Uses ***Entropy Value*** to take decisions

In [40]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [41]:
iris = load_iris()
iris_df = pd.DataFrame(iris.data,columns = iris.feature_names)
iris_df['Target'] = iris.target

iris_x = iris_df.iloc[:,:-1]
iris_y = iris_df['Target']

In [42]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

dtc = DecisionTreeClassifier()

In [43]:
x_train,x_test,y_train,y_test = train_test_split(iris_x,iris_y,train_size = 0.7,random_state=42)



In [44]:
dtc.fit(x_train,y_train)
y_test_prediction = dtc.predict(x_test)
print("Score = ",dtc.score(x_train,y_train))
print("Accuracy Score = ",accuracy_score(y_test,y_test_prediction))

Score =  1.0
Accuracy Score =  1.0


In [45]:
from sklearn.metrics import classification_report

print("Classification Report :\n\n",classification_report(y_test,y_test_prediction))

Classification Report :

               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

   micro avg       1.00      1.00      1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [46]:
from sklearn.metrics import confusion_matrix

print("Confusion Matrix :\n\n",confusion_matrix(y_test,y_test_prediction))

Confusion Matrix :

 [[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]


In [63]:
import matplotlib.pyplot as plt
from sklearn import tree

# tree.plot_tree(dtc)

## Decision tree regression

#### Diabetes Dataset

In [75]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
diabetes_df = pd.DataFrame(diabetes.data,columns = diabetes.feature_names)
diabetes_df['Target'] = diabetes.target

diabetes_x = diabetes_df.iloc[:,:-1]
diabetes_y = diabetes_df['Target']

### Splitting Data into Training and Testing sets

In [76]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(diabetes_x,diabetes_y,test_size=0.3,random_state=42)

### Fitting Decision Tree Model

In [77]:
from sklearn.tree import DecisionTreeRegressor

dtc = DecisionTreeRegressor()
dtc.fit(x_train,y_train)
print("Accuracy Score = ",accuracy_score(y_test,dtc.predict(x_test))*100)
print("Score = ",dtc.score(x_train,y_train)*100)

Accuracy Score =  0.0
Score =  100.0


In [89]:
print(dtc.predict(x_test))

[198. 221. 196. 310. 139. 230. 237. 246. 124. 128. 158. 152.  85. 277.
  88. 210. 243. 279. 208. 150. 147. 183.  50. 150.  85. 131. 288.  91.
  59.  89. 128.  50. 172. 292. 192. 180.  93.  89. 185.  96.  39. 170.
 142.  74. 200.  75.  31.  65.  39. 161. 142.  72. 142. 179. 229. 142.
  71. 288.  53.  63. 131. 265. 142. 148.  96. 265. 248. 170. 150. 124.
 161. 198. 292. 200.  96.  66. 150. 128. 178. 248.  49.  88.  53.  81.
  77.  52. 143.  55. 145. 143.  53. 296.  43.  63.  96. 202. 242.  78.
 144.  75. 202. 178. 341. 142. 196. 150. 146. 146. 225. 265.  97. 212.
 209. 268. 144. 124.  83. 200. 143. 128. 235. 183.  78. 235.  47. 198.
 225. 199. 129. 185. 230.  65. 208.]


## Actual vs Predicted Values

In [87]:
act_pred = pd.DataFrame({'Actual':y_test,'Predicted':dtc.predict(x_test)})
act_pred.sample(3)

Unnamed: 0,Actual,Predicted
78,252.0,152.0
227,108.0,170.0
433,72.0,81.0


### Evaluating Model

In [86]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

mse = mean_squared_error(y_test,dtc.predict(x_test))
mae = mean_absolute_error(y_test,dtc.predict(x_test))
rms = np.sqrt(mse)
print(mse,"\n",mae,"\n",rms)

5611.691729323308 
 58.6390977443609 
 74.91122565626135


### Label Encode

In [92]:
d = {'name':['A','B','C','D','E'],'gender':['F','M','F','M','F']}
df = pd.DataFrame(d)

In [96]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df

Unnamed: 0,name,gender
0,A,0
1,B,1
2,C,0
3,D,1
4,E,0


In [115]:
from sklearn.datasets import load_iris
iris  = load_iris()
df = pd.DataFrame(iris.data,columns = iris.feature_names)
df['Target'] = iris.target
df['Names'] = df['Target'].replace([0,1,2],iris.target_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Target,Names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
