In [384]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from pprint import pprint
from sklearn.tree import DecisionTreeClassifier

In [385]:

def entropy_func(c, n):
    return -(c*1.0/n)*math.log(c*1.0/n, 2)

def entropy_cal(c1, c2):
    #entropy between class 1 and 2
    if c1== 0 or c2 == 0:  # when there is only one class in the group, entropy is 0
        return 0
    return entropy_func(c1, c1+c2) + entropy_func(c2, c1+c2)

# One versus All
# c1,c2,c3, .., cm
# c1, *
# c2, *
# c3, *
# ...
# cm, *

#each class versus the others
def entropy_of_one_division(division): 
    s = 0
    n = len(division)
    classes = set(division)
    for c in classes:   # for each class, get entropy
        n_c = sum(division==c)
        e = n_c*1.0/n * entropy_cal(sum(division==c), sum(division!=c)) # weighted avg
        s += e
    return s, n

# The whole entropy
def get_entropy(y_predict, y_real):
    if len(y_predict) != len(y_real):
        print('They have to be the same length')
        return None
    n = len(y_real)
    s_true, n_true = entropy_of_one_division(y_real[y_predict]) # left hand side entropy
    s_false, n_false = entropy_of_one_division(y_real[~y_predict]) # right hand side entropy
    s = n_true*1.0/n * s_true + n_false*1.0/n * s_false # overall entropy, again weighted average
    return s

In [386]:
class DecisionTreeClassifier(object):
    def __init__(self, max_depth):
        self.depth = 0
        self.max_depth = max_depth
    
    def fit(self, x, y, par_node={}, depth=0):
        if par_node is None: 
            return None
        elif len(y) == 0:
            return None
        elif self.all_same(y):
            return {'val':y.iloc[0]}
        elif depth >= self.max_depth:
            return None
        else: 
            col, cutoff, entropy = self.find_best_split_of_all(x, y)    # find one split given an information gain 
            y_left = y[x.iloc[:, col] < cutoff] 
            y_right = y[x.iloc[:, col] >= cutoff] 
            par_node = {'col': x.columns[col], 'index_col':col,
                        'cutoff':cutoff,
                       'val': np.round(np.mean(y))}
            par_node['left'] = self.fit(x[x.iloc[:, col] < cutoff], y_left, {}, depth+1)
            par_node['right'] = self.fit(x[x.iloc[:, col] >= cutoff], y_right, {}, depth+1) 
            self.depth += 1 
            self.trees = par_node
            return par_node
    
    #all features versus values, get best
    def find_best_split_of_all(self, x, y):
        #print(x.shape, y.shape)
        col = None
        min_entropy = 1
        cutoff = None
        for i, c in enumerate(x.columns): #x.T es la transpuesta de x (x.T analiza las columnas)
                                          #x.columns son las columnas 
            entropy, cur_cutoff = self.find_best_split(x[c], y)
            if entropy == 0:    # find the first perfect cutoff. Stop Iterating
                return i, cur_cutoff, entropy
            elif entropy <= min_entropy:
                min_entropy = entropy
                col = i
                cutoff = cur_cutoff
        return col, cutoff, min_entropy
    
    #one feature versus values
    def find_best_split(self, col, y):
        min_entropy = 10
        n = len(y)
        for value in set(col):
            y_predict = col < value #get which ones are less than
            my_entropy = get_entropy(y_predict, y)
            if my_entropy <= min_entropy:
                min_entropy = my_entropy
                cutoff = value
        return min_entropy, cutoff
    
    def all_same(self, items):
        return all(x == items.iloc[0] for x in items) 
                                           
    def predict(self, x):
        tree = self.trees
        x = x.reset_index(drop=True)
        results = np.array([0]*x.shape[0])
        
        for i, row in x.iterrows():
            rowList = list(row) #Convertir filas en listas
            results[i] = self._get_prediction(rowList)
        return results
    
    def _get_prediction(self, row):
        cur_layer = self.trees
        while cur_layer is not None and cur_layer.get('cutoff'):
            if row[cur_layer['index_col']] < cur_layer['cutoff']:
                cur_layer = cur_layer['left']
            else:
                cur_layer = cur_layer['right']

        if cur_layer is not None:
            return cur_layer.get('val')
        else:
            return 0

In [387]:
def metrics(X_test, y_test, y_pred):
    score = float(sum(y_pred == y_test))/ float(len(y_test))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    
    metrics = [score, accuracy, precision, recall]
    
    """"print('Score: ' + str(score))
    print('Accuracy: ' + str(recall))
    print('Precision: ' + str(precision))
    print('Recall: ' + str(accuracy))"""
    
    return metrics

In [388]:
def compare(our_metrics, sklearn_metrics):
    
    res = pd.DataFrame([[our_metrics[0], sklearn_metrics[0]],
                  [our_metrics[1], sklearn_metrics[1]], 
                  [our_metrics[2], sklearn_metrics[2]],
                  [our_metrics[3], sklearn_metrics[3]]],
             ['Score','Accuracy','Precision','Recall'],    
             ['Our Implementation', 'Sklearn\'s Implementation'])
    return res

In [389]:
df = pd.read_csv("winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


In [390]:
#Feature Engineering
#Todas las caracteristicas que ofrece el set de datos se conseideraron importantes 
#para determinar si el vino es bueno o malo

features=['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']
y = df.quality
Outcome = []

for i in y:
    if(y[i] <= 5):
        Outcome.append(0)
    else:
        Outcome.append(1)
        
df['Outcome'] = Outcome
x=df[features]
y=df.Outcome

In [391]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.50, random_state=100)

In [392]:
model = DecisionTreeClassifier(max_depth=10)
tree = model.fit(X_train, y_train)

pprint(tree)

{'col': 'alcohol',
 'cutoff': 11.0,
 'index_col': 10,
 'left': {'col': 'fixed acidity',
          'cutoff': 10.1,
          'index_col': 0,
          'left': {'col': 'total sulfur dioxide',
                   'cutoff': 55,
                   'index_col': 6,
                   'left': {'col': 'chlorides',
                            'cutoff': 0.067,
                            'index_col': 4,
                            'left': {'col': 'fixed acidity',
                                     'cutoff': 7.0,
                                     'index_col': 0,
                                     'left': {'val': 0},
                                     'right': {'col': 'free sulfur dioxide',
                                               'cutoff': 13,
                                               'index_col': 5,
                                               'left': {'col': 'sulphates',
                                                        'cutoff': 0.92,
                                 

In [393]:
X_train.shape

(799, 11)

In [394]:
y_pred = model.predict(X_test)
#print(y_pred)
#print(y_test)

In [395]:
#Metricas para la clase DecisionTreeClassifier
our_metrics = metrics(X_test, y_test, y_pred)

In [415]:
#Arbol de decision usando la libreria Sklearn 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.50, random_state=100)

sklearn_metrics = []
def SkDecisionTree():
    clf = DecisionTreeClassifier(max_depth=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)

    text_representation = tree.export_text(clf)
    print(text_representation)
    
    global sklearn_metrics
    sklearn_metrics = metrics(X_test, y_test, y_pred)
    print(sklearn_metrics)
    

In [416]:
SkDecisionTree()

|--- feature_10 <= 11.55
|   |--- feature_1 <= 0.38
|   |   |--- feature_10 <= 10.75
|   |   |   |--- feature_0 <= 14.65
|   |   |   |   |--- feature_6 <= 13.50
|   |   |   |   |   |--- feature_2 <= 0.48
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- feature_2 >  0.48
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_6 >  13.50
|   |   |   |   |   |--- feature_8 <= 3.00
|   |   |   |   |   |   |--- feature_4 <= 0.07
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- feature_4 >  0.07
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- feature_8 >  3.00
|   |   |   |   |   |   |--- feature_9 <= 1.00
|   |   |   |   |   |   |   |--- feature_1 <= 0.38
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- feature_1 >  0.38
|   |   |   |   |   |   |   |   |--- feature_4 <= 0.07
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- feature_4 >  0.07
|   |   |   |   | 

In [417]:
print(sklearn_metrics)
compare(our_metrics, sklearn_metrics)

[0.85125, 0.85125, 0.8564444368507381, 0.85125]


Unnamed: 0,Our Implementation,Sklearn's Implementation
Score,0.8575,0.85125
Accuracy,0.8575,0.85125
Precision,0.866448,0.856444
Recall,0.8575,0.85125
