In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from ensemble_learning import DecisionTree, entropy, accuracy, compute_gain

In [3]:
def compute_alpha(y_true, y_pred, weight):
    """Compute the vote weight (alpha) of one weak learner.

    The weighted error ``epsilon`` is the summed weight of the samples where
    ``y_pred`` differs from ``y_true``; the returned value is
    ``0.5 * ln((1 - epsilon) / epsilon)``.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    weight : array-like
        Per-sample weights, same length as ``y_true``. They are expected to
        sum to 1 so that ``epsilon`` lies in [0, 1].

    Returns
    -------
    float
        Positive when ``epsilon < 0.5``, about zero at ``epsilon == 0.5``
        and negative above it. Always finite.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    weight = np.asarray(weight, dtype=float)

    # weighted error: total weight sitting on the misclassified samples
    epsilon = weight[y_true != y_pred].sum()

    # A learner that is perfectly right (epsilon == 0) or perfectly wrong
    # (epsilon == 1) would otherwise give +/-inf, which in turn makes the
    # weight normaliser zero and the next weights NaN. Clamp just inside (0, 1).
    tiny = np.finfo(float).eps
    epsilon = np.clip(epsilon, tiny, 1.0 - tiny)

    return 0.5 * np.log((1.0 - epsilon) / epsilon)

def compute_next_weight(y_true, y_pred, weight, alpha):
    """Re-weight the samples for the next boosting round.

    Each weight is multiplied by ``exp(-alpha * y_true * y_pred)`` and the
    result is normalised to sum to 1. With labels in {-1, +1} the product
    ``y_true * y_pred`` is +1 for a correct prediction and -1 for a wrong
    one, so (for positive ``alpha``) misclassified samples gain weight.

    Parameters
    ----------
    y_true : array-like
        True labels (expected to be -1 / +1).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    weight : array-like
        Current per-sample weights; not modified.
    alpha : float
        Vote weight of the learner that produced ``y_pred``.

    Returns
    -------
    numpy.ndarray
        New float weights summing to 1.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    weight = np.asarray(weight, dtype=float)

    # builtin ``float`` replaces the ``np.float`` alias, which was removed
    # from NumPy and raises AttributeError on current releases
    agreement = (y_true * y_pred).astype(float)

    next_weight = np.exp(agreement * -alpha) * weight

    # normalise so the weights form a distribution again
    return next_weight / next_weight.sum()

In [4]:
class Adaboost(object):
    """Boosted ensemble of depth-1 decision trees (stumps).

    Each round fits a ``DecisionTree(max_depth=1)`` on the current sample
    weights, computes its vote weight with ``compute_alpha`` and re-weights
    the samples with ``compute_next_weight``. Labels are expected to be
    -1 / +1, since the final prediction is taken from the sign of the
    weighted vote.
    """

    def __init__(self, function=entropy):
        """Store the impurity function.

        NOTE(review): ``function`` is only stored; nothing in this class
        passes it on to the stumps -- confirm whether that is intended.
        """
        self._function = function

    def fit(self, X, y, T=10):
        """Fit ``T`` stumps on ``X`` / ``y``.

        Parameters
        ----------
        X : numpy.ndarray
            Feature matrix; copied into ``self._X``.
        y : numpy.ndarray
            Label vector; copied into ``self._y``.
        T : int
            Number of boosting rounds.
        """
        # copy in variables
        self._X = X.copy()
        self._y = y.copy()
        self._T = T

        # initialize weights uniformly (explicit float array)
        n_samples = len(y)
        weight = np.full(n_samples, 1.0 / n_samples)

        # per-round results: fitted stump, its vote weight, and the key of
        # its root node (kept for troubleshooting)
        self._stump = []
        self._alpha = []
        self._key = []

        for _ in range(self._T):
            # find the decision stump for this round
            stump = DecisionTree(max_depth=1)
            stump.fit(self._X, self._y, weight=weight)
            self._stump.append(stump)

            # predictions of the stump on the training data
            y_pred = stump.predict(self._X)

            # vote weight of this stump
            alpha = compute_alpha(self._y, y_pred, weight)
            self._alpha.append(alpha)

            # update weights for the next stump
            weight = compute_next_weight(self._y, y_pred, weight, alpha)

            # append key for troubleshooting
            self._key.append(stump._tree.key)

    def predict(self, X, binary=True):
        """Predict with the alpha-weighted vote of all fitted stumps.

        Parameters
        ----------
        X : numpy.ndarray
            Feature matrix; copied into ``self._Xpred``.
        binary : bool
            If True, return integer labels (-1 where the vote is negative,
            +1 otherwise). If False, return the raw weighted vote.

        Returns
        -------
        numpy.ndarray
            One value per row of ``X``.
        """
        self._Xpred = X.copy()

        # running total of the weighted votes, one entry per row
        y_pred = np.zeros(self._Xpred.shape[0])

        for stump, alpha in zip(self._stump, self._alpha):
            # builtin ``float`` replaces the ``np.float`` alias, which was
            # removed from NumPy
            y_pred += stump.predict(self._Xpred).astype(float) * alpha

        # convert to binary output based on the sign (a zero vote maps to +1)
        if binary:
            y_pred = np.where(y_pred < 0, -1, 1).astype(int)

        return y_pred
        

In [5]:
# read data (the file has no header row, so column names are supplied here)
cols = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
    'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'y',
]
df = pd.read_csv('../data/bank/train.csv', names=cols)

# process numeric input: replace each numeric column by "value > column median".
# numeric_only=True is required because the frame also holds string columns;
# without it recent pandas raises TypeError instead of skipping them.
medians = df.median(numeric_only=True)

for col in medians.index:
    df[col] = df[col] > medians[col]

# process labels: 'yes' -> 1, 'no' -> -1
label_dict = {'yes': 1, 'no': -1}
df['y'] = df['y'].map(label_dict)

In [6]:
# preview the first rows after binarizing the numeric columns and mapping the labels
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,True,services,married,secondary,no,False,yes,no,unknown,False,may,False,False,False,False,unknown,-1
1,True,blue-collar,single,secondary,no,False,yes,yes,cellular,False,feb,True,False,False,False,unknown,-1
2,True,technician,married,secondary,no,True,no,yes,cellular,True,aug,True,False,True,True,success,1
3,True,admin.,married,tertiary,no,False,yes,no,cellular,False,jul,True,False,False,False,unknown,-1
4,False,management,single,tertiary,no,True,no,no,cellular,False,apr,False,False,False,False,unknown,1


In [7]:
# feature matrix: every column except the label, as a NumPy array
X = df.drop(['y'], axis=1).values
# label vector (values were mapped to 1 / -1 in the preprocessing cell)
y = df['y'].values

In [8]:
# ensemble with the default constructor arguments
model = Adaboost()

In [9]:
# run 5 boosting rounds
model.fit(X, y, T=5)

# evaluated on the same data the model was fitted on, so this is a training score
accuracy(model.predict(X), y)

0.8808

In [10]:
# root-node key recorded for each round's stump (appended in Adaboost.fit)
model._key

[11, 11, 11, 11, 11]

In [11]:
# vote weight computed for each round's stump (appended in Adaboost.fit)
model._alpha

[1.0000139153886103,
 2.2204460492503126e-16,
 -4.440892098500628e-16,
 5.55111512312578e-16,
 -4.440892098500628e-16]

In [23]:
# uniform sample weights: every entry is 1 / len(y)
w = np.ones_like(y) / len(y)

# NOTE(review): presumably one gain value per feature column -- confirm in ensemble_learning
gain = compute_gain(X, y, w)
# position of the largest gain
idx = gain.argmax()
print(idx, gain[idx])
gain

11 0.05960230138776085


array([0.00076046, 0.01331066, 0.00452142, 0.00662615, 0.00072379,
       0.00371893, 0.01395331, 0.00485239, 0.01811303, 0.00080701,
       0.03646958, 0.0596023 , 0.00409171, 0.01369313, 0.01369313,
       0.03964955])

In [14]:
# fit one depth-1 tree by hand with the uniform weights, mirroring the first
# iteration of Adaboost.fit
# NOTE(review): the weights are passed positionally here but as weight= in
# Adaboost.fit -- assumes the third positional parameter is the weight
st = DecisionTree(max_depth=1)
st.fit(X, y, w)

# predictions and score on the data the tree was fitted on
y_pred = st.predict(X)
accuracy(y_pred, y)

0.8808

In [15]:
# vote weight of the hand-fitted stump under the uniform weights
alpha = compute_alpha(y, y_pred, w)
alpha

1.0000139153886103

In [16]:
# sample weights for the second round, derived from the uniform weights
w2 = compute_next_weight(y, y_pred, w, alpha)

In [17]:
# distinct weight values present after the update
np.unique(w2)

array([0.00011353, 0.00083893])

In [18]:
# same gain computation as before, now with the updated weights w2
gain = compute_gain(X, y, w2)
# position of the largest gain
idx = gain.argmax()
print(idx, gain[idx])
gain

11 0.1509723081407396


array([0.00181219, 0.03144655, 0.0105943 , 0.01558737, 0.00189934,
       0.00888707, 0.03332823, 0.01235735, 0.04681782, 0.00192495,
       0.07108786, 0.15097231, 0.00998718, 0.03026814, 0.03026814,
       0.06791394])

In [19]:
# fit a fresh depth-1 tree with the updated weights w2, mirroring the second
# iteration of Adaboost.fit (this rebinds st and y_pred from the earlier cell)
st = DecisionTree(max_depth=1)
st.fit(X, y, w2)

# predictions and score on the data the tree was fitted on
y_pred = st.predict(X)
accuracy(y_pred, y)

0.8808

In [20]:
# vote weight of the second stump, measured under the weights w2
alpha = compute_alpha(y, y_pred, w2)
alpha

2.2204460492503126e-16

In [21]:
# NOTE: w2 is overwritten from itself, so re-running this cell applies the
# weight update again instead of reproducing the same result
w2 = compute_next_weight(y, y_pred, w2, alpha)

In [22]:
# distinct weight values after the second update
np.unique(w2)

array([0.00011353, 0.00083893])