In [1]:
import pandas as pd
import numpy as np
profit   = [23, 31, 54, 43,22,12,25]
research = [0,61,21, 65, 43,12,11,54]
travel   = [65,25,53,5,8,12,90,76]
revenue = [1,1,1,-1,1,-1,1]
df = pd.DataFrame(list(zip(profit, research, travel, revenue)),
              columns=['profit', 'research', 'travel', 'revenue'])
df

Unnamed: 0,profit,research,travel,revenue
0,23,0,65,1
1,31,61,25,1
2,54,21,53,1
3,43,65,5,-1
4,22,43,8,1
5,12,12,12,-1
6,25,11,90,1


In [2]:
df['weights1'] = 1/len(df)
df

#X_train and Y_train split
X_train = df.iloc[0:len(df),0:3]
y_train = df.iloc[0:len(df),3]
y_train
df

Unnamed: 0,profit,research,travel,revenue,weights1
0,23,0,65,1,0.142857
1,31,61,25,1,0.142857
2,54,21,53,1,0.142857
3,43,65,5,-1,0.142857
4,22,43,8,1,0.142857
5,12,12,12,-1,0.142857
6,25,11,90,1,0.142857


In [3]:
from sklearn.tree import DecisionTreeClassifier
#fitting the DT model with depth one
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=1)
clf = clf_gini.fit(X_train, y_train)
#prediction
y_pred = clf_gini.predict(df.iloc[0:len(df),0:3])
y_pred
#adding a column pred1 after the first round of boosting
df['pred1'] = y_pred

In [4]:
#misclassified = 0 if the label and prediction are same
df.loc[df.revenue != df.pred1, 'misclassified1'] = 1
df.loc[df.revenue == df.pred1, 'misclassified1'] = 0
df

Unnamed: 0,profit,research,travel,revenue,weights1,pred1,misclassified1
0,23,0,65,1,0.142857,1,0.0
1,31,61,25,1,0.142857,1,0.0
2,54,21,53,1,0.142857,1,0.0
3,43,65,5,-1,0.142857,-1,0.0
4,22,43,8,1,0.142857,-1,1.0
5,12,12,12,-1,0.142857,-1,0.0
6,25,11,90,1,0.142857,1,0.0


In [5]:
from math import log,exp
#error calculation
e1 = sum(df['misclassified1'] * df['weights1'])
#calculation of alpha (performance)
alpha1 = 0.5*log((1-e1)/e1)

#update weight
new_weight = df['weights1']*np.exp(-1*alpha1*df['revenue']*df['pred1'])
#normalized weight
z = sum(new_weight)
normalized_weight = new_weight/sum(new_weight)
df['weights2'] = round(normalized_weight,4)
df

Unnamed: 0,profit,research,travel,revenue,weights1,pred1,misclassified1,weights2
0,23,0,65,1,0.142857,1,0.0,0.0833
1,31,61,25,1,0.142857,1,0.0,0.0833
2,54,21,53,1,0.142857,1,0.0,0.0833
3,43,65,5,-1,0.142857,-1,0.0,0.0833
4,22,43,8,1,0.142857,-1,1.0,0.5
5,12,12,12,-1,0.142857,-1,0.0,0.0833
6,25,11,90,1,0.142857,1,0.0,0.0833


In [6]:
from sklearn.tree import DecisionTreeClassifier
#fitting the DT model with depth one
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=1)
clf = clf_gini.fit(X_train, y_train)
#prediction
y_pred = clf_gini.predict(df.iloc[0:len(df),0:3])
y_pred
#adding a column pred1 after the first round of boosting
df['pred2'] = y_pred

In [7]:
#misclassified = 0 if the label and prediction are same
df.loc[df.revenue != df.pred2, 'misclassified2'] = 1
df.loc[df.revenue == df.pred2, 'misclassified2'] = 0
df

Unnamed: 0,profit,research,travel,revenue,weights1,pred1,misclassified1,weights2,pred2,misclassified2
0,23,0,65,1,0.142857,1,0.0,0.0833,1,0.0
1,31,61,25,1,0.142857,1,0.0,0.0833,1,0.0
2,54,21,53,1,0.142857,1,0.0,0.0833,1,0.0
3,43,65,5,-1,0.142857,-1,0.0,0.0833,-1,0.0
4,22,43,8,1,0.142857,-1,1.0,0.5,-1,1.0
5,12,12,12,-1,0.142857,-1,0.0,0.0833,-1,0.0
6,25,11,90,1,0.142857,1,0.0,0.0833,1,0.0


In [8]:
from math import log,exp
#error calculation
e2 = sum(df['misclassified2'] * df['weights2'])
#calculation of alpha (performance)
alpha2 = 0.5*log((1-e2)/e2)

#update weight
new_weight = df['weights2']*np.exp(-1*alpha1*df['revenue']*df['pred2'])
#normalized weight
z = sum(new_weight)
normalized_weight = new_weight/sum(new_weight)
df['weights3'] = round(normalized_weight,4)
df


Unnamed: 0,profit,research,travel,revenue,weights1,pred1,misclassified1,weights2,pred2,misclassified2,weights3
0,23,0,65,1,0.142857,1,0.0,0.0833,1,0.0,0.0238
1,31,61,25,1,0.142857,1,0.0,0.0833,1,0.0,0.0238
2,54,21,53,1,0.142857,1,0.0,0.0833,1,0.0,0.0238
3,43,65,5,-1,0.142857,-1,0.0,0.0833,-1,0.0,0.0238
4,22,43,8,1,0.142857,-1,1.0,0.5,-1,1.0,0.8572
5,12,12,12,-1,0.142857,-1,0.0,0.0833,-1,0.0,0.0238
6,25,11,90,1,0.142857,1,0.0,0.0833,1,0.0,0.0238


In [9]:
print(alpha1, alpha2)
alpha = []
print()
for i in range(len(df)):
    weight = float( df.pred1[i]*alpha1 + df.pred2[i]*alpha2)
    alpha.append(weight)
print(alpha)
print(np.sign(alpha))
df['y_pred'] = np.sign(alpha)
df

0.8958797346140276 0.0

[0.8958797346140276, 0.8958797346140276, 0.8958797346140276, -0.8958797346140276, -0.8958797346140276, -0.8958797346140276, 0.8958797346140276]
[ 1.  1.  1. -1. -1. -1.  1.]


Unnamed: 0,profit,research,travel,revenue,weights1,pred1,misclassified1,weights2,pred2,misclassified2,weights3,y_pred
0,23,0,65,1,0.142857,1,0.0,0.0833,1,0.0,0.0238,1.0
1,31,61,25,1,0.142857,1,0.0,0.0833,1,0.0,0.0238,1.0
2,54,21,53,1,0.142857,1,0.0,0.0833,1,0.0,0.0238,1.0
3,43,65,5,-1,0.142857,-1,0.0,0.0833,-1,0.0,0.0238,-1.0
4,22,43,8,1,0.142857,-1,1.0,0.5,-1,1.0,0.8572,-1.0
5,12,12,12,-1,0.142857,-1,0.0,0.0833,-1,0.0,0.0238,-1.0
6,25,11,90,1,0.142857,1,0.0,0.0833,1,0.0,0.0238,1.0


In [10]:
print(alpha1, alpha2)
alpha = []
print()
for i in range(len(df)):
    weight = (df.weights2[i]*df.pred1[i] + df.weights3[i]*df.pred2[i] )
    alpha.append(weight)
print(np.sign(alpha))

0.8958797346140276 0.0

[ 1.  1.  1. -1. -1. -1.  1.]
