In [13]:
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
%matplotlib inline

In [14]:
# Load the raw arrays from disk: pixels as float64, labels as float32.
data = np.load('MNIST.npy').astype(np.float64)
labels = np.load('MNIST_labels.npy').astype(np.float32)

# n = number of samples, p = number of features per sample.
n, p = data.shape[0], data.shape[1]

# Scale every pixel in one vectorised step instead of looping over the rows.
data /= 255 # The value lie in [0,1] (assuming raw pixels are in 0..255)
print (data.shape)

(70000, 784)


In [15]:
# Get Mnist data and split into train validation and test
def get_mnist(data_path='MNIST.npy', labels_path='MNIST_labels.npy',
              train_size=50000, val_size=10000, test_size=10000):
    """Load the MNIST arrays and split them into train / validation / test.

    The pixel array is converted to float32 and divided by 255; the labels are
    converted to float32. Rows are split contiguously, in file order:
    the first ``train_size`` rows are training data, the next ``val_size`` rows
    are validation data and the following ``test_size`` rows are test data.

    Parameters
    ----------
    data_path : str
        Path of the ``.npy`` file holding one sample per row.
    labels_path : str
        Path of the ``.npy`` file holding one label per sample.
    train_size, val_size, test_size : int
        Number of rows assigned to each split. The defaults reproduce the
        50000 / 10000 / 10000 split.

    Returns
    -------
    tuple
        ``(train_dat, train_labels), (val_dat, val_labels), (test_dat, test_labels)``.
    """
    # Convert straight to float32: going through float64 first only costs
    # memory, since the result is cast to float32 anyway.
    data = np.load(data_path).astype(np.float32) / 255.
    labels = np.load(labels_path).astype(np.float32)

    val_start = train_size
    test_start = val_start + val_size
    test_end = test_start + test_size

    train_dat = data[:val_start]
    train_labels = labels[:val_start]

    val_dat = data[val_start:test_start]
    val_labels = labels[val_start:test_start]

    test_dat = data[test_start:test_end]
    test_labels = labels[test_start:test_end]

    return (train_dat, train_labels), (val_dat, val_labels), (test_dat, test_labels)

In [16]:
# Unpack the three (data, labels) pairs returned by get_mnist() into the
# notebook-level names used by the cells below.
(Train,TrainLabel),(Val,ValLabel),(Test,TestLabel)=get_mnist()

## AdaBoostClassifier

In [26]:
# Number of boosting rounds. MinimumErrorTree is not defined in the visible
# cells; presumably an earlier cell sets it (TODO confirm).
nTrees = MinimumErrorTree

# Base learner: a random forest restricted to a single entropy-criterion tree.
clfb = RandomForestClassifier(
    criterion="entropy",
    max_features=100,
    min_samples_split=2,
    n_estimators=1,
)
AdaB = AdaBoostClassifier(clfb, algorithm="SAMME", n_estimators=nTrees)

# Wall-clock timing of the fit only.
start_time = time.time()
AdaB.fit(Train, TrainLabel)
end_time = time.time()

t = end_time - start_time
print("The training time is about", round(t, 4))

The training time is about 23.6632


In [27]:
def _error_rate(model, X, y):
    """Return the fraction of rows in X whose predicted label differs from y."""
    # np.mean over the boolean mismatch mask is the misclassification rate.
    return np.mean(model.predict(X) != y)


# Misclassification rate of the fitted AdaBoost model on each split.
e1 = _error_rate(AdaB, Train, TrainLabel)
e2 = _error_rate(AdaB, Val, ValLabel)
e3 = _error_rate(AdaB, Test, TestLabel)

print("The training   error rate is ",round(e1,4))
print("The validation error rate is ",round(e2,4))
print("The test       error rate is ",round(e3,4))

The training   error rate is  0.0001
The validation error rate is  0.0511
The test       error rate is  0.0477


The training error rate is similar to that of the Random Forest classifier, but the test error rate is slightly higher than the Random Forest's.

Because AdaBoost repeatedly increases the weights of misclassified training samples, it can overfit easily. This explains the fairly large gap between the training error and the validation and test errors.

## Different boosting reweighting rule

In [28]:
# Number of trees to grow. MinimumErrorTree is not defined in the visible
# cells; presumably an earlier cell sets it (TODO confirm).
nTrees = MinimumErrorTree

# Initial weights: one float weight of 1.0 per training sample.
Weights = np.ones(len(Train))

# Containers filled by the boosting loop: per-split probability outputs
# and every fitted tree.
Predict_Prob_Train, Predict_Prob_Val = [], []
TreeList = []

In [29]:
# Boosting with a custom reweighting rule: after each tree is grown, the weight
# of every misclassified training sample is divided by that tree's training
# error rate, and the next tree is fitted with the updated weights.

if nTrees < 1:
    raise ValueError("nTrees must be at least 1")

# Reset the per-run state here so that re-executing this cell starts from
# scratch instead of stacking onto trees and weights left by a previous run.
Weights = np.ones(len(Train))   # equal weights for the first tree
TreeList = []                   # every fitted tree, in growth order
train_prob_sum = 0.0            # running sum of predict_proba(Train)
val_prob_sum = 0.0              # running sum of predict_proba(Val)

start_time = time.time()
for i in range(nTrees):
    #Grow one tree
    clf = RandomForestClassifier(n_estimators=1, min_samples_split=2,
                                 max_features=100, criterion="entropy")
    # Fit the tree that was just created, not whatever sits at TreeList[i].
    clf.fit(Train, TrainLabel, sample_weight=Weights)
    TreeList.append(clf)

    #Update weights by using error rate for each sample
    misclassified = clf.predict(Train) != TrainLabel
    e = misclassified.mean()
    if misclassified.any():     # nothing to update (and e == 0) otherwise
        Weights[misclassified] /= e

    #Add terminal probability of all trees
    train_prob_sum = train_prob_sum + clf.predict_proba(Train)
    val_prob_sum = val_prob_sum + clf.predict_proba(Val)

end_time = time.time()
t = end_time - start_time
print("The training time is about", round(t, 4))

#Average the terminal probabilities of trees
# A leading length-1 axis is kept, giving shape (1, n_samples, n_classes),
# because the evaluation cell reads these arrays as Predict_Prob_*[0][i].
Predict_Prob_Train = (train_prob_sum / nTrees)[np.newaxis]
Predict_Prob_Val = (val_prob_sum / nTrees)[np.newaxis]

The training time is about 27.476


In [30]:
# Error rates of the averaged-probability ensemble on the train and validation
# splits. Predict_Prob_*[0] drops the leading length-1 axis of the stored
# arrays, leaving one row of class probabilities per sample.
# The winning column index is compared directly with the label, which presumes
# the labels are the class indices 0..K-1 (TODO confirm).

#Find the label with the highest probability
train_pred = np.argmax(Predict_Prob_Train[0], axis=1)
e1 = float(np.mean(train_pred != TrainLabel))

val_pred = np.argmax(Predict_Prob_Val[0], axis=1)
e2 = float(np.mean(val_pred != ValLabel))

In [31]:
#Predict test data with all trees we saved

def Predict_data(data, trees=None):
    """Predict a class index for every row of ``data`` from averaged tree probabilities.

    Each tree's ``predict_proba(data)`` output is summed, the sum is divided by
    the number of trees, and the column with the largest averaged probability
    is chosen for each row.

    Parameters
    ----------
    data : array-like
        Samples to classify, one per row.
    trees : sequence of fitted estimators with ``predict_proba``, optional
        Trees to average over. Defaults to the notebook-level ``TreeList``.

    Returns
    -------
    list
        One predicted column index per row of ``data``.

    Raises
    ------
    ValueError
        If there are no trees to average over.
    """
    if trees is None:
        trees = TreeList
    if len(trees) == 0:
        raise ValueError("Predict_data needs at least one fitted tree")

    # Take the output shape from the trees themselves rather than from a global.
    Predict_Prob = None
    for tree in trees:
        proba = np.asarray(tree.predict_proba(data))
        Predict_Prob = proba if Predict_Prob is None else Predict_Prob + proba

    # Average over the number of trees (not the number of samples).
    Predict_Prob = Predict_Prob / len(trees)

    return list(np.argmax(Predict_Prob, axis=1))

In [32]:
# Label predictions for the test split from all stored trees.
Predict_Test = Predict_data(Test)

# Predict_data returns a list; converting it to an array makes the
# element-wise comparison with TestLabel explicit. The mean of the mismatch
# mask is the test error rate.
e3 = np.mean(np.asarray(Predict_Test) != TestLabel)

In [33]:
# Report the three error rates, rounded to four decimals.
for caption, rate in (
    ("The training   error rate is ", e1),
    ("The validation error rate is ", e2),
    ("The test       error rate is ", e3),
):
    print(caption, round(rate, 4))

The training   error rate is  0.0
The validation error rate is  0.0473
The test       error rate is  0.041


We changed the weight-update rule used by AdaBoost. The new rule also yields a very small training error rate. Its validation and test error rates lie between those of Random Forest and those of AdaBoost, so the new weight-update rule overfits less than AdaBoost.