In [16]:
import pandas as pd
import numpy as np
import sklearn
import random
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

In [2]:
#mnist dataset with flatten images.
#use np.reshape(-1) directly with flatten through a loop
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=False)
mnist.data.shape

(70000, 784)

In [3]:
X = mnist.data.astype('float32')
y = mnist.target.astype('int64')

In [12]:
#Function for 20% data for testing and another 20% for validation. 
def train_test_valid_split(df_x, df_y):
  #60:20:20
  train = int(70000*6/10)
  valid = train + int(70000*2/10)
  X_train = df_x[:train]
  y_train = df_y[:train]
  X_valid = df_x[train:valid]
  y_valid = df_y[train:valid]
  X_test = df_x[valid:]
  y_test = df_y[valid:]
  return X_train, y_train, X_valid, y_valid, X_test, y_test

#Function for binary vectors
def binarization(data):
  a = data.copy()
  a /= 255.0
  a = (a > 0.5).astype(np.int_)
  return a 

#Decison Tree based on 10 randomly selected samples
def dt_train_valid(xtrain, ytrain, xvalid, yvalid):
  tran = []
  val = []
  col = []
  for i in range(0,10):
    x = random.randint(0,783)
    s = xtrain[:,x]
    ss = xvalid[:,x]
    tran.append(s)
    val.append(ss)
    col.append(x)
  trin = np.array(tran)
  vald = np.array(val)
  train = trin.T
  valid = vald.T
  dt = DecisionTreeClassifier(max_depth = 5)
  model = dt.fit(train, ytrain)
  ypred = model.predict(valid)
  total = len(yvalid)
  right = 0
  for w in range(0,total):
    if (int(ypred[w]) == yvalid[w]):
      right += 1
  acc = right/total
  return model,acc,col

def test_pred(xtest, ytest, model, feature):
  tes = []
  for i in feature:
    x = xtest[:,i]
    tes.append(x)
  tst = np.array(tes)
  test = tst.T
  ypred = model.predict(test) 
  return ypred



In [5]:
#Decison Tree based on 10 randomly selected samples
X_TRANS = binarization(X)
X_train, y_train, X_valid, y_valid, X_test, y_test = train_test_valid_split(X_TRANS, y)
model , accuracy, feature = dt_train_valid(X_train, y_train, X_valid, y_valid)
print("The validation accuracy on this dataset is",int(accuracy*100),"%.")

The validation accuracy on this dataset is 33 %.


In [69]:
#50 Decision Trees together and the validation accuracies
models = []
valid_accs = []
a = 0.58
features = []

for i in range(0,50):
  m,ac,f = dt_train_valid(X_train, y_train, X_valid, y_valid)
  models.append(m)
  valid_accs.append(ac) 
  features.append(f)

In [18]:
for i in range(0,50):
  print("The validation accuracy on this dataset is",int(valid_accs[i]*100),"% for tree number: ",i)

The validation accuracy on this dataset is 30 % for tree number:  0
The validation accuracy on this dataset is 35 % for tree number:  1
The validation accuracy on this dataset is 39 % for tree number:  2
The validation accuracy on this dataset is 29 % for tree number:  3
The validation accuracy on this dataset is 36 % for tree number:  4
The validation accuracy on this dataset is 26 % for tree number:  5
The validation accuracy on this dataset is 34 % for tree number:  6
The validation accuracy on this dataset is 24 % for tree number:  7
The validation accuracy on this dataset is 27 % for tree number:  8
The validation accuracy on this dataset is 26 % for tree number:  9
The validation accuracy on this dataset is 29 % for tree number:  10
The validation accuracy on this dataset is 40 % for tree number:  11
The validation accuracy on this dataset is 27 % for tree number:  12
The validation accuracy on this dataset is 23 % for tree number:  13
The validation accuracy on this dataset is 3

In [70]:
#Weighted Classification
ypred_all = []
countt = a*len(y_test)
for i in range(0,50):
  yprd = test_pred(X_test, y_test, models[i], features[i])
  ypred_all.append(yprd)

In [8]:
#Weighted Classification
votes = np.zeros((14000, 10))
for j in range(0,14000):
  for i in range(0,50):
    num = int(ypred_all[i][j])
    votes[j,num] += valid_accs[i] 

In [9]:
#Weighted Classification
TestPred = []
count = 0
for i in range(0,14000):
  predict = np.argmax(votes[i])
  TestPred.append(predict)
  if (int(predict) == y_test[i]):
    count += 1
#Accuracy of ensemble model on test set
TestAccuracy = count/14000
print("Accuracy of the ensemble model on test set is:",int(TestAccuracy*100),"%.")

Accuracy of the ensemble model on test set is: 54 %.


Adaboost using DT: 

In [19]:
class AdaBoost:

    def __init__(self):
        self.stumps = None
        self.stump_weights = None
        self.errors = None
        self.sample_weights = None

In [56]:
#Defining Adaboost with weak classifier as DT with max_depth 5.
def fit(self, X, y, iters):
    n = X.shape[0]

    # init numpy arrays
    self.sample_weights = np.zeros(shape=(iters, n))
    self.stumps = np.zeros(shape=iters, dtype=object)
    self.stump_weights = np.zeros(shape=iters)
    self.errors = np.zeros(shape=iters)

    # initialize weights uniformly
    self.sample_weights[0] = np.ones(shape=n) / n

    for t in range(iters):
        # fit  weak learner
        curr_sample_weights = self.sample_weights[t]
        dt = DecisionTreeClassifier(max_depth=5)
        stump = dt.fit(X, y, sample_weight=curr_sample_weights)

        # calculate error and stump weight from weak learner prediction
        stump_pred = stump.predict(X)
        err = curr_sample_weights[(stump_pred != y)].sum()# / n
        stump_weight = np.log((1 - err) / err) / 2
        xyu = []
        for i in range(0,len(stump_pred)):
          if (int(stump_pred[i]) == y_train[i]):
            xyu.append(int(1))
          else:
            xyu.append(int(-1))
        x = np.array(xyu)

        # update sample weights
        new_sample_weights = (curr_sample_weights * np.exp(-stump_weight * (x)))
        
        new_sample_weights /= new_sample_weights.sum()

        # If not final iteration, update sample weights for t+1
        if t+1 < iters:
            self.sample_weights[t+1] = new_sample_weights

        # save results of iteration
        self.stumps[t] = stump
        self.stump_weights[t] = stump_weight
        self.errors[t] = err

    return self


def predict(self, X):
    """ Make predictions using already fitted model """
    stump_preds = np.array([stump.predict(X) for stump in self.stumps])
    return np.dot(self.stump_weights, stump_preds)

In [71]:
AdaBoost.fit = fit
AdaBoost.predict = predict

clf = AdaBoost().fit(X_train, y_train, iters=20)

predtn = clf.predict(X_test)
count = 0
for i in range(len(predtn)):
  if (int(predtn[i]) == y_test[i]):
    count += 1

Test accuracy:


In [74]:
#We report the accuracy on the test set using this block of code
accuracy = (countt/len(predtn))

print('Test accuracy:', accuracy)

Test accuracy: 58.0
