# Übung 10 Ada Boost - Rainier Robles & Valentin Wolf

In [513]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Importing IMDB-data

In [497]:
def parse_libsvm(fpath):
    """Parses each entry (==line) in a LIBSVM files to a dict
    in dicts and stores the label in lables"""
    dicts = []
    labels = []
    for l in open(fpath):
        dictonary = {}
        label = int(l.split()[0])
        tokens = l.split()[1:]
        for token in tokens:
            token = token.split(":")
            key = int(token[0])
            val = int(token[1])
            dictonary[key] = val
        dicts.append(dictonary)
        labels.append(label)
    return dicts, labels

X_train,y_train = parse_libsvm('labeledBow.feat') 

X_test,y_test = parse_libsvm('labeledBowTest.feat')

In [498]:
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
print('mean:',y_train.mean())

# Average Score is ~5.5, so we change the lables 1 if above average and 0 otherwise
y_train[y_train < 5] = -1
y_train[y_train > 5] = 1

y_test[y_test < 5] = -1
y_test[y_test > 5] = 1

mean: 5.47772


# Building weak classifiers

Each classifier only looks at one word and predicts if the existence/absence of the word is an indicator of a positive/negative comment.

In [400]:
class Classifier():
    def error_rate(self,truth, pred):
        """gets two vectors, returns (wrongly classified / total)"""
        return 1 - self.accuracy(truth, pred)
    
    def accuracy(self,truth,pred):
        return np.mean(truth == pred)

In [401]:
class weak_cls(Classifier):
    def __init__(self,token):
        self.factor = -1
        self.token = token
        
    def fit(self,X_train,y_train):
        """
        X_train: list of dicts with the tokenized words as ints
        y_train: lables -1,1
        Classifies 1 if token is in dict and -1 otherwise
        
        if accuracy is <50% it flips those values 
        => accuracy is always >=50% """
        
        pred = self.predict(X_train)

        if self.accuracy(y_train,pred) < 0.5:
            self.factor = self.factor * -1
        
    def predict(self,X_train):
        num_samples = len(X_train)
        pred_y = np.zeros(num_samples)
        
        for j in range(num_samples):
            if self.token in X_train[j]:
                pred_y[j] = 1 * self.factor
            else:
                pred_y[j] = -1 * self.factor
        return pred_y

In [507]:
#creating of the weak learners with acc >50%
weak_clss = []
j=0
for i in range(10000):
    cls = weak_cls(i)
    cls.fit(X_train,y_train)
    acc = cls.accuracy(y_train,cls.predict(X_train))
    if acc > 0.5:
        j+=1
        weak_clss.append(cls)
print(len(weak_clss))

9828


In [1]:
class AdaBoost2(Classifier):
    def train(self, X, y , weak_learners, X_test=None, y_test=None,
              iterations = 10, print_freq=10):
        self.weights = np.full(y.shape, 1/y.shape[0])
        self.assemble = []
        
        def weightedSumErr(learner_i,y):
            """the weighted sum error for misclassified points of learner_i
            normalized by the total sum of wheights"""
            pred = preds[learner_i]
            return np.sum(self.weights[y != pred]) #/ np.sum(self.weights)
            
        preds = np.zeros((len(weak_learners),y.shape[0]))
        
        for i in range(len(weak_learners)):
            preds[i] = (weak_learners[i].predict(X))

        for iteration in range(iterations):
            best_learner_i = 0 
            min_score = weightedSumErr(0,y)
            
            for i in range(len(weak_learners)):
                score = weightedSumErr(i,y)
                if score < min_score:
                    best_learner_i = i
                    min_score = score
            
            alpha = 0.5 * np.log((1-min_score)/ min_score)
            best_learner = weak_learners[best_learner_i]
            self.assemble.append((best_learner,alpha))
            
            # update weights
            # if pred was correct the weight gets smaller as exp(something<0) < 1
            # and bigger if it was incorrectly classified
            pred = preds[best_learner_i]
            self.weights[y == pred] = self.weights[y == pred] * np.exp(-alpha)
            self.weights[y != pred] = self.weights[y != pred] * np.exp(alpha)
            
            weak_learners.remove(weak_learners[best_learner_i])
            preds = np.delete(preds,best_learner_i,0)
            
            # printing the results every print_freq (or the last) iterations:
            if iteration % print_freq == 0 or iteration == iterations - 1:
                print("iteration: ", 
                      iteration, 
                      "best_learner.accuracy: ",
                      best_learner.accuracy(y_train,best_learner.predict(X_train)),
                      "min_score: ",
                      min_score,
                      "alpha: ",
                      alpha)
                print("Total training accuracy: ", self.accuracy(y,self.predict(X)))
                try: # if X_test and y_test were given as parameters:
                    print("Total Test accuracy: ", self.accuracy(y_test,self.predict(X_test)))
                except TypeError: 
                    pass

    def predict(self,X):
        samples = len(X)
        pred = np.zeros(len(X))
        for learner, alpha in self.assemble:
            pred += learner.predict(X) * alpha

        pred = np.sign(pred)
        return pred

NameError: name 'Classifier' is not defined

In [516]:
weak_clss_copy = weak_clss.copy()
ada2 = AdaBoost2()
ada2.train(X_train,y_train,weak_clss_copy,iterations=100,X_test=X_test,y_test=y_test,print_freq=10)

iteration:  0 best_learner.accuracy:  0.61532 min_score:  0.38468 alpha:  0.234865318922
Total training accuracy:  0.61532
Total Test accuracy:  0.61772
iteration:  10 best_learner.accuracy:  0.54956 min_score:  0.442015272034 alpha:  0.116493579588
Total training accuracy:  0.7156
Total Test accuracy:  0.71376
iteration:  20 best_learner.accuracy:  0.53604 min_score:  0.452394084434 alpha:  0.0955011139387
Total training accuracy:  0.74256
Total Test accuracy:  0.74016


KeyboardInterrupt: 

In [527]:
weak_clss_copy = weak_clss.copy()
ada2 = AdaBoost2()
ada2.train(X_train,y_train,weak_clss_copy,iterations=100,X_test=X_test,y_test=y_test,print_freq=10)

iteration:  0 best_learner.accuracy:  0.61532 min_score:  0.38468 alpha:  0.234865318922
Total training accuracy:  0.61532
Total Test accuracy:  0.61772
iteration:  10 best_learner.accuracy:  0.54956 min_score:  0.346566909232 alpha:  0.317082120813
Total training accuracy:  0.71932
Total Test accuracy:  0.71912
iteration:  20 best_learner.accuracy:  0.5338 min_score:  0.324630579116 alpha:  0.366285987592
Total training accuracy:  0.7546
Total Test accuracy:  0.75608
iteration:  30 best_learner.accuracy:  0.51784 min_score:  0.315037032284 alpha:  0.388337290175
Total training accuracy:  0.77084
Total Test accuracy:  0.77152


KeyboardInterrupt: 

In [520]:
np.log((1-0.4)/100)

-5.1159958097540823

In [521]:
ada.weights

array([ 0.52871163,  0.52156632,  1.04731717, ...,  1.11041813,
        2.10623709,  0.58765527])

In [525]:
np.full(y_train.shape, 1/y_train.shape[0])

array([  4.00000000e-05,   4.00000000e-05,   4.00000000e-05, ...,
         4.00000000e-05,   4.00000000e-05,   4.00000000e-05])

In [523]:
y_train.shape

(25000,)