In [44]:
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity as cos
import time
from itertools import chain
import random
from operator import itemgetter

In [3]:
#Read data
x_train = np.load('../exam_19S/pb1/x_train.npy')
print(np.shape(x_train))
# (60000, 784)
x_test = np.load('../exam_19S/pb1/x_test_usetodebug.npy')
print(np.shape(x_test))
# (10000, 784)
y_train = np.load('../exam_19S/pb1/y_train.npy')
print(np.shape(y_train))
# (60000,)
y_test = np.load('../exam_19S/pb1/y_test.npy')
print(np.shape(y_test))
# (10000,)

(60000, 784)
(10000, 784)
(60000,)
(10000,)


In [29]:
for i in range(10):
    index=y_train==i
    val=int(sum(index)*0.2)
    if i == 0:
        mnist_train=x_train[index][:val]
        train_label=y_train[index][:val]
    else:
        mnist_train=np.vstack((mnist_train,x_train[index][:val]))
        train_label=np.append(train_label,y_train[index][:val])

In [5]:
def decision_stump(data,y,dist,random):
    
    best_f=None
    max_error=0.5
    best_t=None
    best_pred=None
    if random:
        best_f=np.random.randint(data.shape[1])
        best_t,max_error,best_pred=best_threshold(data[:,best_f],y,dist)
    else:
        for f in range(data.shape[1]):
            t,error,pred=best_threshold(data[:,f],y,dist)
    #         print(error)
            if(abs(0.5-error)>abs(0.5-max_error)):
                max_error=error
                best_f=f
                best_t=t
                best_pred=pred
                if(abs(0.5-max_error)>0.35):
                    break
    return best_f,best_t,max_error,best_pred

In [6]:
def thresholds(a):
    u=np.unique(a)
    u.sort()
    t=[(u[i*3-1]+u[i*3])/2 for i in range(1,int(len(u)/3))]
    return [u.min()]+t+[u.max()]

In [7]:
def best_threshold(x,y,d):
    max_error=0.5
    best_t=None
    best_pred=None
    weighted_x=x
    t=thresholds(weighted_x)
    for i in t:
        pred=np.ones(len(y))
        pred[weighted_x<i]=-1.0
        error=sum(d[pred!=y])
        if(abs(0.5-error)>abs(0.5-max_error)):
            max_error=error
            best_t=i
            best_pred=pred
    return best_t,max_error,best_pred

In [8]:
def predict(x_f,alpha,t):
    val=np.ones(len(x_f))
    val[x_f<t]=-1.0
    return alpha*val

In [9]:
def generate_roc(actual,log_odds):
    index=[min(log_odds)]+[np.mean(i) for i in np.array_split(log_odds,10)]+[max(log_odds)]
    tpr=[]
    fpr=[]
    for k in index:
        ans=[1.0 if i> k else -1. for i in log_odds]
        matrix=confusion_matrix(actual,ans)
        tpr.append(matrix[0,0]/sum(matrix[0]))
        fpr.append(matrix[1,0]/sum(matrix[1]))
    tpr.sort()
    fpr.sort()
    return tpr,fpr

In [10]:
def confusion_matrix(actual,pred):
    tp=0
    tn=0
    fp=0
    fn=0
    for i in range(len(actual)):
        if actual[i]==-1.:
            if actual[i]==pred[i]:
                tn+=1
            else:
                fp+=1
        elif actual[i]==1. :
            if actual[i]==pred[i]:
                tp+=1
            else:
                fn+=1
    return np.array([[tp,fn],[fp,tn]])

In [11]:
def calculate_auc(x,y):
    val=0.0
    for i in range(1,len(x)):
        val+= (x[i]-x[i-1])*(y[i]+y[i-1])
    return val/2

In [81]:
def adaboost(x_train,x_test,y_train,y_test,epochs,random):
    model=[]
    local_error=[]
    train_we=[]
    test_we=[]
    test_auc=[]
    fx_val_train=np.zeros(len(x_train))
    fx_val_test=np.zeros(len(x_test))
    dt=np.ones(len(x_train))/len(x_train)
    for t in range(1,epochs+1):

        f,thresh,error,pred= decision_stump(x_train,y_train,dt,random)   #1 round error
        
        local_error.append(error)
        alpha = np.log((1-error)/error)/2
        
        model.append((alpha,(f,thresh)))
        
        fx_val_train+=predict(x_train[:,f],alpha,thresh)
        fx_val_test+=predict(x_test[:,f],alpha,thresh)
        
        update=np.exp(-alpha*pred*y_train)
#         print(pred*y_train)
#         print(update)
        dt=dt*update
        dt=dt/sum(dt)
        
        pred_train=np.ones(len(y_train))
        pred_train[fx_val_train<0]=-1.0
        
        err_train=np.mean(pred_train!=y_train)
        train_we.append(err_train)
        
        pred_test=np.ones(len(y_test))
        pred_test[fx_val_test<0]=-1.0
        
        err_test=np.mean(pred_test!=y_test)
        test_we.append(err_test)
        
        tpr,fpr=generate_roc(y_test,fx_val_test)
        
        auc=calculate_auc(fpr,tpr)
        test_auc.append(auc)
        if t%100 is 0:
            print("Round:",t,",Feature:",f,",Threshold:",thresh,",Round_err:",error,",Train_err:",err_train,",Test_err:",err_test,"AUC:",auc)
    return model, pred_test

In [13]:
def ecoc(train, test,classes,functions):
    
    codes=np.array([np.random.choice((-1.,1.),size=functions) for i in range(classes)])
    
    

    for i in range(functions):
        x_train=train[:,:-1].copy()
        x_test=test[:,:-1].copy()
        print(i)
        y_train=np.array([codes[:,i][int(l)] for l in train[:,-1]])
        y_test=np.array([codes[:,i][int(l)] for l in test[:,-1]])
        
        m,pred=adaboost(x_train,x_test,y_train,y_test,200,False)
        
        if i==0:
            predictions=pred.reshape((len(pred),1))
        else:
            predictions=np.hstack((predictions,pred.reshape((len(pred),1))))
    
    return predictions,codes

In [14]:
def black(img):
    black=np.zeros((29,29))
    for i in range(len(img)):
        for j in range(len(img)):
            val=1 if img[i][j]==0 else 0
            black[i][j]=black[i-1][j]+black[i][j-1]-black[i-1][j-1]+val
    return black

In [15]:
def generate():
    rect=set()
    random.seed()
    while(len(rect)!=100):
        x=(random.randint(0,28),random.randint(0,28),random.randint(5,28),random.randint(5,28))
        area=x[2]*x[3]
        if(area in range(130,171) and x[0]+x[2]<28 and x[1]+x[3]<28):
            rect.add(x)
    return rect

In [16]:
def featureVal(r,black):
    x=r[0]
    y=r[1]
    l=r[2]
    w=r[3]
    f1=black[x+int(l/2)][y+w]-black[x+int(l/2)][y]-black[x][y+w]+black[x][y]\
    -(black[x+l][y+w]-black[x+int(l/2)+1][y+w]-black[x+l][y]+black[x+int(l/2)+1][y])
    f2=black[x+l][y+int(w/2)]-black[x+l][y]-black[x][y+int(w/2)]+black[x][y]\
    -(black[x+l][y+w]-black[x][y+w]-black[x+l][y+int(w/2)+1]+black[x][y+int(w/2)+1])
    return f1,f2

In [17]:
def formFeatures(black,rect):
    feat=np.asarray(list(chain.from_iterable((featureVal(r,black)) for r in rect)))
    return feat

In [37]:
black_train=np.asarray([black(img.reshape(28,28)) for img in mnist_train])

In [38]:
black_test=np.asarray([black(img.reshape(28,28)) for img in x_test])

In [42]:
arr=generate()

In [45]:
trainData=np.vstack([formFeatures(black,arr) for black in black_train])

In [46]:
testData=np.vstack([formFeatures(black,arr) for black in black_test])

In [73]:
final_train=np.hstack((trainData,train_label.reshape(-1,1)))

In [74]:
final_test=np.hstack((testData,y_test.reshape(-1,1)))

In [75]:
np.random.shuffle(final_train)

In [82]:
final_codes,codes=ecoc(final_train,final_test,10,50)

0
Round: 100 ,Feature: 23 ,Threshold: 18.5 ,Round_err: 0.46677172716053356 ,Train_err: 0.06993997999333111 ,Test_err: 0.0856 AUC: 0.9096208447303042
Round: 200 ,Feature: 76 ,Threshold: 12.5 ,Round_err: 0.46853782611101663 ,Train_err: 0.05351783927975992 ,Test_err: 0.0706 AUC: 0.9237564501437485
Round: 300 ,Feature: 4 ,Threshold: 17.5 ,Round_err: 0.5227672783467556 ,Train_err: 0.04751583861287096 ,Test_err: 0.063 AUC: 0.93018170278794
1
Round: 100 ,Feature: 28 ,Threshold: 6.5 ,Round_err: 0.5392049321379633 ,Train_err: 0.058686228742914306 ,Test_err: 0.0564 AUC: 0.9498532039489349
Round: 200 ,Feature: 13 ,Threshold: 10.5 ,Round_err: 0.528517315811917 ,Train_err: 0.04434811603867956 ,Test_err: 0.0485 AUC: 0.9567822992119028
Round: 300 ,Feature: 45 ,Threshold: -1.5 ,Round_err: 0.47165433358346326 ,Train_err: 0.039179726575525174 ,Test_err: 0.0467 AUC: 0.9591659697524842
2
Round: 100 ,Feature: 18 ,Threshold: 15.5 ,Round_err: 0.5416639000359281 ,Train_err: 0.05801933977992664 ,Test_err: 0.06

Round: 300 ,Feature: 66 ,Threshold: 4.5 ,Round_err: 0.4782755899740921 ,Train_err: 0.05626875625208403 ,Test_err: 0.0754 AUC: 0.9009392456153195
19
Round: 100 ,Feature: 146 ,Threshold: -3.5 ,Round_err: 0.5430570662425003 ,Train_err: 0.05685228409469823 ,Test_err: 0.0609 AUC: 0.9261632838177322
Round: 200 ,Feature: 20 ,Threshold: 30.5 ,Round_err: 0.46986362469742027 ,Train_err: 0.04276425475158386 ,Test_err: 0.0485 AUC: 0.9310686364107068
Round: 300 ,Feature: 21 ,Threshold: 34.5 ,Round_err: 0.47345000420546723 ,Train_err: 0.03542847615871957 ,Test_err: 0.0445 AUC: 0.9356502811741803
20
Round: 100 ,Feature: 105 ,Threshold: -14.5 ,Round_err: 0.5355229267409519 ,Train_err: 0.07994331443814605 ,Test_err: 0.0894 AUC: 0.8808270264822653
Round: 200 ,Feature: 174 ,Threshold: 4.5 ,Round_err: 0.4755825526068658 ,Train_err: 0.06527175725241748 ,Test_err: 0.0774 AUC: 0.8822124347246217
Round: 300 ,Feature: 50 ,Threshold: 24.5 ,Round_err: 0.5216876115114012 ,Train_err: 0.05735245081693898 ,Test_err:

Round: 200 ,Feature: 190 ,Threshold: 21.5 ,Round_err: 0.5292630066449069 ,Train_err: 0.06377125708569523 ,Test_err: 0.074 AUC: 0.9194167048720197
Round: 300 ,Feature: 54 ,Threshold: -3.5 ,Round_err: 0.47373361911215817 ,Train_err: 0.0559353117705902 ,Test_err: 0.0675 AUC: 0.9239489239227539
38
Round: 100 ,Feature: 130 ,Threshold: 10.5 ,Round_err: 0.46586277180913926 ,Train_err: 0.09853284428142714 ,Test_err: 0.1066 AUC: 0.8917101495956807
Round: 200 ,Feature: 46 ,Threshold: -3.5 ,Round_err: 0.4733341812545086 ,Train_err: 0.07810936978992998 ,Test_err: 0.0936 AUC: 0.902678869189246
Round: 300 ,Feature: 5 ,Threshold: -5.5 ,Round_err: 0.4818714915204212 ,Train_err: 0.06910636878959653 ,Test_err: 0.0868 AUC: 0.9099000964630924
39
Round: 100 ,Feature: 62 ,Threshold: -8.5 ,Round_err: 0.45246099382639 ,Train_err: 0.057685895298432814 ,Test_err: 0.0643 AUC: 0.914403657132149
Round: 200 ,Feature: 161 ,Threshold: -5.5 ,Round_err: 0.47251999749623946 ,Train_err: 0.046098699566522176 ,Test_err: 0.

In [83]:
def accuracy(actual,pred):
    return np.mean(actual==pred)

In [85]:
for i in codes:
    print(np.sum(i==codes,axis=1))

[50 23 21 27 25 23 31 26 24 25]
[23 50 28 26 24 30 22 27 29 26]
[21 28 50 28 16 24 22 25 27 28]
[27 26 28 50 22 34 28 29 29 22]
[25 24 16 22 50 26 30 29 21 24]
[23 30 24 34 26 50 24 25 25 20]
[31 22 22 28 30 24 50 31 23 26]
[26 27 25 29 29 25 31 50 26 25]
[24 29 27 29 21 25 23 26 50 21]
[25 26 28 22 24 20 26 25 21 50]


In [87]:
preds=np.argmax(cos(final_codes,codes),axis=1)
print("Accuracy on Digits dataset using ECOC is: ",accuracy(final_test[:,-1],preds))

Accuracy on Digits dataset using ECOC is:  0.9255
