# Übung 4: Fisher-Klassifikator - Rainier Robles & Valentin Wolf

In [284]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Import Data & split into train/test sets

In [760]:
data = pd.read_table('spam.data', delim_whitespace=True,header=None)

from sklearn.utils import shuffle
data = shuffle(data,random_state=1338)

N = data.shape[0]
split = 0.2
y_train = data.iloc[int(split*N):, -1].as_matrix()
X_train = data.iloc[int(split*N):, :-1].as_matrix()

y_test = data.iloc[:int(split*N), -1].as_matrix()
X_test = data.iloc[:int(split*N), :-1].as_matrix()

In [761]:
y_train

array([0, 0, 0, ..., 1, 1, 0])

In [762]:
X_train

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.70000000e+00,   1.20000000e+01,   6.80000000e+01],
       [  0.00000000e+00,   0.00000000e+00,   7.40000000e-01, ...,
          2.58700000e+00,   5.50000000e+01,   2.82000000e+02],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          2.10200000e+00,   1.20000000e+01,   8.20000000e+01],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          4.15700000e+00,   1.30000000e+01,   7.90000000e+01],
       [  5.00000000e-02,   5.00000000e-02,   4.00000000e-01, ...,
          4.90600000e+00,   9.50000000e+01,   1.31000000e+03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.25000000e+00,   2.00000000e+00,   5.00000000e+00]])

In [763]:
def is_invertible(X):
    return np.linalg.cond(X) < 1 / np.spacing(1)

### Define the Classifier

In [764]:
class Classifier(object):
    def error_rate(self,truth, pred):
        """gets two vectors, returns (wrongly classified / total)"""
        return 1 - self.accuracy(truth, pred)
    
    def accuracy(self,truth,pred):
        return np.mean(truth == pred)
    
def confused_matrix(x,y,percentage=False):
    assert(x.shape == y.shape)
    num_entries = x.shape[0]
    dim = len(np.unique(x))
    dim2 = len(np.unique(y))
    matrix = np.zeros((dim,dim2))
    for i in range(num_entries):
        matrix[int(x[i]),int(y[i])] += 1
    if percentage==True: 
        matrix /= np.sum(matrix,axis=1)
    return matrix


In [765]:
class FisherClassifier(Classifier):
    def fit(self,X,y):
        self.classes = np.unique(y)
        assert(len(np.unique(y)) == 2)
        
        self.alpha = self.calc_alpha(X,y)
        
        X1 = X[y == self.classes[0]]
        X2 = X[y == self.classes[1]]
        E1 = self.cov_matrix(X1)
        E2 = self.cov_matrix(X2)
        self.m1 = self.proj(self.center(X1))#np.dot(self.alpha.T,m1)
        self.m2 = self.proj(self.center(X2))#np.dot(self.alpha.T,m2)
        self.var1 = self.alpha.T.dot(E1).dot(self.alpha)#np.mean((self.proj(X1)-self.m1)**2)
        self.var2 = self.alpha.T.dot(E2).dot(self.alpha)
        self.aprio1 = X1.shape[0]/X.shape[0]
        self.aprio2 = X2.shape[0]/X.shape[0]
        
    def calc_alpha(self,X,y):
        classes_count = len(self.classes)
        features = X.shape[1]
        X1 = X[y == self.classes[0]]
        X2 = X[y == self.classes[1]]
        E1 = self.cov_matrix(X1)
        E2 = self.cov_matrix(X2)
        m1 = self.center(X1), 
        m2 = self.center(X2)
        alpha = np.linalg.pinv(np.add(E1,E2)).dot(np.subtract(m1,m2)[0])
        #normalize and return
        return alpha/np.linalg.norm(alpha)
        
    def proj(self,b):
        return np.dot(self.alpha.T,b)
        
    def cov_matrix(self,X):
        return np.cov(X.T,bias = True)
        """Returns Covariance matrix of matrix X; X should be of the same class"""
        N = X.shape[0]
        m = self.center(X)
        X -= m
        cov = 1/N * np.sum((X[...,None] * X[:,None]),axis=0) 
        #https://stackoverflow.com/questions/40413000/column-by-row-multiplication-in-numpy
        return cov
        
    def center(self,X):
        return np.mean(X,axis=0)
    
    def predict(self,X):
        classes_count = len(self.classes)
        preds = np.zeros((X.shape[0]))#,2))
        probs = np.zeros((X.shape[0],2))
        X = self.proj(X.T)
        self.X = X
        for i in range(X.shape[0]):
            prob1 = self.aprio1*self.norm_dist(self.var1,self.m1,X[i])
            prob2 = self.aprio2*self.norm_dist(self.var2,self.m2,X[i])
            #score[i] = X[i]#np.dot(self.alpha.T,X[i])
            preds[i] = np.argmax((prob1,prob2))
            probs[i] = (prob1,prob2)
        return preds,probs
    
    def norm_dist(self, var, m, x):
        divisor = 1 / np.sqrt(2*np.pi+var)
        exponent = -0.5 * (x - m)**2 / var
        return divisor * (np.exp(exponent))
            
                
        

In [766]:
cls = FisherClassifier()
cls.fit(X_train,y_train)

In [767]:
pred,probs = cls.predict(X_test)
        
print(1, 'vs.', 0, 
      'Accuracy:', round(cls.accuracy(y_test,pred),4), 
      'Error rate:', round(cls.error_rate(y_test,pred),4))
confused_matrix(y_test,pred,percentage=False)

1 vs. 0 Accuracy: 0.9011 Error rate: 0.0989


array([[ 509.,   29.],
       [  62.,  320.]])

In [772]:
X_test[y_test!=pred]

array([[   2.43 ,    0.   ,    0.   , ...,    3.666,   13.   ,   44.   ],
       [   0.   ,    0.   ,    0.   , ...,    2.376,   41.   ,  492.   ],
       [   0.   ,    0.   ,    0.   , ...,    2.3  ,    9.   ,   23.   ],
       ..., 
       [   0.   ,    0.95 ,    0.95 , ...,    4.18 ,   45.   ,  464.   ],
       [   1.17 ,    0.   ,    0.   , ...,    1.966,   10.   ,   59.   ],
       [   0.   ,    0.   ,    0.   , ...,    2.132,   22.   ,  113.   ]])

In [1]:
import matplotlib.ticker as ticker
x = list(np.arange(-2.5, 1.8, 0.01))
y1 = list(map(lambda x: cls.aprio1*cls.norm_dist(cls.var1,cls.m1,x), x))
y2 = list(map(lambda x: cls.aprio2*cls.norm_dist(cls.var2,cls.m2,x), x))
dims = (12, 6)
fig, ax = plt.subplots(figsize=dims)
g= sns.pointplot(x,y1,markers='',color='green')
g= sns.pointplot(x,y2,markers='',color='red')
ax.xaxis.set_major_locator(ticker.MultipleLocator(50))

NameError: name 'np' is not defined