# Projet MALAP

## Réalisé par Tong Zhao, Quentin Duchemin et Pierre Boyeau

In [45]:
import  numpy as np
import matplotlib.pyplot as plt
from tools import *
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from scipy import stats
from sklearn.svm import SVC

In [46]:
def load_usps(filename):
    with open(filename,"r") as f:
        f.readline()
        data =[ [float(x) for x in l.split()] for l in f if len(l.split())>2]
    tmp = np.array(data)
    return tmp[:,1:],tmp[:,0].astype(int)

def get_usps(l,datax,datay):
    """ l : liste des chiffres a extraire"""
    if type(l)!=list:
        resx = datax[datay==l,:]
        resy = datay[datay==l]
        return resx,resy
    tmp =  list( zip(*[get_usps(i,datax,datay) for i in l]))
    tmpx,tmpy = np.vstack(tmp[0]),np.hstack(tmp[1])
    idx = np.random.permutation(range(len(tmpy)))
    return tmpx[idx,:],tmpy[idx]

def show_usps(data):
    plt.imshow(data.reshape((16,16)),interpolation="nearest",cmap="gray")

## Data processing

In [47]:
# Définition de la base d'apprentissage
xuspstrain,yuspstrain = load_usps("datas/usps/USPS_train.txt")
xuspstrain,yuspstrain = get_usps([0, 1, 5, 8, 3],xuspstrain,yuspstrain)
data = xuspstrain[:1800,:]
yuspstrain = yuspstrain[:1800]
(n,m) = np.shape(data)

# shuffle data
idx = np.random.permutation(n)
data = data[idx]
yuspstrain = yuspstrain[idx]

# Pourcentage de la base d'apprentissage non étiquettée
percent_u = 50./100.
# nombre de données non étiquettées
U = int(percent_u * n)
# nombre de données étiquettées
L =  n-U

# étiquettes connues
labels = yuspstrain[:L]
# étiquettes à prédire
labpredire = yuspstrain[L:]

print("Load data...\n")
print("Data shape: %d * %d" % data.shape)
print("\nLabeled num: %d" % L)
print("\nUnlabeld num: %d" % U)

Load data...

Data shape: 1800 * 256

Labeled num: 900

Unlabeld num: 900


## Baseline 1 - K plus proche voisins 

In [48]:
class KNN:
    
    def __init__(self, k):
        self.k = k
        
    def fit(self, data, labels):
        self.N = data.shape[0]
        L = labels.shape[0]
        
        # calculate distance
        self.W = cosine_distances(data[L:], data[:L])
        self.labels = labels
        
    def predict(self):
        
        idx = np.argsort(self.W, axis = 1)[:, :self.k]
        bag = self.labels[idx]
        predict, _ = stats.mode(bag, axis = 1)
        
        return predict.reshape((-1))
    
    def score(self, labels):
        return (self.predict()==labels).mean()

In [49]:
M = KNN(10)
M.fit(data,labels)
print("The score of Diffusion by knn is %f" % M.score(labpredire))

The score of Diffusion by knn is 0.937778


## Baseline 2 - SVM

In [50]:
clf = SVC(C=1.5)
clf.fit(data[:U], labels)
print("The score of SVM is %f" % (clf.predict(data[U:]) == labpredire).mean())

The score of SVM is 0.964444


## 1. Classification using simply threshold

$$ f_u = (D_{uu} - W_{uu})^{-1} W_{ul} f_l$$

In [51]:
class DiffusionTRESH:
        
    def predict(self):
        
        # record label
        set_labels = np.unique(self.labels)

        y = (np.tile(self.labels.reshape((-1, 1)), (1, set_labels.shape[0])) == set_labels).astype(int)
        fu = np.dot(np.dot(np.linalg.inv(self.D[self.L:,self.L:] - self.W[self.L:,self.L:]),self.W[self.L:,:self.L]),y)
        prediction = set_labels[np.argmax(fu, axis = 1)]
        return prediction          
            
    def fit(self, data, labels):
        self.labels = labels
        self.L = labels.shape[0]
        sigmas = np.array([2.5 for i in range(m)])
        
        # calculate W
        data_n = data / sigmas
        self.W = np.exp(-euclidean_distances(data_n)**2)
        
        # calculate D
        diago = np.sum(self.W,axis=1)
        self.D = np.diag(diago)
        
        
    def score(self, labels):
        return (self.predict()==labels).mean()


In [52]:
M = DiffusionTRESH()
M.fit(data,labels)
print("The score of Diffusion by threshold is %f" % M.score(labpredire))

The score of Diffusion by threshold is 0.960000



## 2. Incorporation of Class Prior : CMN with weights fixed by advance

In [53]:
class DiffusionCMN:

    def predict(self):
        
        # record label
        set_labels = np.unique(self.labels)
        
        # calculate fu
        y = (np.tile(self.labels.reshape((-1, 1)), (1, set_labels.shape[0])) == set_labels).astype(int)
        desirable_proportions = y.sum(0) + 1
        fu = np.dot(np.dot(np.linalg.inv(self.D[self.L:,self.L:]-self.W[self.L:,self.L:]),self.W[self.L:,:self.L]),y)
        fu = fu * (desirable_proportions / fu.sum(0))

        prediction = set_labels[np.argmax(fu, axis = 1)]
        return prediction          
            
    def fit(self,data,labels):
        self.labels = labels
        self.L = labels.shape[0]
        sigmas = np.array([2.5 for i in range(m)])
        
        # calculate W
        data_n = data / sigmas
        self.W = np.exp(-euclidean_distances(data_n)**2)
        
        # calculate D
        diago = np.sum(self.W,axis=1)
        self.D = np.diag(diago)
        
    def score(self,labels):
        return (self.predict()==labels).mean()

In [54]:
M = DiffusionCMN()
M.fit(data,labels)
print("The score of Diffusion by class prior is %f" % M.score(labpredire))

The score of Diffusion by class prior is 0.962222


## 3. Incorporating External Classifiers - SVM

In [55]:
class DiffusionSVM:

    def predict(self):
        
        # record label
        set_labels = np.unique(self.labels)
        
        # calculate fu
        y = (np.tile(self.labels.reshape((-1, 1)), (1, set_labels.shape[0])) == set_labels).astype(int)
        fu = np.dot(np.dot(np.linalg.inv(self.D[self.L:,self.L:]-self.W[self.L:,self.L:]),self.W[self.L:,:self.L]),y)
        fu = fu * self.hu
        
        prediction = set_labels[np.argmax(fu, axis = 1)]
        return prediction          
            
    def fit(self,data,labels):
        self.labels = labels
        self.L = labels.shape[0]
        sigmas = np.array([2.5 for i in range(m)])
        
        # calculate W
        data_n = data / sigmas
        self.W = np.exp(-euclidean_distances(data_n)**2)
        
        # calculate D
        diago = np.sum(self.W,axis=1)
        self.D = np.diag(diago)
        
        # train svm
        clf = SVC(C=2, probability = True)
        clf.fit(data[:self.L], labels)
        self.hu = clf.predict_proba(data[L:])
        
    def score(self,labels):
        return (self.predict()==labels).mean()

In [56]:
M = DiffusionSVM()
M.fit(data,labels)
print("The score of Diffusion by class prior is %f" % M.score(labpredire))

The score of Diffusion by class prior is 0.970000


# 4. Learning W

In [64]:
class Diffusion_learn:
    def __init__(self, eps=1):
        self.eps = eps

        self.data = None
        self.U = None
        self.L = None
        self.W = None
        self.D = None
        self.labels = None
        self.Umat = None

    def oracle(self, sgm, compute_grad=True):
        """To be used, self.W, self.D and self.labels must be properly defined

        gradfu defined such as its elements are defined by
        a_{ij} = \frac{\part f(i)}{\part \sigma_j}
        
        
        
        A GERER : MULTIDIMENTIONNALITE"""
        data_sgm = self.data / sgm
        self.W = np.exp(-euclidean_distances(data_sgm) ** 2)

        # calculate D
        self.D = np.diag(np.sum(self.W, axis=1))

        # Computing estimation of H(sgm)
        set_labels = np.unique(self.labels)
        y = (np.tile(self.labels.reshape((-1, 1)), (1, set_labels.shape[0])) == set_labels).astype(int)

        P = np.dot(np.linalg.inv(self.D), self.W)
        fu = np.dot(np.dot(np.linalg.inv(np.eye(self.U) - P[self.L:, self.L:]), P[self.L:, :self.L]), y)

        if compute_grad:
            Psmooth = self.eps * self.Umat + (1 - self.eps) * P
            firstmat = np.linalg.inv(np.eye(self.U) - Psmooth[self.L:, self.L:])
            gradfu = np.zeros((self.U, self.d))
            for ind in range(self.d):
                # objective = Computing derivative of Psmooth
                Xid = np.tile(self.data[:, ind], (self.U + self.L, 1))  # Attention sens
                partw = 2 * (self.W * ((Xid - Xid.T) ** 2)) / (sgm[ind] ** 3)
                sum_w = np.tile(np.sum(self.W, axis=1), (self.U + self.L, 1)).T
                sum_partw = np.tile(np.sum(partw, axis=1), (self.U + self.L, 1)).T

                partPsmooth = (partw - P * sum_partw) / (sum_w)
                secondmat = np.dot(partPsmooth[self.L:, self.L:], fu) + np.dot(partPsmooth[self.L:, :self.L], y)

                # Computing value of gradfu[ind]
                gradfu[:, ind] = np.dot(firstmat, secondmat)

            return fu, gradfu

        else:
            return fu

    def H(self, sgm):
        fu = self.oracle(sgm, compute_grad=False)
        res = -np.multiply(fu, np.log(fu)) - np.multiply(1 - fu, np.log(1 - fu))
        return (1 / self.U) * np.sum(res)

    def gradH(self, sgm):
        fu, gradf = self.oracle(sgm)

        partial = np.divide(1 - fu, fu)
        # Je fais une boucle pour l'instant car peur de faire une erreur. A REMPLACER
        res = np.zeros((self.d, 1))
        for ind in range(self.d):
            res[ind] = (1 / self.U) * np.sum(np.multiply(partial, gradf[:, ind]))
        return res

    def fit(self, data, labels):
        self.labels = labels
        self.L = labels.shape[0]
        self.U = data.shape[0] - self.L
        self.d = data.shape[1]
        self.data = data.copy()

        # Computing Umat (necessary for learning W)
        self.Umat = (1 / (self.L + self.U)) * np.ones((self.L + self.U))

        pass

    def predict(self):
        """A FAIRE"""
        # Learning
        pass

In [65]:
np.shape(data)

(1800, 256)

In [66]:
np.shape(data)[0] - L

900