# Projet MALAP

## Réalisé par Tong Zhao, Quentin Duchemin et Pierre Boyeau

In [57]:
import  numpy as np
import matplotlib.pyplot as plt
from tools import *
from sklearn.metrics.pairwise import cosine_distances
from scipy import stats

In [47]:
def load_usps(filename):
    with open(filename,"r") as f:
        f.readline()
        data =[ [float(x) for x in l.split()] for l in f if len(l.split())>2]
    tmp = np.array(data)
    return tmp[:,1:],tmp[:,0].astype(int)

def get_usps(l,datax,datay):
    """ l : liste des chiffres a extraire"""
    if type(l)!=list:
        resx = datax[datay==l,:]
        resy = datay[datay==l]
        return resx,resy
    tmp =  list( zip(*[get_usps(i,datax,datay) for i in l]))
    tmpx,tmpy = np.vstack(tmp[0]),np.hstack(tmp[1])
    idx = np.random.permutation(range(len(tmpy)))
    return tmpx[idx,:],tmpy[idx]

def show_usps(data):
    plt.imshow(data.reshape((16,16)),interpolation="nearest",cmap="gray")

## Data processing

In [161]:
# Définition de la base d'apprentissage
xuspstrain,yuspstrain = load_usps("datas/usps/USPS_train.txt")
xuspstrain,yuspstrain = get_usps([8,3],xuspstrain,yuspstrain)
data = xuspstrain[:1800,:]
yuspstrain = yuspstrain[:1800]
(n,m) = np.shape(data)

# shuffle data
idx = np.random.permutation(n)
data = data[idx]
yuspstrain = yuspstrain[idx]

# Pourcentage de la base d'apprentissage non étiquettée
percent_u = 50./100.
# nombre de données non étiquettées
U = int(percent_u * n)
# nombre de données étiquettées
L =  n-U

# étiquettes connues
labels = yuspstrain[:L]
# étiquettes à prédire
labpredire = yuspstrain[L:]

print("Load data...\n")
print("Data shape: %d * %d" % data.shape)
print("\nLabeled num: %d" % L)
print("\nUnlabeld num: %d" % U)

Load data...

Data shape: 1200 * 256

Labeled num: 600

Unlabeld num: 600


## Baseline - K plus proche voisins 

In [153]:
class KNN:
    
    def __init__(self, k):
        self.k = k
        
    def fit(self, data, labels):
        self.N = data.shape[0]
        L = labels.shape[0]
        
        # calculate distance
        self.W = cosine_distances(data[L:], data[:L])
        self.labels = labels
        
    def predict(self):
        
        idx = np.argsort(self.W, axis = 1)[:, :self.k]
        bag = self.labels[idx]
        predict, _ = stats.mode(bag, axis = 1)
        
        return predict
    
    def score(self, labels):
        return (self.predict()==labels.reshape((-1, 1))).mean()

In [157]:
M = KNN(10)
M.fit(data,labels)
print("The score of Diffusion by knn is %f" % M.score(labpredire))

The score of Diffusion by knn is 0.960000


## 1. Classification using simply threshold

In [158]:
class DiffusionTRESH:
    def __init__(self,percent_u):
        self.percent_u = percent_u
        
    def predict(self, data):
        n, m = data.shape
        U = int(self.percent_u * n)
        L = n-U
        set_labels = np.unique(self.labels)
        y = np.zeros((L,len(set_labels)))
        label_to_ind = {}
        for j in range(len(set_labels)):
            label_to_ind[set_labels[j]] = j
        for i in range(L):
            y[i,label_to_ind[self.labels[i]]] = 1
        fu = np.dot(np.dot(np.linalg.inv(self.D[L:,L:]-self.W[L:,L:]),self.W[L:,:L]),y)
        prediction = np.ones(U)
        # application de la procédure : Threshold pour la classification
        for i in range(U):
            prediction[i] = set_labels[np.argmax(fu[i])]
        return prediction          
            
    def fit(self, data, labels):
        self.labels = labels
        n, m = data.shape
        self.W = np.ones((n,n))
        sigmas = np.array([5 for i in range(m)])
        for i in range(n):
            for j in range(i+1,n):
                poids = np.exp(-np.sum((data[i,:]-data[j,:])**2/sigmas**2))
                self.W[i,j] = poids
                self.W[j,i] = poids
        diago = np.sum(self.W,axis=1)
        self.D = np.diag(diago)
        
    def score(self,data,labels):
        return (self.predict(data)==labels).mean()


In [159]:
M = DiffusionTRESH(percent_u)
M.fit(data,labels)
print("The score of Diffusion by threshold is %f" % M.score(data,labpredire))

The score of Diffusion by threshold is 0.941667



## 2. Incorporation of Class Prior : CMN with weights fixed by advance

In [55]:
class DiffusionCMN:
    def __init__(self,percent_u):
        self.percent_u = percent_u
        
    def predict(self,data):
        (n,m) = np.shape(data)
        U = int(self.percent_u * n)
        L = n-U
        set_labels = np.unique(self.labels)
        y = np.zeros((L,len(set_labels)))
        
        label_to_ind = {}
        for j in range(len(set_labels)):
            label_to_ind[set_labels[j]] = j
        for i in range(L):
            y[i,label_to_ind[self.labels[i]]] = 1
        fu = np.dot(np.dot(np.linalg.inv(self.D[L:,L:]-self.W[L:,L:]),self.W[L:,:L]),y)
        prediction = np.ones(U)
        # application de la procédure : Class Mass Normalization
        desirable_proportions = []
        for lab in set_labels:
            desirable_proportions.append(np.sum(self.labels == lab)/ float(len(labels)))
        for i in range(U):
            CMN = []
            for j in range(len(set_labels)):
                CMN.append(desirable_proportions[j] * fu[i,j] / np.sum(fu[i]))
            prediction[i] = set_labels[np.argmax(CMN)]
        return prediction          
            
    def fit(self,data,labels):
        (n,m) = np.shape(data)
        self.labels = labels
        self.W = np.ones((n,n))
        sigmas = np.array([5 for i in range(m)])
        for i in range(n):
            for j in range(i+1,n):
                poids = np.exp(-np.sum((data[i,:]-data[j,:])**2/sigmas**2))
                self.W[i,j] = poids
                self.W[j,i] = poids
        diago = np.sum(self.W,axis=1)
        self.D = np.diag(diago)
        
    def score(self,data,labels):
        return (self.predict(data)==labels).mean()

In [156]:
M = DiffusionCMN(percent_u)
M.fit(data,labels)
print("The score of Diffusion by class prior is %f" % M.score(data,labpredire))

The score of Diffusion by class prior is 0.938333
