# Projet MALAP

## Réalisé par Tong Zhao, Quentin Duchemin et Pierre Boyeau

In [79]:
import  numpy as np
import matplotlib.pyplot as plt
from tools import *

def load_usps(filename):
    with open(filename,"r") as f:
        f.readline()
        data =[ [float(x) for x in l.split()] for l in f if len(l.split())>2]
    tmp = np.array(data)
    return tmp[:,1:],tmp[:,0].astype(int)

def get_usps(l,datax,datay):
    """ l : liste des chiffres a extraire"""
    if type(l)!=list:
        resx = datax[datay==l,:]
        resy = datay[datay==l]
        return resx,resy
    tmp =  list( zip(*[get_usps(i,datax,datay) for i in l]))
    tmpx,tmpy = np.vstack(tmp[0]),np.hstack(tmp[1])
    idx = np.random.permutation(range(len(tmpy)))
    return tmpx[idx,:],tmpy[idx]

def show_usps(data):
    plt.imshow(data.reshape((16,16)),interpolation="nearest",cmap="gray")
    
xuspstrain,yuspstrain = load_usps("USPS_train.txt")
xuspstest,yuspstest = load_usps("USPS_test.txt")
x06train,y06train = get_usps([0,6],xuspstrain,yuspstrain)
x06test,y06test = get_usps([0,6],xuspstest,yuspstest)

## 1. Classification using simply threshold

In [91]:
class DiffusionTRESH:
    def __init__(self,percent_u):
        self.percent_u = percent_u    
    def predict(self,data):
        (n,m) = np.shape(data)
        U = int(self.percent_u * n)
        L = n-U
        set_labels = list(set(self.labels))
        y = np.zeros((L,len(set_labels)))
        label_to_ind = {}
        for j in range(len(set_labels)):
            label_to_ind[set_labels[j]] = j
        for i in range(L):
            y[i,label_to_ind[self.labels[i]]] = 1
        fu = np.dot(np.dot(np.linalg.inv(self.D[L:,L:]-self.W[L:,L:]),self.W[L:,:L]),y)
        prediction = np.ones(U)
        # application de la procédure : Threshold pour la classification
        for i in range(U):
            prediction[i] = set_labels[np.argmax(fu[i])]
        return prediction          
            
    def fit(self,data,labels):
        self.labels = labels
        self.W = np.ones((n,n))
        sigmas = np.array([5 for i in range(m)])
        for i in range(n):
            for j in range(i+1,n):
                poids = np.exp(-np.sum((data[i,:]-data[j,:])**2/sigmas**2))
                self.W[i,j] = poids
                self.W[j,i] = poids
        diago = np.sum(self.W,axis=1)
        self.D = np.diag(diago)
        
    def score(self,data,labels):
        return (self.predict(data)==labels).mean()


In [92]:
# Définition de la base d'apprentissage
xuspstrain,yuspstrain = load_usps("USPS_train.txt")
xuspstrain,yuspstrain = get_usps([0,8],xuspstrain,yuspstrain)
data = xuspstrain[:1800,:]
yuspstrain = yuspstrain[:1800]
(n,m) = np.shape(data)

# Pourcentage de la base d'apprentissage non étiquettée
percent_u = 50./100.
# nombre de données non étiquettées
U = int(percent_u * n)
# nombre de données étiquettées
L =  n-U

# étiquettes connues
labels = yuspstrain[:L]
# étiquettes à prédire
labpredire = yuspstrain[L:]


M = DiffusionTRESH(percent_u)
M.fit(data,labels)
print(M.score(data,labpredire))

0.98732718894



## 2. Incorporation of Class Prior : CMN with weights fixed by advance

In [93]:
class DiffusionCMN:
    def __init__(self,percent_u):
        self.percent_u = percent_u    
    def predict(self,data):
        (n,m) = np.shape(data)
        U = int(self.percent_u * n)
        L = n-U
        set_labels = list(set(self.labels))
        y = np.zeros((L,len(set_labels)))
        
        label_to_ind = {}
        for j in range(len(set_labels)):
            label_to_ind[set_labels[j]] = j
        for i in range(L):
            y[i,label_to_ind[self.labels[i]]] = 1
        fu = np.dot(np.dot(np.linalg.inv(self.D[L:,L:]-self.W[L:,L:]),self.W[L:,:L]),y)
        prediction = np.ones(U)
        # application de la procédure : Class Mass Normalization
        desirable_proportions = []
        for lab in set_labels:
            desirable_proportions.append(np.sum(self.labels == lab)/len(set_labels))
        for i in range(U):
            CMN = []
            for j in range(len(set_labels)):
                CMN.append(desirable_proportions[j] * fu[i,j] /np.sum(fu[i]))
            prediction[i] = set_labels[np.argmax(CMN)]
        return prediction          
            
    def fit(self,data,labels):
        self.labels = labels
        self.W = np.ones((n,n))
        sigmas = np.array([5 for i in range(m)])
        for i in range(n):
            for j in range(i+1,n):
                poids = np.exp(-np.sum((data[i,:]-data[j,:])**2/sigmas**2))
                self.W[i,j] = poids
                self.W[j,i] = poids
        diago = np.sum(self.W,axis=1)
        self.D = np.diag(diago)
        
    def score(self,data,labels):
        return (self.predict(data)==labels).mean()



In [94]:
# Définition de la base d'apprentissage
xuspstrain,yuspstrain = load_usps("USPS_train.txt")
xuspstrain,yuspstrain = get_usps([0,8],xuspstrain,yuspstrain)
data = xuspstrain[:1800,:]
yuspstrain = yuspstrain[:1800]
(n,m) = np.shape(data)

# Pourcentage de la base d'apprentissage non étiquettée
percent_u = 50./100.
# nombre de données non étiquettées
U = int(percent_u * n)
# nombre de données étiquettées
L =  n-U

# étiquettes connues
labels = yuspstrain[:L]
# étiquettes à prédire
labpredire = yuspstrain[L:]


M = DiffusionCMN(percent_u)
M.fit(data,labels)
print(M.score(data,labpredire))

0.966589861751
