# TP6 : Reconnaissance d'une diagonale dans carré 2x2

### Chargement des librairies

In [11]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

Chargement des données

In [70]:
# Ensemble des données non linéairement séparable:
# Lecture des données
data=pd.read_csv("diag2x2.csv",sep=",")
print(data)

    X1  X2  X3  X4  Y
0    0   0   0   0  0
1    0   0   0   1  0
2    0   0   1   0  0
3    0   0   1   1  0
4    0   1   0   0  0
5    0   1   0   1  0
6    0   1   1   0  1
7    0   1   1   1  0
8    1   0   0   0  0
9    1   0   0   1  1
10   1   0   1   0  0
11   1   0   1   1  0
12   1   1   0   0  0
13   1   1   0   1  0
14   1   1   1   0  0
15   1   1   1   1  0


In [71]:
# Création d'un dataframe
DF = pd.DataFrame(data, columns = ['X1','X2','X3','X4','Y'])
# Affichage des 5 premières données
DF.head()

Unnamed: 0,X1,X2,X3,X4,Y
0,0,0,0,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,1,1,0
4,0,1,0,0,0


In [79]:
# Définition des entrées X et de la sortie désirée Y
Xdata=DF.drop(['Y'],axis=1) # Xdata = toutes les colonnes sauf la colonne Y
Ydata=DF['Y'] # Ydata = la colonne Y
n=len(Xdata) # Nombre de ligne
d=len(Xdata.iloc[1]) # Nombre de colonnes de X

### Réseau à 1 couche cachée de 2 neurones
On tente d'approximer $X\mapsto Y$ par la fonction
$$X=(x_1,x_2,x_3,x_4)\mapsto \sigma(b+w_1 \sigma(h_1(X))+w_2\sigma(h_2(X)))$$ avec $$h_1(X)=b_1+w_{11}x_1+w_{21}x_2+w_{31}x_3+w_{41}x_4$$
et $$h_2(X)=b_2+w_{12}x_1+w_{22}x_2+w_{32}x_3+w_{42}x_4$$

En particulier, $w_{k\ell}$ est le poids situé de $x_\ell$ vers $\sigma(h_k)$

In [82]:
Xdata.iloc[1]

X1    0
X2    0
X3    0
X4    1
Name: 1, dtype: int64

In [116]:
# Réseau
def sigma(x):
    return 1/(1+np.exp(-x))
def R(W,W1,W2,j):
    b,w1,w2=W[0],W[1],W[2]
    b1,w11,w12,w13,w14=W1[0],W1[1],W1[2],W1[3],W1[4]
    b2,w21,w22,w23,w24=W2[0],W2[1],W2[2],W2[3],W2[4]
    Xj=Xdata.iloc[j]
    h1=b1+np.dot([w11,w12,w13,w14],Xj)
    h2=b2+np.dot([w21,w22,w23,w24],Xj)
    H=b+np.dot([w1,w2],[sigma(h1),sigma(h2)])
    return sigma(H)

In [432]:
# Gradient par rétropropagation
def gradE(W,W1,W2,j):
    b,w1,w2=W[0],W[1],W[2]
    b1,w11,w12,w13,w14=W1[0],W1[1],W1[2],W1[3],W1[4]
    b2,w21,w22,w23,w24=W2[0],W2[1],W2[2],W2[3],W2[4]
    Xj=Xdata.iloc[j]
    Yj=Ydata.iloc[j]
    h1=b1+w11*Xj[0]+w12*Xj[1]+w13*Xj[2]+w14*Xj[3]
    h2=b2+w21*Xj[0]+w22*Xj[1]+w23*Xj[2]+w24*Xj[3]    
    H=b+w1*sigma(h1)+w2*sigma(h2)
    sH=sigma(H)
    sh1=sigma(h1)
    sh2=sigma(h2)
    dsH=sH*(1-sH)
    dsh1=sh1*(1-sh1)
    dsh2=sh2*(1-sh2)
    # Calcul des dérivées partielles couche par couche
    # E=0.5(sigma(H)-Yj)^2
    dEdsH=sH-Yj # d(E)/d(sigma(H))=(sigma(H)-Yj)*1
    dsHdb=dsH*1            # d(sigma(H))/db=sigma'(H)*1
    dsHdw1=dsH*sh1           # d(sigma(H))/dw1=sigma'(H)*sigma(h1)
    dsHdw2=dsH*sh2           # d(sigma(H))/dw2=sigma'(H)*sigma(h2)
    dsHdsh1=dsH*w1           # d(sigma(H))/d(sigma(h1))=sigma'(H)*w1
    dsHdsh2=dsH*w2           # d(sigma(H))/d(sigma(h2))=sigma'(H)*w2
    dsh1db1=dsh1*1          # d(sigma(h1))/db1=sigma'(h1)*1
    dsh1dw11=dsh1*Xj[0]        # d(sigma(h1))/dw11=sigma'(h1)*x1
    dsh1dw12=dsh1*Xj[1]        # d(sigma(h1))/dw12=sigma'(h1)*x2
    dsh1dw13=dsh1*Xj[2]        # d(sigma(h1))/dw13=sigma'(h1)*x3
    dsh1dw14=dsh1*Xj[3]        # d(sigma(h1))/dw14=sigma'(h1)*x4
    dsh2db2=dsh2*1          # d(sigma(h2))/db2=sigma'(h2)*1
    dsh2dw21=dsh2*Xj[0]        # d(sigma(h2))/dw21=sigma'(h2)*x1
    dsh2dw22=dsh2*Xj[1]        # d(sigma(h2))/dw22=sigma'(h2)*x2
    dsh2dw23=dsh2*Xj[2]        # d(sigma(h2))/dw23=sigma'(h2)*x3
    dsh2dw24=dsh2*Xj[3]        # d(sigma(h2))/dw24=sigma'(h2)*x4
    dEdb=dEdsH*dsHdb
    dEdw1=dEdsH*dsHdw1
    dEdw2=dEdsH*dsHdw2
    dEdb1=dEdsH*dsHdsh1*dsh1db1
    dEdw11=dEdsH*dsHdsh1*dsh1dw11
    dEdw12=dEdsH*dsHdsh1*dsh1dw12
    dEdw13=dEdsH*dsHdsh1*dsh1dw13
    dEdw14=dEdsH*dsHdsh1*dsh1dw14
    dEdb2=dEdsH*dsHdsh2*dsh2db2
    dEdw21=dEdsH*dsHdsh2*dsh2dw21
    dEdw22=dEdsH*dsHdsh2*dsh2dw22
    dEdw23=dEdsH*dsHdsh2*dsh2dw23
    dEdw24=dEdsH*dsHdsh2*dsh2dw24
    return [[dEdb,dEdw1,dEdw2],[dEdb1,dEdw11,dEdw12,dEdw13,dEdw14],[dEdb2,dEdw21,dEdw22,dEdw23,dEdw24]]

In [271]:
print(R([1,0,0],[1,0,0,0,0],[1,0,0,0,0],5))
print(gradE([1,1,1],[1,1,1,1,1],[1,1,1,1,1],15))

0.7310585786300049
[[0.04353051773376169, 0.04323917446791269, 0.04323917446791269], [0.0002773488044401406, 0.0002893933488028782, 0.0002893933488028782, 0.0002893933488028782, 0.0002893933488028782], [0.0002893933488028782, 0.0002893933488028782, 0.0002893933488028782, 0.0002893933488028782, 0.0002893933488028782]]


In [501]:
def descente(W,W1,W2,tau=0.01,tolerance=1e-2,nbiterations=1000):
    diverge=False
    for i in range(nbiterations):
        g=[[0,0,0],[0,0,0,0,0],[0,0,0,0,0]]
        batch=np.random.randint(0, 16, 5) # 5 entiers aléatoires entre 0 et 15
        for j in batch: # j parcourt 5 données aléatoires
            gg=gradE(W,W1,W2,j)
            g = [np.add(g[0],gg[0]),np.add(g[1],gg[1]),np.add(g[2],gg[2])]
        try: # traitement des erreurs si l'algorithme diverge
            ng=np.sqrt(np.sum([np.sum([gi**2 for gi in g[k]]) for k in range(2)]))/len(batch)
            if ng<tolerance:
                print('L\'algorithme a convergé en',i,'itérations. \nSolution atteinte :\n W=',W,'\n W1=',W1,'\n W2=',W2,'\nGradient :',g)
                return [W,W1,W2]
            W=[W[k]-tau*g[0][k] for k in range(3)]
            W1=[W1[k]-tau*g[1][k] for k in range(5)]
            W2=[W2[k]-tau*g[2][k] for k in range(5)]
        except OverflowError as err: # traitement de l'erreur "overflow"
            print('L\'algorithme a divergé \nSolution atteinte :\n W=',W,'\n W1=',W1,'\n W2=',W2,'\nGradient :',g)
            diverge=True
            break
    if (diverge==False):
        print('L\'algorithme n\'a pas convergé \nSolution atteinte :\n W=',W,'\n W1=',W1,'\n W2=',W2,'\nGradient :',g,'\n Norme :',ng)
    return [W,W1,W2]

In [441]:
myW=[[1,1,1],[1,1,1,1,1],[1,1,1,1,1]]

In [475]:
np.random.randint(0, 16, 5)

array([15, 15,  8,  1, 10])

In [None]:
myW=[[1,1,1],[1,1,1,1,1],[1,1,1,1,1]]

In [506]:
myW=descente(myW[0],myW[1],myW[2],0.01,0.0001,10000)

L'algorithme n'a pas convergé 
Solution atteinte :
 W= [-8.46517392992448, 3.47022918504678, 3.47022918504678] 
 W1= [-5.344774733540449, 5.874995956316497, 5.891632845233985, 5.8910944599047745, 5.890279764120708] 
 W2= [-5.344774733540449, 5.874995956316497, 5.891632845233985, 5.8910944599047745, 5.890279764120708] 
Gradient : [array([0.10531566, 0.10517169, 0.10517169]), array([3.69485609e-04, 1.23453748e-06, 1.44529320e-04, 2.25367504e-04,
       1.44529877e-04]), array([3.69485609e-04, 1.23453748e-06, 1.44529320e-04, 2.25367504e-04,
       1.44529877e-04])] 
 Norme : 0.03644929956977157


In [507]:
for j in range(n):
    print(Ydata.iloc[j],R(myW[0],myW[1],myW[2],j))

0 0.00021769359271817977
0 0.016770443615708133
0 0.016792112451707926
0 0.17714442849890388
0 0.01680644464651939
0 0.17714529795351788
1 0.17714661274958873
0 0.1787636698034009
0 0.01636814702678062
1 0.1771182153708943
0 0.17711955199207383
0 0.1787635941657336
0 0.177120434696596
0 0.1787635966331923
0 0.17876360036447617
0 0.17876816587350325
