# Projet Element Logiciel

In [17]:
import numpy as np
import pandas as pd
import random as rd
import copy
import time

## Data preparation

In [18]:
# Start with a small text: the first line of a file extracted from a Wikipedia article.
# The context manager guarantees the file handle is closed, and readline()
# returns "" (instead of raising IndexError) when the file is empty.
# NOTE(review): no explicit encoding is given, so the platform default is used — confirm
# the encoding of data/TheBeatles.txt before pinning one.
with open("data/TheBeatles.txt", "r") as text_file:
    text = text_file.readline()

# Preprocessing: strip punctuation characters in a single pass.
not_alphabet = "'?./§,;:!»«()…-"
text = text.translate(str.maketrans("", "", not_alphabet))

# Tokenise on single spaces and wrap the tokens in a Series for the encoder below.
text = text.split(" ")
text_serie = pd.Series(text)
text_serie.head()

0     Après
1     avoir
2    débuté
3      sous
4        le
dtype: object

In [19]:
# One-hot encoding: fit a LabelBinarizer on the tokens and turn every token of the
# text into an indicator row (one row per token of text_serie).
from sklearn.preprocessing import LabelBinarizer
text_lb = LabelBinarizer()
X_hot = text_lb.fit_transform(text_serie.values)

In [20]:
# Display the one-hot matrix (NumPy truncates the repr for large arrays).
X_hot

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [21]:
np.shape(X_hot)
# The i-th word of the text is word j  =>  X_hot[i, j] = 1
# X_hot has shape n x V
# n : number of words in the text
# V : number of distinct words in the text

(1300, 645)

In [22]:
# Collapse each one-hot row to the column index of its 1, i.e. the integer id of the word.
# The vectorised argmax replaces a Python-level call per row (np.apply_along_axis).
X = X_hot.argmax(axis=1)

In [23]:
X
# The text is easier to handle in this form:
# X[i] is the identifier of the i-th word of the text

array([ 40, 171, 263, ..., 596,   0,   0], dtype=int64)

## Hogwild implementation

### in Python

In [24]:
# We start by writing the code without any parallelisation.
def sig(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    Parameters
    ----------
    x : scalar or array-like of numbers.

    Returns
    -------
    NumPy float scalar for scalar input, otherwise an ndarray of the same shape.
    """
    x = np.asarray(x, dtype=float)
    # exp is only ever taken of a non-positive number, so it cannot overflow:
    #   x >= 0 : 1 / (1 + exp(-x))
    #   x <  0 : exp(x) / (1 + exp(x))   (algebraically the same value)
    e = np.exp(-np.abs(x))
    # [()] turns a 0-d result back into a scalar and leaves real arrays untouched.
    return np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))[()]

def hog_loop(Min,Mout,alpha,wout,Nwin,negative):
    """Run one negative-sampling update for a target word and its context window.

    Parameters
    ----------
    Min : (V, d) array, input-side embeddings; rows are updated in place.
    Mout : (V, d) array, output-side embeddings; rows are updated in place.
    alpha : learning rate.
    wout : id (0..V-1) of the word that receives label 1.
    Nwin : 1-D array of context word ids (0..V-1).
    negative : number of random words drawn per context word with label 0.

    Returns
    -------
    (Min, Mout) : the same two arrays, after the update.
    """
    vocab_size, dim = Min.shape
    for pos in range(Nwin.shape[0]):
        input_word = Nwin[pos]
        # Gradient accumulated for the input row over all of its targets.
        grad_in = np.zeros(dim)
        for step in range(negative + 1):
            # Step 0 is the true word; every later step draws a random word.
            is_positive = (step == 0)
            target_word = wout if is_positive else rd.randint(0, vocab_size - 1)
            label = 1 if is_positive else 0
            score = np.dot(Min[input_word], Mout[target_word])
            err = label - sig(score)
            # Accumulate with the output row as it was before its own update.
            grad_in = grad_in + err * Mout[target_word]
            Mout[target_word] = Mout[target_word] + alpha * err * Min[input_word]
        Min[input_word] = Min[input_word] + alpha * grad_in
    return Min, Mout
    

In [25]:
def train_word2vec(text,params):
    """Train the two embedding matrices sequentially with hog_loop.

    Parameters
    ----------
    text : 1-D sequence of integer word ids (0..V-1), in reading order.
    params : dict with the keys
        'N'        number of context words taken on each side of the target,
        'n_epochs' number of passes over the text,
        'alpha'    learning rate,
        'negative' number of negative samples per context word,
        'd'        embedding dimension.

    Returns
    -------
    (Min, Mout) : two (V, d) arrays, where V = max(text) + 1.
    """
    N = params['N']
    n_epochs = params['n_epochs']
    alpha = params['alpha']
    negative = params['negative']
    d = params['d']

    # Work on the corpus that was passed in; the previous version silently
    # read the notebook-global X and ignored this argument.
    words = np.asarray(text)
    V = np.max(words)+1
    n_words = np.shape(words)[0]

    # Uniform [0, 1) initialisation drawn from the `random` module.
    Min = np.array([rd.random() for _ in range(V*d)]).reshape((V,d))
    Mout = np.array([rd.random() for _ in range(V*d)]).reshape((V,d))

    # Relative positions of the context words: N before and N after the target.
    offsets = np.concatenate((np.arange(-N, 0), np.arange(1, N+1)))

    for _ in range(n_epochs):

        # Visit every position that has a full window, in a fresh random order.
        epochs_order = np.arange(N, n_words-N)
        np.random.shuffle(epochs_order)

        for j in epochs_order:
            wout = words[j]
            Nwin = words[j + offsets]
            Min, Mout = hog_loop(Min,Mout,alpha,wout,Nwin,negative)

    return Min,Mout

In [26]:
# Hyper-parameters read by the training functions:
#   N        context words taken on each side of the target
#   n_epochs passes over the text
#   alpha    learning rate
#   negative negative samples per context word
#   d        embedding dimension
params = dict(
    N=2,
    n_epochs=100,
    alpha=10 ** (-3),
    negative=4,
    d=10,
)


In [27]:
# Time the sequential training run and show the learned input embeddings.
start = time.time()
Min, Mout = train_word2vec(text=X, params=params)
t = time.time() - start
message = "temps pour effectuer %d epochs : %s" % (params['n_epochs'], t)
print(message)
print(Min)

temps pour effectuer 100 epochs : 55.22755193710327
[[-1.00512081 -0.04997956 -0.95965587 ... -0.99662608 -1.19826614
  -1.31668949]
 [-0.4440905   0.3948632  -0.11329952 ... -0.45706359 -0.13268179
  -0.38565743]
 [-0.20827107  0.11476191 -0.47287415 ... -0.31884309  0.17539378
   0.28877732]
 ...
 [ 0.47178256 -0.20667278  0.55916136 ... -0.18989756  0.32222691
   0.1966601 ]
 [-0.0050664   0.40778788 -0.18933195 ... -0.17228613  0.0366665
   0.47896505]
 [-0.74047766 -0.22102198  0.17123655 ... -0.03146855 -0.50424612
  -0.66055687]]


### in Pycuda

In [28]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

RMQ Dupre : 

regarder 
// ++i i++
   for(auto it: temp)
       *it = 0;
    
for(float* it = temp; it != temp + d; ++it)
      *it = 0;
      
RMQ : peut-être pas la peine de passer random en entier à chaque loop

Faut-il plus de random ?? -> r = (1664525*r+1013904223) % 4294967296

malloc --> free ? OUI : simplement free(variable);

In [29]:
mod = SourceModule("""

#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <curand.h>
#include <math.h>

__device__ void loop(float *Min, float *Mout, float alpha, int *Nwin, int wout, int V, int d, int N, int negative, int random){
    
    /*
    HOGWILD LOOP

    One iteration of Algorithm 1 of the paper: updates Min and Mout for one
    output word wout and its associated context.

    float *Min   : first-layer weights, initialised outside pycuda, flat V*d array (float*, not float**)
    float *Mout  : second-layer weights, initialised outside pycuda, flat V*d array (float*, not float**)
    float alpha  : learning rate
    int *Nwin    : the 2*N context word ids of the output word
    int wout     : output word id
    int V        : vocabulary size (number of distinct words)
    int d        : dimension of the embedding space
    int N        : context size on each side (2*N context words in total)
    int negative : number of negative samples per context word
    int random   : seed of the pseudo-random generator used to draw the negative samples
    */

    /* Scratch buffer: gradient accumulated for the current input row */
    float *temp = (float *)malloc(sizeof(float)*d);
    if (temp == NULL){
        /* Device heap exhausted: skip this update instead of writing through NULL */
        return;
    }
    int label;
    int target_word;
    float inn;
    float err;
    /* Generator state, seeded from the caller (it used to be left uninitialised).
       Unsigned so that the arithmetic wraps modulo 2^32 and r%V can never be negative. */
    unsigned int r = (unsigned int)random;
    
    /* Main loop over the 2*N context words */
    for(int i=0;i<2*N;i++){
        int input_word = Nwin[i];
        for(int j=0;j<d;j++){
            temp[j]=0;
        }
        for(int k=0; k<negative+1;k++){
            if (k==0){
                /* positive example: the true output word */
                target_word = wout;
                label = 1;
            } else {
                /* negative sampling: linear congruential step, then reduce to a word id */
                r = 1664525u*r + 1013904223u;
                target_word = (int)(r % (unsigned int)V);
                label = 0;
            }
            inn = 0;
            for(int j=0;j<d;j++){
                inn = inn + Min[input_word*d+j]*Mout[target_word*d+j];
            }
            /* error = label - sigmoid(inn), evaluated in single precision */
            err = label-(1.0f/(1.0f+expf(-inn)));
            /* accumulate with the output row as it is before its own update */
            for(int j=0;j<d;j++){
                temp[j] = temp[j]+err*Mout[target_word*d+j];
            }
            for(int j=0;j<d;j++){
                Mout[target_word*d+j] = Mout[target_word*d+j]+alpha*err*Min[input_word*d+j];
            }  
        }
        for(int j=0;j<d;j++){
            Min[input_word*d+j] = Min[input_word*d+j]+alpha*temp[j]; 
        }
    }
    
    free(temp);
}


__global__ void parallel(float *Min, float* Mout, int *random, int* cst_int, float* cst_float, int* targets, int* contexts) {

    /*
    Parallelisation: each thread processes one output word and its context.

    float *Min       : initialised Min
    float *Mout      : initialised Mout
    int *random      : device buffer holding one random integer generated on the host
                       (the host passes a device allocation, so this must be a pointer)
    int *cst_int     : integer constants, in the order V, d, negative, N
    float *cst_float : float constants (alpha)
    int *targets     : the thread works on wout = targets[index]
    int *contexts    : the context of that word starts at contexts[index*2*N]
    */

    /* integer constants passed from the Python code */
    int V = cst_int[0];
    int d = cst_int[1];
    int negative = cst_int[2];
    int N = cst_int[3];

    /* float constant passed from Python */
    float alpha = cst_float[0];
    
    /* the thread index simply selects the output word to work on */
    int index = threadIdx.x; 
    int wout = targets[index];
    
    /* The 2*N context ids of this word are contiguous in contexts and loop() only
       reads them, so point into the buffer instead of copying into a malloc'ed array. */
    int *Nwin = contexts + index*2*N;
    
    /* one seed per thread, derived from the host value without modifying the shared buffer */
    int seed = random[0] + index;
    
    loop(Min,Mout,alpha,Nwin,wout,V,d,N,negative,seed);
}
""")

In [30]:
def train_word2vec_parallel(text, params, func, batch_size=1000):
    """Train the two embedding matrices on the GPU, one thread per target position.

    Parameters
    ----------
    text : 1-D sequence of integer word ids (0..V-1), in reading order.
    params : dict with the keys 'N', 'n_epochs', 'alpha', 'negative', 'd'.
    func : compiled kernel, called as
        func(Min, Mout, random, cst_int, cst_float, targets, contexts, block=(n, 1, 1))
        with device buffers for every argument.
    batch_size : number of positions (= threads of the single block) per launch.
        NOTE(review): must not exceed the device's max threads per block — confirm for the target GPU.

    Returns
    -------
    (Min, Mout) : two float32 (V, d) arrays, where V = max(text) + 1.

    Raises
    ------
    ValueError : if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    N = params['N']
    n_epochs = params['n_epochs']
    alpha = params['alpha']
    negative = params['negative']
    d = params['d']

    # Work on the corpus that was passed in; the previous version silently
    # read the notebook-global X and ignored this argument.
    words = np.asarray(text)
    V = int(np.max(words))+1
    n_words = int(words.shape[0])

    # Constants are shipped to the kernel as small typed arrays.
    cst_int = np.array([V,d,negative,N]).astype(np.int32)
    cst_float = np.array([alpha]).astype(np.float32)

    Min = np.array([rd.random() for _ in range(V*d)]).reshape((V,d)).astype(np.float32)
    Mout = np.array([rd.random() for _ in range(V*d)]).reshape((V,d)).astype(np.float32)

    # Positions that have a full window of N words on each side.
    n_positions = max(n_words - 2*N, 0)
    if n_positions == 0:
        # Nothing to train on: return the initial matrices without touching the GPU.
        return Min,Mout

    # Relative positions of the context words: N before and N after the target.
    offsets = np.concatenate((np.arange(-N, 0), np.arange(1, N+1)))

    int32_size = np.dtype(np.int32).itemsize
    max_batch = min(int(batch_size), n_positions)

    # Allocate every device buffer once instead of once per batch.
    buffers = []
    try:
        Min_gpu = cuda.mem_alloc(Min.nbytes); buffers.append(Min_gpu)
        Mout_gpu = cuda.mem_alloc(Mout.nbytes); buffers.append(Mout_gpu)
        r_gpu = cuda.mem_alloc(int32_size); buffers.append(r_gpu)
        cst_int_gpu = cuda.mem_alloc(cst_int.nbytes); buffers.append(cst_int_gpu)
        cst_float_gpu = cuda.mem_alloc(cst_float.nbytes); buffers.append(cst_float_gpu)
        targets_gpu = cuda.mem_alloc(max_batch*int32_size); buffers.append(targets_gpu)
        contexts_gpu = cuda.mem_alloc(max_batch*2*N*int32_size); buffers.append(contexts_gpu)

        # The matrices stay on the device for the whole training run.
        cuda.memcpy_htod(Min_gpu, Min)
        cuda.memcpy_htod(Mout_gpu, Mout)
        cuda.memcpy_htod(cst_int_gpu, cst_int)
        cuda.memcpy_htod(cst_float_gpu, cst_float)

        for _ in range(n_epochs):

            epochs_order = np.arange(N, n_words-N)
            np.random.shuffle(epochs_order)

            # Stepping over the positions themselves never yields an empty batch
            # (a zero-thread block is an invalid launch).
            for start in range(0, n_positions, batch_size):
                batch = epochs_order[start:start+batch_size]
                n_threads = int(batch.shape[0])

                # targets[k] is the word at position batch[k]; contexts[k] its 2*N neighbours.
                targets = np.ascontiguousarray(words[batch], dtype=np.int32)
                contexts = np.ascontiguousarray(words[batch[:, None] + offsets], dtype=np.int32)

                # Fresh host-side random integer for the kernel's negative sampling.
                r = np.array([rd.randint(1,1000000)]).astype(np.int32)

                cuda.memcpy_htod(r_gpu, r)
                cuda.memcpy_htod(contexts_gpu, contexts)
                cuda.memcpy_htod(targets_gpu, targets)

                func(Min_gpu, Mout_gpu, r_gpu, cst_int_gpu, cst_float_gpu, targets_gpu, contexts_gpu, block=(n_threads,1,1))

        cuda.memcpy_dtoh(Min, Min_gpu)
        cuda.memcpy_dtoh(Mout, Mout_gpu)
    finally:
        # Release device memory explicitly rather than waiting for garbage collection.
        for buf in buffers:
            buf.free()

    return Min,Mout
    

### Tests

In [31]:
# Fetch the compiled "parallel" kernel from the first CUDA module.
func = mod.get_function("parallel")

In [32]:
# Time the GPU training run and show the learned input embeddings.
t0 = time.time()
Min, Mout = train_word2vec_parallel(text=X, params=params, func=func)
dt = time.time() - t0
header = "temps d'execution de %d epochs" % params["n_epochs"]
print(header)
print(dt)
print(Min)

temps d'execution de 100 epochs
1.3676567077636719
[[ 1.0032915e+00  9.6302933e-01  1.1017778e+00 ...  1.0325381e+00
   4.6229160e-01 -4.9877691e-01]
 [ 1.3147280e-01  9.8225397e-01  1.9095585e-01 ...  5.9697044e-01
   8.7365127e-01 -4.6755811e-03]
 [ 4.8149750e-01  7.8118622e-01  4.8062629e-01 ...  3.6831555e-01
   6.5155959e-01  1.5294552e-01]
 ...
 [ 3.0932373e-01  4.8600227e-01  7.4486464e-01 ...  9.8725528e-02
   6.3812017e-01  5.5493981e-01]
 [ 2.6830742e-01  1.1516989e-01  6.2688410e-01 ...  5.8861768e-01
   6.8537962e-01  3.7184832e-01]
 [ 9.0043718e-01  6.2832558e-01  1.6175668e-01 ...  3.6975247e-01
   8.0342275e-01 -1.4484519e-04]]


In [None]:
"""10    : 0.00421"""
"""100   : 0.00507"""
"""1000  : 0.00400"""
"""see https://stackoverflow.com/questions/9985912/how-do-i-choose-grid-and-block-dimensions-for-cuda-kernels"""
"""https://github.com/IntelLabs/pWord2Vec"""

## Paper amelioration

In [45]:
mod_2 = SourceModule("""


#include <stdlib.h>
#include <stdio.h>
#include <curand.h>
#include <math.h>


__device__ void multiplication(float* A, float* B, float* C, int* dim){

    /*
    Row-major matrix product C = A * B.

    dim[0], dim[1] : rows and columns of A
    dim[2], dim[3] : rows and columns of B
    C must hold dim[0]*dim[3] floats. If the inner dimensions differ the
    function returns without touching C.
    */
    const int rowsA = dim[0];
    const int colsA = dim[1];
    const int rowsB = dim[2];
    const int colsB = dim[3];

    if (colsA != rowsB){
        return;
    }

    /* First pass: clear the whole output */
    const int n_out = rowsA*colsB;
    for(int idx=0; idx<n_out; idx++){
        C[idx] = 0;
    }

    /* Second pass: accumulate the dot product of row `row` of A and column `col` of B */
    for(int row=0; row<rowsA; row++){
        for(int col=0; col<colsB; col++){
            float* cell = &C[row*colsB+col];
            for(int k=0; k<colsA; k++){
                *cell = *cell + A[row*colsA+k]*B[k*colsB+col];
            }
        }
    }
}


__device__ void transpose(float* A, float* TA, int* dim){

    /*
    Writes the transpose of the row-major matrix A into TA.

    dim[0], dim[1] : rows and columns of A (TA then has dim[1] rows and dim[0] columns)
    */
    const int rowsA = dim[0];
    const int colsA = dim[1];
    const int total = rowsA*colsA;

    /* Walk TA in storage order; element `out` sits at row out/rowsA, column out%rowsA of TA */
    for(int out=0; out<total; out++){
        const int t_row = out / rowsA;
        const int t_col = out % rowsA;
        TA[out] = A[t_col*colsA+t_row];
    }
}


__device__ void loop(float *Min, float *Mout, float alpha, int *Nwin, int wout, int V, int d, int N, int negative, int r){

    /*
    Batched update for one output word: the 2*N context rows of Min and the
    (negative+1) target rows of Mout are gathered into small matrices, and the
    scores and both gradients are computed as matrix products. All context
    words share the same target words (wout followed by the negative samples).

    float *Min, *Mout : flat row-major V*d weight arrays, updated in place
    float alpha       : learning rate
    int *Nwin         : the 2*N context word ids
    int wout          : output word id (label 1)
    int V, d          : vocabulary size and embedding dimension
    int N             : context size on each side
    int negative      : number of negative samples
    int r             : seed of the pseudo-random generator used for the negative samples
    */

    const int n_in = 2*N;          /* rows gathered from Min  */
    const int n_out = negative+1;  /* rows gathered from Mout */

    /* Every float buffer is sized with sizeof(float) */
    int *target_words = (int *)malloc(sizeof(int)*n_out);
    float *sub_Min = (float *)malloc(sizeof(float)*n_in*d);
    float *sub_Mout = (float *)malloc(sizeof(float)*n_out*d);
    float *sub_Mout_transpose = (float *)malloc(sizeof(float)*d*n_out);
    float *INN = (float *)malloc(sizeof(float)*n_in*n_out);
    float *ERR = (float *)malloc(sizeof(float)*n_in*n_out);
    float *TEMP = (float *)malloc(sizeof(float)*n_in*d);
    float *ERR_transpose_alpha = (float *)malloc(sizeof(float)*n_out*n_in);
    float *delta_Mout = (float *)malloc(sizeof(float)*n_out*d);

    /* If the device heap is exhausted, skip the update instead of writing through NULL */
    if (target_words != NULL && sub_Min != NULL && sub_Mout != NULL &&
        sub_Mout_transpose != NULL && INN != NULL && ERR != NULL &&
        TEMP != NULL && ERR_transpose_alpha != NULL && delta_Mout != NULL){

        /* Target words: wout first, then the negative samples.
           The generator state is unsigned so that it wraps modulo 2^32 and
           state%V can never be a negative row index. */
        unsigned int state = (unsigned int)r;
        for(int i=0; i<n_out; i++){
            if (i==0){
                target_words[i] = wout;
            } else {
                state = 1664525u*state + 1013904223u;
                target_words[i] = (int)(state % (unsigned int)V);
            }
        }

        /* sub_Min: the context rows of Min */
        for (int i=0; i<n_in; i++){
            for (int j=0; j<d; j++){
                sub_Min[i*d+j] = Min[Nwin[i]*d+j];
            }
        }

        /* sub_Mout: the target rows of Mout */
        for (int i=0; i<n_out; i++){
            for (int j=0; j<d; j++){
                sub_Mout[i*d+j] = Mout[target_words[i]*d+j];
            }
        }

        /* sub_Mout_transpose (d x n_out) */
        int dim_sub_Mout[2];
        dim_sub_Mout[0] = n_out;
        dim_sub_Mout[1] = d;
        transpose(sub_Mout, sub_Mout_transpose, dim_sub_Mout);

        /* INN = sub_Min * sub_Mout^T : score of every (context, target) pair */
        int dim[4];
        dim[0] = n_in;
        dim[1] = d;
        dim[2] = d;
        dim[3] = n_out;
        multiplication(sub_Min, sub_Mout_transpose, INN, dim);

        /* ERR = label - sigmoid(INN); the label is 1 for column 0 (wout) and 0 for the negatives.
           The sigmoid was missing: the raw score was subtracted from the label. */
        for (int i=0; i<n_in; i++){
            for (int j=0; j<n_out; j++){
                float label = (j==0) ? 1.0f : 0.0f;
                ERR[i*n_out+j] = label - 1.0f/(1.0f+expf(-INN[i*n_out+j]));
            }
        }

        /* TEMP = ERR * sub_Mout : gradient for the context rows of Min */
        dim[0] = n_in;
        dim[1] = n_out;
        dim[2] = n_out;
        dim[3] = d;
        multiplication(ERR, sub_Mout, TEMP, dim);

        /* delta_Mout = alpha * ERR^T * sub_Min : step for the target rows of Mout */
        for (int i=0; i<n_out; i++){
            for (int j=0; j<n_in; j++){
                ERR_transpose_alpha[i*n_in+j] = alpha*ERR[j*n_out+i];
            }
        }
        dim[0] = n_out;
        dim[1] = n_in;
        dim[2] = n_in;
        dim[3] = d;
        multiplication(ERR_transpose_alpha, sub_Min, delta_Mout, dim);

        /* Update Mout: add the step to the rows (they used to be overwritten by the step) */
        for (int i=0; i<n_out; i++){
            for (int j=0; j<d; j++){
                Mout[target_words[i]*d+j] = Mout[target_words[i]*d+j] + delta_Mout[i*d+j];
            }
        }

        /* Update Min: add the step to the rows (they used to be overwritten by the step) */
        for (int i=0; i<n_in; i++){
            for (int j=0; j<d; j++){
                Min[Nwin[i]*d+j] = Min[Nwin[i]*d+j] + alpha*TEMP[i*d+j];
            }
        }
    }

    /* free() ignores NULL pointers, so every buffer can be released unconditionally */
    free(target_words);
    free(sub_Min);
    free(sub_Mout);
    free(sub_Mout_transpose);
    free(INN);
    free(ERR);
    free(TEMP);
    free(ERR_transpose_alpha);
    free(delta_Mout);

}



__global__ void parallel(float *Min, float* Mout, int *random, int* cst_int, float* cst_float, int* targets, int* contexts) {

    /*
    Each thread processes one output word and its context.

    float *Min, *Mout : weight matrices
    int *random       : device buffer holding one random integer generated on the host
                        (the host passes a device allocation, so this must be a pointer)
    int *cst_int      : integer constants, in the order V, d, negative, N
    float *cst_float  : float constants (alpha)
    int *targets      : the thread works on wout = targets[index]
    int *contexts     : the context of that word starts at contexts[index*2*N]
    */

    /* integer constants passed from the Python code */
    int V = cst_int[0];
    int d = cst_int[1];
    int negative = cst_int[2];
    int N = cst_int[3];

    /* float constant passed from Python */
    float alpha = cst_float[0];
    
    /* the thread index simply selects the output word to work on */
    int index = threadIdx.x; 
    int wout = targets[index];
    
    /* The 2*N context ids of this word are contiguous in contexts and loop() only reads
       them, so point into the buffer. The previous malloc'ed copy was never freed and
       leaked device heap memory on every launch. */
    int *Nwin = contexts + index*2*N;
    
    /* one seed per thread, derived from the host value without modifying the shared buffer */
    int seed = random[0] + index;
    
    loop(Min,Mout,alpha,Nwin,wout,V,d,N,negative,seed);
    
}

""")

kernel.cu



In [None]:
# Test of the GPU matrix product against NumPy.
# `multiplication` in the kernel source is a __device__ function: it cannot be fetched
# with get_function, which only returns __global__ kernels, and it is defined in mod_2,
# not in mod. The same algorithm is therefore wrapped in a small __global__ kernel here.
# Keep this copy in sync with the device function it mirrors.
mult_test_mod = SourceModule("""
__global__ void multiplication_kernel(float* A, float* B, float* C, int* dim){

    int lA = dim[0];
    int cA = dim[1];
    int lB = dim[2];
    int cB = dim[3];

    if (cA!=lB){
        return ;
    }

    for(int i=0; i<lA ; i++){
        for(int j=0; j<cB ; j++){
            C[i*cB+j] = 0;
        }
    }

    for(int i=0; i<lA ; i++){
        for(int j=0; j<cB ; j++){
            for(int k=0 ; k<cA ; k++){
                C[i*cB+j] = C[i*cB+j] + A[i*cA+k]*B[k*cB+j];
            }
        }
    }
}
""")

A = np.array([[1,2],[3,4],[5,6]]).astype(np.float32)
B = np.array([[1,2,3],[3,2,1]]).astype(np.float32)
# C starts with non-zero entries so the test also checks that the kernel clears its output.
C = np.array([[1,0,0],[1,0,0],[1,0,0]]).astype(np.float32)
dim = np.array([np.shape(A)[0],
      np.shape(A)[1],
      np.shape(B)[0],
      np.shape(B)[1]]).astype(np.int32)

A_gpu = cuda.mem_alloc(A.nbytes)
B_gpu = cuda.mem_alloc(B.nbytes)
C_gpu = cuda.mem_alloc(C.nbytes)
dim_gpu = cuda.mem_alloc(dim.nbytes)

cuda.memcpy_htod(A_gpu, A)
cuda.memcpy_htod(B_gpu, B)
cuda.memcpy_htod(C_gpu, C)
cuda.memcpy_htod(dim_gpu, dim)

# A dedicated name avoids overwriting the `func` kernel handle used by the training cells.
mult_func = mult_test_mod.get_function("multiplication_kernel")

# The kernel is sequential, so a single thread is enough.
mult_func(A_gpu, B_gpu, C_gpu, dim_gpu, block=(1,1,1))

cuda.memcpy_dtoh(C,C_gpu)

# The GPU result must match the NumPy product.
np.testing.assert_allclose(C, A.dot(B))
print(C)