We will use "IMDB movie review sentiment classification dataset"

Dataset Description: https://keras.io/api/datasets/imdb/

This is a dataset of 25,000 movie reviews from IMDB, tagged by sentiment (positive/negative). The reviews have been preprocessed and each review is coded as a list of (whole) word indexes. For convenience, words are indexed by their overall frequency in the dataset, so that, for example, the integer "3" encodes the 3rd most frequent word in the data.

In [1]:
#!pip install tensorflow

In [84]:
import numpy
import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.python.keras.layers.convolutional import Conv1D
from tensorflow.python.keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Flatten

numpy.random.seed(7)


In [None]:
db=imdb.load_data()

In [None]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [None]:
len(X_train)

25000

In [None]:
y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [None]:
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
print(X_train)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
print(X_test)

[[   0    0    0 ...   19  178   32]
 [   0    0    0 ...   16  145   95]
 [   0    0    0 ...    7  129  113]
 ...
 [   0    0    0 ...    4 3586    2]
 [   0    0    0 ...   12    9   23]
 [   0    0    0 ...  204  131    9]]
[[   0    0    0 ...   14    6  717]
 [   0    0    0 ...  125    4 3077]
 [  33    6   58 ...    9   57  975]
 ...
 [   0    0    0 ...   21  846    2]
 [   0    0    0 ... 2302    7  470]
 [   0    0    0 ...   34 2005 2643]]


In [None]:
X_train.shape


(25000, 500)

we will use the embedding layer which defines the first hidden layer of the network. it must specify 3 arguments:

input_dim: the size of the vocabulary in the text

output_dim: this is the size of the vector space in which each word will be immersed

input_legth: this is the size of the sequence, for example if your documents contain 100 words each then it is 100

In [None]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.build(input_shape=(None, max_review_length))  # Build the model
print(model.summary())

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=64)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 module_wrapper_3 (ModuleWra  (None, 500, 32)          160000    
 pper)                                                           
                                                                 
 module_wrapper_4 (ModuleWra  (None, 500, 32)          3104      
 pper)                                                           
                                                                 
 module_wrapper_5 (ModuleWra  (None, 250, 32)          0         
 pper)                                                           
                                                                 
 lstm_1 (LSTM)               (None, 100)               53200     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                      

<keras.callbacks.History at 0x2518bf5ad00>

In [None]:
# evaluation
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 88.46%


## simple example of the embedding layer

In [None]:
docs = ['Well done!',
		'Good work',
		'Great effort',
		'nice work',
		'Excellent!',
		'Weak',
		'Poor effort!',
		'not good',
		'poor work',
		'Could have done better.']

In [None]:
labels = [1,1,1,1,1,0,0,0,0,0]

In [None]:
vocab_size = 50

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

encoded_docs = [one_hot(d, vocab_size) for d in docs]

In [None]:
print(encoded_docs)

[[33, 49], [39, 34], [41, 43], [49, 34], [46], [21], [1, 43], [8, 39], [1, 34], [25, 8, 49, 2]]


In [None]:
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[33 49  0  0]
 [39 34  0  0]
 [41 43  0  0]
 [49 34  0  0]
 [46  0  0  0]
 [21  0  0  0]
 [ 1 43  0  0]
 [ 8 39  0  0]
 [ 1 34  0  0]
 [25  8 49  2]]


We are now ready to define our Embedding layer as part of our model.

The embedding has a vocabulary of 50 and an entry length of 4. We will choose a small embedding space of 8 dimensions.

The model is a simple binary classification model. It is important to note that the output of the Embedding layer will be 4 vectors of 8 dimensions each, one for each word. We flatten it (the flatten layer) into a 32-element vector to pass it to the Dense output layer. 

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Build the model
model.build(input_shape=(None, max_length))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summarize the model
print(model.summary())


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 module_wrapper_6 (ModuleWra  (None, 4, 8)             400       
 pper)                                                           
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
import numpy as np
labels = np.array(labels)
model.fit(padded_docs, labels, epochs=50, verbose=0)


<keras.callbacks.History at 0x251993876d0>

In [None]:
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.999998


## To Do: 

1. Try the same thing on Google reviews dataset ( the file is given in the lab directory)
2. try to change the embedding representation using Glove and Skipgram 

###### Importation des bibliothèques nécessaires

In [148]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM
from keras.layers import Dense





###### Chargez les données 

In [149]:
data = pd.read_csv('reviews.csv')
data.head()

Unnamed: 0,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId
0,Andrew Thomas,https://lh3.googleusercontent.com/a-/AOh14GiHd...,Update: After getting a response from the deve...,1,21,4.17.0.3,2020-04-05 22:25:57,"According to our TOS, and the term you have ag...",2020-04-05 15:10:24,most_relevant,com.anydo
1,Craig Haines,https://lh3.googleusercontent.com/-hoe0kwSJgPQ...,Used it for a fair amount of time without any ...,1,11,4.17.0.3,2020-04-04 13:40:01,It sounds like you logged in with a different ...,2020-04-05 15:11:35,most_relevant,com.anydo
2,steven adkins,https://lh3.googleusercontent.com/a-/AOh14GiXw...,Your app sucks now!!!!! Used to be good but no...,1,17,4.17.0.3,2020-04-01 16:18:13,This sounds odd! We are not aware of any issue...,2020-04-02 16:05:56,most_relevant,com.anydo
3,Lars Panzerbjørn,https://lh3.googleusercontent.com/a-/AOh14Gg-h...,"It seems OK, but very basic. Recurring tasks n...",1,192,4.17.0.2,2020-03-12 08:17:34,We do offer this option as part of the Advance...,2020-03-15 06:20:13,most_relevant,com.anydo
4,Scott Prewitt,https://lh3.googleusercontent.com/-K-X1-YsVd6U...,Absolutely worthless. This app runs a prohibit...,1,42,4.17.0.2,2020-03-14 17:41:01,We're sorry you feel this way! 90% of the app ...,2020-03-15 23:45:51,most_relevant,com.anydo


Nous avons choisi les colonnes "score" et "content" comme exemples pour les étiquettes de sentiment et les avis . 

In [150]:
docs = data['content'].tolist()
labels = data['score'].tolist()

1 = très négatif, 2 = négatif, 3 = neutre, 4 = positif, 5 = très positif.

In [151]:
X = data['content']
Y = data['score']
df = pd.DataFrame({'content': X, 'score': Y})
print(df)

                                                 content  score
0      Update: After getting a response from the deve...      1
1      Used it for a fair amount of time without any ...      1
2      Your app sucks now!!!!! Used to be good but no...      1
3      It seems OK, but very basic. Recurring tasks n...      1
4      Absolutely worthless. This app runs a prohibit...      1
...                                                  ...    ...
15741  I believe that this is by far the best app wit...      5
15742                       It sometimes crashes a lot!!      5
15743                         Works well for what I need      5
15744                                           Love it.      5
15745  Really amazing and helped me sooo much just i ...      5

[15746 rows x 2 columns]


une opération de transformation sur les valeurs de la liste Y.

Le but de cette transformation est de créer une nouvelle liste appelée labels qui représente une étiquette binaire pour chaque valeur de Y. Si la valeur dans Y est inférieure ou égale à 2, alors 0 est ajouté à labels, sinon 1 est ajouté.

In [152]:
labels = []
for value in Y.values:
    if value <=2 :
        labels.append(0)
    else :
        labels.append(1)
print(len(labels))

15746


In [137]:
X

0        Update: After getting a response from the deve...
1        Used it for a fair amount of time without any ...
2        Your app sucks now!!!!! Used to be good but no...
3        It seems OK, but very basic. Recurring tasks n...
4        Absolutely worthless. This app runs a prohibit...
                               ...                        
15741    I believe that this is by far the best app wit...
15742                         It sometimes crashes a lot!!
15743                           Works well for what I need
15744                                             Love it.
15745    Really amazing and helped me sooo much just i ...
Name: content, Length: 15746, dtype: object

 # Prétraitement des données textuelles.

In [138]:
# Préparer le tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)# Ajustement du tokenizer sur les textes X pour créer la représentation numérique des mots.
vocab_size = len(tokenizer.word_index) + 1  # Calcul de la taille du vocabulaire du tokenizer
print("Le vocab size : ",vocab_size)


encoded_docs = tokenizer.texts_to_sequences(X)
#print(encoded_docs)

max_length = max([len(seq) for seq in encoded_docs])  # Calcul de la longueur maximale parmi toutes les séquences encodées
print("Le max length:",max_length)



padded_docs1 = pad_sequences(encoded_docs, maxlen=max_length) # Rembourrage des séquences numériques pour avoir la même longueur
print(padded_docs1)


train_X, val_X, train_Y, val_Y = train_test_split(padded_docs1, labels, test_size=0.2, random_state=42)




Le vocab size :  11970
Le max length: 393
[[   0    0    0 ...    8   39  312]
 [   0    0    0 ...  151   40  546]
 [   0    0    0 ...    9   10 1591]
 ...
 [   0    0    0 ...   67    3   75]
 [   0    0    0 ...    0   51    4]
 [   0    0    0 ... 5449    9  390]]


In [139]:
model = Sequential()
model.add(Embedding(vocab_size, 150, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# Build the model
model.build(input_shape=(None, max_length))
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Summarize the model
print(model.summary())

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_25 (Embedding)    (None, 393, 150)          1795500   
                                                                 
 flatten_2 (Flatten)         (None, 58950)             0         
                                                                 
 dense_18 (Dense)            (None, 1)                 58951     
                                                                 
Total params: 1,854,451
Trainable params: 1,854,451
Non-trainable params: 0
_________________________________________________________________
None


In [142]:
# Now you can proceed with model training
model.fit(padded_docs1, Y, epochs=2, verbose=0)

<keras.callbacks.History at 0x7f725c651690>

In [54]:
loss, accuracy = model.evaluate(padded_docs1, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 97.161186


# Glove

(Global Vectors for Word Representation) :

GloVe est une méthode pour représenter les mots sous forme de vecteurs numériques appelés "embeddings".
Ces embeddings capturent les relations sémantiques entre les mots, ce qui permet de mesurer la similarité et de faire des opérations vectorielles sur les mots.

In [179]:
# Préparer le tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)# Ajustement du tokenizer sur les textes X pour créer la représentation numérique des mots.
vocab_size = len(tokenizer.word_index) + 1  # Calcul de la taille du vocabulaire du tokenizer
print("Le vocab size : ",vocab_size)


encoded_docs = tokenizer.texts_to_sequences(X)
#print(encoded_docs)

max_length = max([len(seq) for seq in encoded_docs])  # Calcul de la longueur maximale parmi toutes les séquences encodées
print("Le max length:",max_length)



padded_docs1 = pad_sequences(encoded_docs, maxlen=max_length) # Rembourrage des séquences numériques pour avoir la même longueur
print("Padded Docs:",padded_docs1)


train_X, val_X, train_Y, val_Y = train_test_split(padded_docs1, labels, test_size=0.2, random_state=42)
train_X = np.array(train_X)
train_Y = np.array(train_Y)
val_X = np.array(val_X)
val_Y = np.array(val_Y)

Le vocab size :  11970
Le max length: 393
Padded Docs: [[   0    0    0 ...    8   39  312]
 [   0    0    0 ...  151   40  546]
 [   0    0    0 ...    9   10 1591]
 ...
 [   0    0    0 ...   67    3   75]
 [   0    0    0 ...    0   51    4]
 [   0    0    0 ... 5449    9  390]]


In [186]:


with open('glove.6B.100d.txt', encoding='utf-8') as f:
    first_line = f.readline()
    embedding_dim = len(first_line.split()) - 1  # Détermination de la dimension de l'embedding à partir de la première ligne du fichier GloVe
print(embedding_dim)

embeddings_index = {}  # Initialisation du dictionnaire pour stocker les embeddings GloVe
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  # Extraction du mot à partir de la première valeur de chaque ligne
        coefficients = np.asarray(values[1:], dtype='float32')  # Conversion des coefficients de l'embedding en un tableau numpy de type float32
        embeddings_index[word] = coefficients  

# Création d'une matrice d'embedding
embedding_matrix = np.zeros((vocab_size, embedding_dim))  
for word, index in tokenizer.word_index.items():
    if index < vocab_size:
        embedding_vector = embeddings_index.get(word)  # Récupération du vecteur d'embedding correspondant au mot dans le dictionnaire
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector  # Mise à jour de la ligne correspondante dans la matrice d'embedding

model = Sequential() 
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))  # Ajout de la couche Embedding avec les embeddings pré-entrainés et non entraînables
model.add(LSTM(128)) 
model.add(Dense(1, activation='sigmoid'))  
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  
print(model.summary()) 
model.fit(train_X, train_Y, validation_data=(val_X, val_Y), epochs=10, batch_size=32)
loss, accuracy = model.evaluate(val_X, val_Y)
print("Accuracy:", accuracy)



100
Model: "sequential_46"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_42 (Embedding)    (None, 393, 100)          1197000   
                                                                 
 lstm_32 (LSTM)              (None, 128)               117248    
                                                                 
 dense_34 (Dense)            (None, 1)                 129       
                                                                 
Total params: 1,314,377
Trainable params: 117,377
Non-trainable params: 1,197,000
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.8749206066131592


# SkipGram

L'algorithme Skip-gram se concentre sur la prédiction du contexte à partir d'un mot cible. Plus précisément, il essaie de prédire les mots environnants (le contexte) d'un mot donné (le mot cible) dans un corpus de texte.

In [192]:
# Préparer le tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
print("Le vocab size : ", vocab_size)

# Diviser chaque liste de mots en une autre liste de mots
X = [[word for word in sentence] for sentence in X]

encoded_docs = tokenizer.texts_to_sequences(X)
max_length = max([len(seq) for seq in encoded_docs])
print("Le max length:", max_length)

padded_docs1 = pad_sequences(encoded_docs, maxlen=max_length)
print("Padded Docs:", padded_docs1)

train_X, val_X, train_Y, val_Y = train_test_split(padded_docs1, labels, test_size=0.2, random_state=42)
train_X = np.array(train_X)
train_Y = np.array(train_Y)
val_X = np.array(val_X)
val_Y = np.array(val_Y)

# Diviser chaque liste de mots en une autre liste de mots
model = Word2Vec(sentences=X, sg=1, window=5, vector_size=100, epochs=10)

embedding_dim = model.vector_size

# Création de la matrice d'embedding
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    if index < vocab_size:
        if word in model.wv:
            embedding_vector = model.wv[word]
            embedding_matrix[index] = embedding_vector

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(train_X, train_Y, validation_data=(val_X, val_Y), epochs=10, batch_size=32)
loss, accuracy = model.evaluate(val_X, val_Y)
print("Accuracy:", accuracy)


Le vocab size :  22226
Le max length: 391
Padded Docs: [[    0     0     0 ...     7    32  1233]
 [    0     0     0 ...   147    35  3923]
 [    0     0     0 ...     9    10  8076]
 ...
 [    0     0     0 ...    64     3    75]
 [    0     0     0 ...     0    48    74]
 [    0     0     0 ... 15693     9   501]]
Model: "sequential_51"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_47 (Embedding)    (None, 391, 100)          2222600   
                                                                 
 lstm_37 (LSTM)              (None, 128)               117248    
                                                                 
 dense_39 (Dense)            (None, 1)                 129       
                                                                 
Total params: 2,339,977
Trainable params: 117,377
Non-trainable params: 2,222,600
________________________________________________