In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Dropout
from tensorflow.keras.models import Model
from experiment_baseplate import load_split_data, get_text_data
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors



In [2]:
max_words = 10000 # Nombre maximum de mots à utiliser dans le tokenizer
max_len = 100 # Longueur maximale des séquences d'entrée

# Tokenizer pour les textes
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(get_text_data())
word_index = tokenizer.word_index
print("Nombre de mots dans l'index : ", len(word_index))

Nombre de mots dans l'index :  72629


In [3]:

# Prepare Embeddings
def load_glove_model(File):
    glove_model = {}
    print("Loading Glove Model")
    with open(File,'r',encoding="utf8") as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

glove_vectors = load_glove_model('pretrained/glove/glove.twitter.27B.200d.txt')

Loading Glove Model
1193514 words loaded!


In [4]:
def get_sentence_embedding(sentence):
    sentence_embedding = []
    for word in simple_preprocess(sentence):
        if word in glove_vectors:
            sentence_embedding.append(glove_vectors[word])
    if len(sentence_embedding) > 0:
        return np.mean(sentence_embedding, axis=0)
    else:
        return np.zeros(200)

In [5]:
max_words = 10000 # Nombre maximum de mots à utiliser dans le tokenizer
max_len = 100 # Longueur maximale des séquences d'entrée
embedding_dim  = 200
# Load our data
X_train, y_train, X_validate, y_validate, X_test, y_test = load_split_data()




# Vectorize our sentences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_validate = tokenizer.texts_to_sequences(X_validate)

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
X_validate = pad_sequences(X_validate, maxlen=max_len)




In [6]:
vocab_len = len(word_index) +1
emb_matrix = np.zeros((vocab_len,embedding_dim))

for word, index in word_index.items():
    if word in glove_vectors:
        embedding_vector = glove_vectors[word]
        emb_matrix[index, :] = embedding_vector
    

In [7]:

input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_dim, input_length=max_len, weights = [emb_matrix], trainable=False)(input_layer)
lstm_layer = LSTM(64)(embedding_layer)
dropout_layer = Dropout(0.5)(lstm_layer)
output_layer = Dense(2, activation='softmax')(dropout_layer)
model = Model(inputs=input_layer, outputs=output_layer)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 200)          14526000  
                                                                 
 lstm (LSTM)                 (None, 64)                67840     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 14,593,970
Trainable params: 67,970
Non-trainable params: 14,526,000
_________________________________________________________________


In [8]:
model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_validate, y_validate))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2b1b244bd00>

In [9]:
loss, acc = model.evaluate(X_test, y_test)
print("Accuracy : ", acc)

Accuracy :  0.8870525360107422


In [25]:
model.predict(pad_sequences(tokenizer.texts_to_sequences(["hello"]), maxlen=max_len))



array([[0.90541935, 0.09458064]], dtype=float32)