In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Dropout
from tensorflow.keras.models import Model
from experiment_baseplate import load_split_data, get_text_data

In [2]:
# Shared preprocessing settings; later cells read both names.
max_words = 10_000  # maximum number of words the tokenizer will use
max_len = 100  # maximum length of the input sequences

# Tokenizer for the texts, fitted on the corpus returned by get_text_data().
# NOTE(review): if get_text_data() also returns the validation/test texts, the
# vocabulary is being fitted on them as well — confirm against experiment_baseplate.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=get_text_data())

# word_index can be larger than max_words (the recorded run printed 72325).
word_index = tokenizer.word_index
print("Nombre de mots dans l'index : ", len(word_index))

Nombre de mots dans l'index :  72325


In [3]:
# Fetch the train / validation / test splits from the project helper.
X_train, y_train, X_validate, y_validate, X_test, y_test = load_split_data()

# Turn each split's texts into sequences with the fitted tokenizer.
seq_train, seq_validate, seq_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_validate, X_test)
)

# Pad/truncate every sequence to max_len; the X_* names are rebound from the
# texts to the padded arrays that the model cells consume.
X_train, X_validate, X_test = (
    pad_sequences(seqs, maxlen=max_len)
    for seqs in (seq_train, seq_validate, seq_test)
)

In [49]:
# Size of each embedding vector learned by the Embedding layer.
embedding_dim = 100

# Functional-API graph: input of length max_len -> embedding -> LSTM ->
# dropout -> 2-unit softmax output.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim)(input_layer)
lstm_layer = LSTM(units=64)(embedding_layer)
dropout_layer = Dropout(rate=0.5)(lstm_layer)
output_layer = Dense(units=2, activation='softmax')(dropout_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# NOTE(review): categorical_crossentropy with a 2-unit softmax presumably means
# the y_* arrays from load_split_data() are one-hot encoded — confirm.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 lstm_4 (LSTM)               (None, 64)                42240     
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 2)                 130       
                                                                 
Total params: 1,042,370
Trainable params: 1,042,370
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Train for two epochs, evaluating on the validation split after each one.
# Left as a bare expression so the returned History object is displayed.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_validate, y_validate),
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f99c87d3190>

In [51]:
# Evaluate on the test split; the model was compiled with a single 'accuracy'
# metric, so the result unpacks into (loss, accuracy).
loss, acc = model.evaluate(x=X_test, y=y_test)
print("Accuracy : ", acc)

Accuracy :  0.9048895835876465


In [67]:
# Quick manual check: run one hand-written text through the same
# tokenize -> pad pipeline used for the splits and display the model's output.
model.predict(
    pad_sequences(
        tokenizer.texts_to_sequences(["hello"]),
        maxlen=max_len,
    )
)



array([[0.90230155, 0.0976985 ]], dtype=float32)

In [64]:
# Save the model's weights under a path relative to the working directory.
model.save_weights(filepath='models/lstm_selfembed/checkpoint1')