In [1]:
import numpy as np
from numpy.linalg import norm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

In [2]:
raw_dataset_file = "datasets/global_dataset.txt"
clean_dataset_file = "datasets/global_datasetStd.txt"

In [3]:
with open(clean_dataset_file, "r") as f:
    clean_dataset = f.read().splitlines()

In [4]:
from gensim.models import KeyedVectors

model = KeyedVectors.load("models/w2vec_model_d300_global_Std")
dim = 300

def encode(msg, model, dim):
    return np.mean([model[word] for word in msg if word in model] or [np.zeros(dim)], axis = 0)

In [5]:
%%time
#encodage de l'échantillon pour créer le jeu de données
dataset_encoded = np.array([encode(msg, model, dim) for msg in clean_dataset])

CPU times: user 55.4 s, sys: 592 ms, total: 56 s
Wall time: 56 s


In [6]:
dataset_train, dataset_test = train_test_split(dataset_encoded, test_size=0.2)

### Modèle

In [7]:
inputs = keras.Input(shape=(300,))
encode1 = keras.layers.Dense(128, activation="relu")(inputs)
encode2 = keras.layers.Dense(64, activation="relu")(encode1)
encoded = keras.layers.Dense(16)(encode2)
decode1 = keras.layers.Dense(64, activation="relu")(encoded)
decode2 = keras.layers.Dense(128, activation="relu")(decode1)
decoded = keras.layers.Dense(300, activation="tanh")(decode2)

encoder = keras.Model(inputs, encoded, name="encoder")
autoencoder = keras.Model(inputs, decoded, name="autoencoder")

2022-05-25 15:51:49.627014: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-25 15:51:49.658361: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-25 15:51:49.658822: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-25 15:51:49.659824: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [8]:
autoencoder.compile(optimizer='adam', loss='mse')

### Entraînement

In [9]:
savemodel_callback = tf.keras.callbacks.ModelCheckpoint(filepath="models/ae_Std_feature_extr", verbose=0, save_best_only=True)

In [10]:
autoencoder.fit(dataset_train, dataset_train,
                epochs=30,
                batch_size=256,
                shuffle=True,
                validation_data=(dataset_test, dataset_test),
                callbacks=[savemodel_callback])

2022-05-25 15:51:59.090193: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 690456000 exceeds 10% of free system memory.
2022-05-25 15:51:59.437078: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 690456000 exceeds 10% of free system memory.
2022-05-25 15:51:59.676229: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 690456000 exceeds 10% of free system memory.
2022-05-25 15:51:59.839476: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 690456000 exceeds 10% of free system memory.


Epoch 1/30

2022-05-25 15:52:07.501219: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: models/ae_Std_feature_extr/assets
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f5addc113a0>

In [11]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [12]:
n = 42
cosine_similarity(autoencoder.predict(dataset_encoded[n:n+1]), dataset_encoded[n])

array([0.99990944])

In [13]:
autoencoder.predict(dataset_encoded[n:n+1])[0,:10]

array([ 0.01277449,  0.1481286 , -0.00755385, -0.17522438, -0.19507536,
        0.00925588, -0.04351522,  0.20651558, -0.0510514 , -0.12311283],
      dtype=float32)

In [14]:
dataset_encoded[n,:10]

array([ 0.01438789,  0.14908712, -0.00646923, -0.17402576, -0.19464563,
        0.00539072, -0.04192283,  0.2035947 , -0.04943962, -0.12419792])