In [16]:
import keras
import tensorflow as tf
import utils_TP4 as utils
import solution_TP4 as solution

# Utilisation de Dataset (en gardant les anciens modèle et TextVectorizationLayer)

### Récupération et affichage d'une instance dans le corpus pour vérification

In [2]:
ds_train, ds_valid = keras.utils.text_dataset_from_directory(
    "Corpus",
    seed=42,
    validation_split=0.3,
    subset='both')

Found 3498 files belonging to 7 classes.
Using 2449 files for training.
Using 1049 files for validation.


In [3]:
un_elem = ds_train.unbatch().take(1).get_single_element()
un_elem   # equivalent of tst_ds


(<tf.Tensor: shape=(), dtype=string, numpy=b"   Les Combattants est un film fran\xc3\xa7ais r\xc3\xa9alis\xc3\xa9 par Thomas Cailley , sorti\n   le 20 ao\xc3\xbbt 2014 .\n   R\xc3\xa9alisation Sc\xc3\xa9nario Acteurs principaux Pays de production Genre Dur\xc3\xa9e\n   Sortie\n\n   Les Combattants est un film fran\xc3\xa7ais r\xc3\xa9alis\xc3\xa9 par Thomas Cailley , sorti\n   le 20 ao\xc3\xbbt 2014 .\n   R\xc3\xa9alisation Sc\xc3\xa9nario Acteurs principaux Pays de production Genre Dur\xc3\xa9e\n   Sortie\n\n   Les Combattants d'Afrique est une exposition temporaire qui s'est\n   d\xc3\xa9roul\xc3\xa9e du 15 juin 2010 au 31 octobre 2010 au Centre national\n   Jean-Moulin de Bordeaux . Elle s'inscrit dans le cadre de la\n   comm\xc3\xa9moration du 70 ^e anniversaire de l' Appel du 18 Juin 1940 par le\n   g\xc3\xa9n\xc3\xa9ral de Gaulle , et du Cinquantenaire des Ind\xc3\xa9pendances et des\n   actions engag\xc3\xa9es par la municipalit\xc3\xa9 de Bordeaux en direction des\n   anciens c

### Vectorisation du corpus

On `adapt()` le text_vectorizer en laissant de côté les `y` avec la fonction lambda

In [4]:
tv = solution.get_text_vectorizer_from_config(solution.ExpeConfig("whitespace",None,1000))

In [5]:
tv.adapt(ds_train.map(lambda x,y: x))

#### Vérification des structures de données obtenues (on vérifie les types et les shapes)

In [6]:
ds_train.map(lambda x,y: (tv(x),y)).take(1).get_single_element()

(<tf.Tensor: shape=(32, 1000), dtype=float32, numpy=
 array([[143.,  20.,  13., ...,   0.,   0.,   0.],
        [  7.,   2.,   0., ...,   0.,   0.,   0.],
        [111.,  12.,   8., ...,   0.,   0.,   0.],
        ...,
        [ 48.,  10.,   2., ...,   0.,   0.,   0.],
        [ 21.,   6.,   3., ...,   0.,   0.,   0.],
        [132.,  12.,   0., ...,   0.,   0.,   0.]], dtype=float32)>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([4, 6, 4, 1, 5, 0, 0, 2, 1, 6, 1, 6, 6, 6, 6, 1, 0, 6, 2, 5, 1, 3,
        6, 2, 4, 0, 2, 5, 5, 5, 6, 6], dtype=int32)>)

### Création et entraînement du modèle
(et fonction de preprocessing qui peut remplacer la lambda)

In [7]:
model = utils.PerceptronModelSparseCategorical(tv, list(range(7)))

In [8]:
def preproc(x,y):
    return tv(x),y

In [9]:
model.fit(ds_train.map(preproc), validation_data=ds_valid.map(preproc), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x747357e2c0d0>

In [10]:
tv.vocabulary_size

<bound method TextVectorization.vocabulary_size of <keras.src.layers.preprocessing.text_vectorization.TextVectorization object at 0x7473db8f5890>>

In [11]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tokens_count (InputLayer)   [(None, 1000)]            0         
                                                                 
 normalizer (Normalization)  (None, 1000)              2001      
                                                                 
 hidden (Dense)              (None, 14)                14014     
                                                                 
 sortie (Dense)              (None, 7)                 105       
                                                                 
Total params: 16120 (62.97 KB)
Trainable params: 14119 (55.15 KB)
Non-trainable params: 2001 (7.82 KB)
_________________________________________________________________


# Utilisation de plongements (Embeddings)

In [12]:
tv_int = text_vectorizer = keras.layers.TextVectorization(
    max_tokens=3000, # taille du vocabulaire conservé
    output_sequence_length=100, # taille des séquences (tronquées ou en ajoutant du padding)
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int") # changement : "int" au lieu de "count" pour un encodage un token -> un entier

In [13]:
tv_int.adapt(ds_train.map(lambda x,y:x))

In [14]:
one_x = ds_train.unbatch().map(lambda x,y:x).map(tv_int).take(1).get_single_element()
one_x

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([   5, 1953,   65,    2,  122,    6,    8,   65,    1,   72,  120,
          5,   78,    6,    9,  219,  131,    4,    1,   92, 1905,    2,
          1,  906,    3,  228,    1,   39,    3, 1899,    2,    1, 2077,
          3,  237,  335,    5,   21,    2, 1953,   65,    2,  122,   12,
        444,   11,   65,   63,    1,    9,    3,    1,    2,    3, 1899,
          2,    1,    7,    9,    1,    2,    1,   85,  877,   48,  202,
        750,  503, 1930,   78,   16,   62,  107,   36,   40,    1, 2950,
         12,    1,   11, 1953,   65,    2,  122,   63,   12, 1324,    1,
          2,  511,    1,   10,    5, 1953,   65,    2,  122,   73,  106,
         85])>

### On peut vérifier qu'on est capable de réencoder un document pour voir si tout se passe comme prévu

In [15]:
vocab = tv_int.get_vocabulary()

In [16]:
[vocab[i] for i in one_x]

['le',
 'yacht',
 'club',
 'de',
 'france',
 'est',
 'un',
 'club',
 '[UNK]',
 'français',
 'dont',
 'le',
 'siège',
 'est',
 'à',
 'paris',
 'créée',
 'en',
 '[UNK]',
 'sous',
 'légide',
 'de',
 '[UNK]',
 'iii',
 'la',
 'société',
 '[UNK]',
 'pour',
 'la',
 'navigation',
 'de',
 '[UNK]',
 'prend',
 'la',
 'même',
 'année',
 'le',
 'titre',
 'de',
 'yacht',
 'club',
 'de',
 'france',
 'les',
 'activités',
 'du',
 'club',
 'sont',
 '[UNK]',
 'à',
 'la',
 '[UNK]',
 'de',
 'la',
 'navigation',
 'de',
 '[UNK]',
 'et',
 'à',
 '[UNK]',
 'de',
 '[UNK]',
 'fondation',
 'sigle',
 'type',
 'forme',
 'juridique',
 'domaine',
 'dactivité',
 'siège',
 'pays',
 'coordonnées',
 'président',
 'site',
 'web',
 '[UNK]',
 'siren',
 'les',
 '[UNK]',
 'du',
 'yacht',
 'club',
 'de',
 'france',
 'sont',
 'les',
 'différentes',
 '[UNK]',
 'de',
 'course',
 '[UNK]',
 'par',
 'le',
 'yacht',
 'club',
 'de',
 'france',
 'depuis',
 'sa',
 'fondation']

# Une couche d'embeddings

In [17]:
embeddings = keras.layers.Embedding(
    tv_int.vocabulary_size(),
    3, # longueur des vecteurs
    mask_zero=True # important si padding
)
embeddings(one_x)

<tf.Tensor: shape=(100, 3), dtype=float32, numpy=
array([[ 0.02338589, -0.02350946,  0.04454089],
       [-0.03283953,  0.03530599, -0.03119474],
       [ 0.03748092, -0.03771096,  0.04293822],
       [ 0.00127006,  0.02444062,  0.03954462],
       [-0.0345609 ,  0.02477014,  0.02694254],
       [-0.03433607, -0.00805996,  0.02791429],
       [ 0.02642215,  0.02735001,  0.04771307],
       [ 0.03748092, -0.03771096,  0.04293822],
       [-0.03285612,  0.02369666,  0.04971344],
       [-0.04193065, -0.04257564,  0.01240958],
       [-0.02615769, -0.02718338, -0.02671988],
       [ 0.02338589, -0.02350946,  0.04454089],
       [ 0.03798262,  0.01415141, -0.03752277],
       [-0.03433607, -0.00805996,  0.02791429],
       [-0.04183261, -0.03248701,  0.02310926],
       [-0.01814044,  0.0447191 , -0.0272627 ],
       [ 0.04439044, -0.00499889, -0.00600236],
       [ 0.02035601, -0.04105066,  0.00040861],
       [-0.03285612,  0.02369666,  0.04971344],
       [ 0.01475232, -0.04128289, -0.0

# Un modèle avec une couche d'embeddings

In [18]:
def build_model(tv, emb_dim, nb_classes):
    inputs = keras.layers.Input(shape=(100,))
    embeddings = keras.layers.Embedding(
        tv.vocabulary_size(),
        emb_dim,
        mask_zero=True,
        name="emb"
    )(inputs)
    embeddings = keras.layers.Dropout(rate=0.2)(embeddings)
    pooling = keras.layers.GlobalMaxPooling1D()(embeddings)
    classif = keras.layers.Dense(nb_classes, activation="softmax", use_bias=True)(pooling)
    model = keras.Model(inputs=inputs, outputs=classif)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=["accuracy"])
    return model
    

In [19]:
model = build_model(tv_int, 300, 7)

In [20]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 emb (Embedding)             (None, 100, 300)          900000    
                                                                 
 dropout (Dropout)           (None, 100, 300)          0         
                                                                 
 global_max_pooling1d (Glob  (None, 300)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 7)                 2107      
                                                                 
Total params: 902107 (3.44 MB)
Trainable params: 902107 (3.44 MB)
Non-trainable params: 0 (0.00 Byte)
_______________________

In [21]:
def preproc_int(x,y):
    return tv_int(x),y

In [22]:
# model.fit(ds_train.map(preproc_int),  validation_data=ds_valid.map(preproc_int), epochs=10)  

# visualisation

création de fichiers tsv prêts à être chargés sur https://projector.tensorflow.org/ 
(cf le code dans utils_TP5.py pour l'extraction des poids qui correspondent aux vecteurs)

In [23]:
from utils_TP5 import write_vectors_proj_format

In [24]:
write_vectors_proj_format(model, tv_int)

# Debut TP 7 

In [1]:
### TP 7 
# Corpus avec étiquette morphosyntaxique

import keras
import tensorflow as tf
from tensorflow.data import TextLineDataset

ds=TextLineDataset("aij-wikiner-fr-wp2")

ds  # entire dataset object  with token|POS tag | BIO tag

2024-03-25 18:57:44.018636: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 18:57:44.338644: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-25 18:57:44.338715: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-25 18:57:44.396377: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-25 18:57:44.513214: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 18:57:44.515044: I tensorflow/core/platform/cpu_feature_guard.cc:1

<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [2]:
tst_ds = ds.skip(1).take(1).get_single_element()
tst_ds

<tf.Tensor: shape=(), dtype=string, numpy=b"Il|PRO:PER|O assure|VER:pres|O \xc3\xa0|VER:pper|O la|DET:ART|O suite|NOM|O de|PRP|I-PER Saussure|NAM|I-PER le|DET:ART|O cours|NOM|O de|PRP|O grammaire|NOM|O compar\xc3\xa9e|ADJ|O ,|PUN|O qu'|PRO:REL|O il|PRO:PER|O compl\xc3\xa8te|VER:subp|O \xc3\xa0|VER:pper|O partir|VER:infi|O de|PRP|O 1894|NUM|O par|PRP|O une|DET:ART|O conf\xc3\xa9rence|NOM|O sur|PRP|O l'|DET:ART|O iranien|ADJ|O .|SENT|O">

In [7]:
def tensor_split(tensor):
    # return tf.strings.split(tensor, sep="|", maxsplit=-1, name=None)
    t = tf.strings.split(tensor)
    # return t
    X,y = tf.strings.split(t, sep="|", maxsplit=-1, name=None)[:, :1], tf.strings.split(t, sep="|", maxsplit=-1, name=None)[:, 1:2]
    
    return X, y
      

In [8]:
X_y = ds.map(tensor_split)


In [12]:
# extraction du texte
X_data = X_y.map(lambda x, y: x) # data 
y_labels = X_y.map(lambda x, y: y) # lables


In [13]:
for X in X_data.take(5):  # first 5 text elements
    print("X (text):", X)

X (text): <tf.RaggedTensor []>
X (text): <tf.RaggedTensor [[b'Il'],
 [b'assure'],
 [b'\xc3\xa0'],
 [b'la'],
 [b'suite'],
 [b'de'],
 [b'Saussure'],
 [b'le'],
 [b'cours'],
 [b'de'],
 [b'grammaire'],
 [b'compar\xc3\xa9e'],
 [b','],
 [b"qu'"],
 [b'il'],
 [b'compl\xc3\xa8te'],
 [b'\xc3\xa0'],
 [b'partir'],
 [b'de'],
 [b'1894'],
 [b'par'],
 [b'une'],
 [b'conf\xc3\xa9rence'],
 [b'sur'],
 [b"l'"],
 [b'iranien'],
 [b'.']]>
X (text): <tf.RaggedTensor [[b'En'],
 [b'1905'],
 [b','],
 [b'il'],
 [b'occupe'],
 [b'la'],
 [b'chaire'],
 [b'de'],
 [b'grammaire'],
 [b'compar\xc3\xa9e'],
 [b'au'],
 [b'Coll\xc3\xa8ge'],
 [b'de'],
 [b'France'],
 [b','],
 [b'o\xc3\xb9'],
 [b'il'],
 [b'consacre'],
 [b'ses'],
 [b'cours'],
 [b'\xc3\xa0'],
 [b"l'"],
 [b'histoire'],
 [b'et'],
 [b'\xc3\xa0'],
 [b'la'],
 [b'structure'],
 [b'des'],
 [b'langues'],
 [b'indo-europ\xc3\xa9ennes'],
 [b'.']]>
X (text): <tf.RaggedTensor [[b'Il'],
 [b'a'],
 [b'form\xc3\xa9'],
 [b'toute'],
 [b'une'],
 [b'g\xc3\xa9n\xc3\xa9ration'],
 [b'de'],


In [14]:
for y in y_labels.take(5):  #  first 5 label elements
    print("Labels (text):", y)

Labels (text): <tf.RaggedTensor []>
Labels (text): <tf.RaggedTensor [[b'PRO:PER'],
 [b'VER:pres'],
 [b'VER:pper'],
 [b'DET:ART'],
 [b'NOM'],
 [b'PRP'],
 [b'NAM'],
 [b'DET:ART'],
 [b'NOM'],
 [b'PRP'],
 [b'NOM'],
 [b'ADJ'],
 [b'PUN'],
 [b'PRO:REL'],
 [b'PRO:PER'],
 [b'VER:subp'],
 [b'VER:pper'],
 [b'VER:infi'],
 [b'PRP'],
 [b'NUM'],
 [b'PRP'],
 [b'DET:ART'],
 [b'NOM'],
 [b'PRP'],
 [b'DET:ART'],
 [b'ADJ'],
 [b'SENT']]>
Labels (text): <tf.RaggedTensor [[b'PRP'],
 [b'NUM'],
 [b'PUN'],
 [b'PRO:PER'],
 [b'VER:pres'],
 [b'DET:ART'],
 [b'NOM'],
 [b'PRP'],
 [b'NOM'],
 [b'ADJ'],
 [b'PRP:det'],
 [b'NAM'],
 [b'PRP'],
 [b'NAM'],
 [b'PUN'],
 [b'VER:pper'],
 [b'PRO:PER'],
 [b'VER:pres'],
 [b'DET:POS'],
 [b'NOM'],
 [b'VER:pper'],
 [b'DET:ART'],
 [b'NOM'],
 [b'KON'],
 [b'NOM'],
 [b'DET:ART'],
 [b'NOM'],
 [b'PRP:det'],
 [b'NOM'],
 [b'ADJ'],
 [b'SENT']]>
Labels (text): <tf.RaggedTensor [[b'PRO:PER'],
 [b'VER:pres'],
 [b'VER:pper'],
 [b'PRO:IND'],
 [b'DET:ART'],
 [b'NOM'],
 [b'PRP'],
 [b'NOM'],
 [b'ADJ'],


### A continuer...

In [17]:


# tv = solution.get_text_vectorizer_from_config(solution.ExpeConfig("whitespace",None,1000))

In [25]:
# def preproc(x,y):
#     return tv(x),y


In [26]:
# model = utils.PerceptronModelSparseCategorical(tv, list(range(7)))

ValueError: in user code:

    File "/tmp/ipykernel_6005/711123526.py", line 2, in preproc  *
        return tv(x),y
    File "/home/shamglam/miniconda3/envs/ml/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/home/shamglam/miniconda3/envs/ml/lib/python3.11/site-packages/keras/src/layers/preprocessing/text_vectorization.py", line 588, in _preprocess
        raise ValueError(

    ValueError: Exception encountered when calling layer 'text_vectorization' (type TextVectorization).
    
    When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, None) with rank=2
    
    Call arguments received by layer 'text_vectorization' (type TextVectorization):
      • inputs=tf.RaggedTensor(values=Tensor("RaggedFromVariant/RaggedTensorFromVariant:1", shape=(None,), dtype=string), row_splits=Tensor("RaggedFromVariant/RaggedTensorFromVariant:0", shape=(None,), dtype=int64))


In [20]:
tv.adapt(X_data)

ValueError: in user code:

    File "/home/shamglam/miniconda3/envs/ml/lib/python3.11/site-packages/keras/src/engine/base_preprocessing_layer.py", line 123, in adapt_step  *
        self.update_state(data)
    File "/home/shamglam/miniconda3/envs/ml/lib/python3.11/site-packages/keras/src/layers/preprocessing/text_vectorization.py", line 476, in update_state  **
        self._lookup_layer.update_state(self._preprocess(data))
    File "/home/shamglam/miniconda3/envs/ml/lib/python3.11/site-packages/keras/src/layers/preprocessing/text_vectorization.py", line 588, in _preprocess
        raise ValueError(

    ValueError: When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, None) with rank=2


In [18]:
#copy code


# tv = solution.get_text_vectorizer_from_config(solution.ExpeConfig("whitespace",None,1000))



# tv.adapt(X_text)
# ds_train.map(lambda x,y: (tv(x),y)).take(1).get_single_element()

# X_y.map(lambda x,y: (tv(x),y)).take(1).get_single_element()

# model = utils.PerceptronModelSparseCategorical(tv, list(range(7)))

# def preproc(x,y):
#     return tv(x),y(


# model.fit(ds_train.map(preproc), validation_data=ds_valid.map(preproc), epochs=10)


ValueError: in user code:

    File "/tmp/ipykernel_6005/3658695583.py", line 11, in None  *
        lambda x,y: (tv(x),y)
    File "/home/shamglam/miniconda3/envs/ml/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/home/shamglam/miniconda3/envs/ml/lib/python3.11/site-packages/keras/src/layers/preprocessing/text_vectorization.py", line 588, in _preprocess
        raise ValueError(

    ValueError: Exception encountered when calling layer 'text_vectorization' (type TextVectorization).
    
    When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, None) with rank=2
    
    Call arguments received by layer 'text_vectorization' (type TextVectorization):
      • inputs=tf.RaggedTensor(values=Tensor("RaggedFromVariant/RaggedTensorFromVariant:1", shape=(None,), dtype=string), row_splits=Tensor("RaggedFromVariant/RaggedTensorFromVariant:0", shape=(None,), dtype=int64))


In [28]:
# tv.vocabulary_size

<bound method TextVectorization.vocabulary_size of <keras.src.layers.preprocessing.text_vectorization.TextVectorization object at 0x7275df0bd950>>