In [None]:
import keras
import tensorflow as tf
import utils_TP4 as utils
import solution_TP4 as solution

# Utilisation de Dataset (en gardant les anciens modèles et la couche TextVectorization)

### Récupération et affichage d'une instance dans le corpus pour vérification

In [None]:
# Load the text corpus stored under "Corpus" and split it 70/30 into a
# training and a validation dataset; the fixed seed keeps the split reproducible.
ds_train, ds_valid = keras.utils.text_dataset_from_directory(
    directory="Corpus",
    validation_split=0.3,
    subset='both',
    seed=42,
)

In [None]:
# Unbatch the training set and extract a single (text, label) pair for inspection.
un_elem = ds_train.unbatch().take(1).get_single_element()
un_elem   # equivalent of tst_ds


### Vectorisation du corpus

On `adapt()` le text_vectorizer en laissant de côté les `y` avec la fonction lambda

In [None]:
# Build the text vectorizer from an experiment configuration.
# NOTE(review): the ExpeConfig arguments presumably are (split mode, n-grams,
# vocabulary size) — confirm against solution_TP4.
tv = solution.get_text_vectorizer_from_config(solution.ExpeConfig("whitespace",None,1000))

In [None]:
# Fit the vectorizer on the training texts only; the lambda drops the labels `y`.
tv.adapt(ds_train.map(lambda x,y: x))

#### Vérification des structures de données obtenues (on vérifie les types et les shapes)

In [None]:
# Vectorize the texts and display one element of the mapped dataset to check types and shapes.
ds_train.map(lambda x,y: (tv(x),y)).take(1).get_single_element()

### Création et entraînement du modèle
(et fonction de preprocessing qui peut remplacer la lambda)

In [None]:
# Instantiate the perceptron classifier from the vectorizer and the integers 0..6.
# NOTE(review): list(range(7)) is presumably the list of class labels — confirm in utils_TP4.
model = utils.PerceptronModelSparseCategorical(tv, list(range(7)))

In [None]:
def preproc(x,y):
    """Vectorize the text input and pass the label through untouched.

    Named replacement for the inline lambda used when mapping over a dataset.

    Args:
        x: Raw text (or batch of texts) handed to the vectorizer `tv`.
        y: Label(s), returned unchanged.

    Returns:
        The pair ``(tv(x), y)``.
    """
    vectorized = tv(x)
    return vectorized, y

In [None]:
# Train for 10 epochs; training and validation data go through the same preprocessing.
model.fit(ds_train.map(preproc), validation_data=ds_valid.map(preproc), epochs=10)

In [None]:
# Display the size of the fitted vocabulary. `vocabulary_size` is a method and
# must be called, otherwise the cell only shows the bound method object.
tv.vocabulary_size()

In [None]:
# Print the summary of the trained perceptron model.
model.summary()

# Utilisation de plongements (Embeddings)

In [None]:
# Vectorizer that outputs fixed-length sequences of integer token ids
# (one id per token) instead of per-document counts.
tv_int = text_vectorizer = keras.layers.TextVectorization(
    max_tokens=3000, # size of the vocabulary that is kept
    output_sequence_length=100, # sequence length (longer texts are truncated, shorter ones padded)
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int") # change: "int" instead of "count" so that each token is encoded as one integer

In [None]:
# Fit the integer vectorizer on the training texts only (labels dropped).
tv_int.adapt(ds_train.map(lambda x,y:x))

In [None]:
# Take one unbatched training text and encode it with tv_int for inspection.
one_x = ds_train.unbatch().map(lambda x,y:x).map(tv_int).take(1).get_single_element()
one_x

### On peut vérifier qu'on est capable de décoder un document encodé pour voir si tout se passe comme prévu

In [None]:
# Vocabulary of the fitted vectorizer, indexed below by token id.
vocab = tv_int.get_vocabulary()

In [None]:
# Map every id of the encoded sample back to its vocabulary entry.
[vocab[i] for i in one_x]

# Une couche d'embeddings

In [None]:
# Stand-alone embedding layer, applied to the encoded sample for illustration.
embeddings = keras.layers.Embedding(
    tv_int.vocabulary_size(),
    3, # length of the embedding vectors
    mask_zero=True # important when sequences are padded
)
embeddings(one_x)

# Un modèle avec une couche d'embeddings

In [None]:
def build_model(tv, emb_dim, nb_classes, seq_len=100):
    """Build and compile an embedding-based text classifier.

    Architecture: token ids -> Embedding -> Dropout -> global max pooling
    over the sequence axis -> softmax Dense layer.

    Args:
        tv: Fitted text vectorizer exposing ``vocabulary_size()``; that size
            is used as the number of rows of the embedding table.
        emb_dim: Dimension of each embedding vector.
        nb_classes: Number of output classes (units of the softmax layer).
        seq_len: Length of the input id sequences. It has to match the
            vectorizer's ``output_sequence_length``; the default of 100 is
            the value that used to be hard-coded here.

    Returns:
        A compiled ``keras.Model`` (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    inputs = keras.layers.Input(shape=(seq_len,))
    embedded = keras.layers.Embedding(
        tv.vocabulary_size(),
        emb_dim,
        mask_zero=True,  # treat id 0 as padding
        name="emb"
    )(inputs)
    embedded = keras.layers.Dropout(rate=0.2)(embedded)
    # NOTE(review): GlobalMaxPooling1D presumably ignores the mask created by
    # mask_zero, so padded positions may take part in the max — confirm.
    pooled = keras.layers.GlobalMaxPooling1D()(embedded)
    classif = keras.layers.Dense(nb_classes, activation="softmax", use_bias=True)(pooled)
    model = keras.Model(inputs=inputs, outputs=classif)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=["accuracy"])
    return model
    

In [None]:
# Embedding model: vocabulary taken from tv_int, 300-dimensional embeddings, 7 output classes.
model = build_model(tv_int, 300, 7)

In [None]:
# Print the summary of the embedding model.
model.summary()

In [None]:
def preproc_int(x,y):
    """Encode the text with the integer vectorizer and keep the label as is.

    Args:
        x: Raw text (or batch of texts) handed to `tv_int`.
        y: Label(s), returned unchanged.

    Returns:
        The pair ``(tv_int(x), y)``.
    """
    token_ids = tv_int(x)
    return token_ids, y

In [None]:
# Train the embedding model for 40 epochs on integer-encoded texts.
model.fit(ds_train.map(preproc_int),  validation_data=ds_valid.map(preproc_int), epochs=40)

# visualisation

création de fichiers tsv prêts à être chargés sur https://projector.tensorflow.org/ 
(cf le code dans utils_TP5.py pour l'extraction des poids qui correspondent aux vecteurs)

In [None]:
from utils_TP5 import write_vectors_proj_format

In [None]:
# Export the model's vectors as TSV files for the TensorFlow Embedding Projector
# (implementation in utils_TP5).
write_vectors_proj_format(model, tv_int)

# Début du TP 7

In [None]:
### TP 7 
# Corpus avec étiquette morphosyntaxique

import keras
import tensorflow as tf
from tensorflow.data import TextLineDataset

# Open the corpus file as a dataset with one element per text line.
ds=TextLineDataset("aij-wikiner-fr-wp2")

ds  # the dataset object itself; lines hold token|POS tag|BIO tag items

In [None]:
# Skip the first line and take the second one as a sample to experiment with.
tst_ds = ds.skip(1).take(1).get_single_element()
tst_ds

In [None]:
def tensor_split(tensor):
    """Split one corpus line into parallel token and tag columns.

    The line is split on whitespace into items, and every item is then split
    on "|" into its fields.

    Args:
        tensor: String tensor holding one line of the corpus.

    Returns:
        A pair ``(X, y)`` with one row per item of the line: ``X`` keeps the
        first "|"-separated field of each item and ``y`` the second one.
        Rows of ``y`` are empty for items that have no second field.
    """
    items = tf.strings.split(tensor)
    # Split the items once and reuse the result for both columns instead of
    # running the same split twice.
    fields = tf.strings.split(items, sep="|")
    X = fields[:, :1]
    y = fields[:, 1:2]
    return X, y


    
    

In [None]:
# X, y  # ok

In [None]:
# Materialise the whole corpus in memory as a list with one string tensor per line.
l1 = []
for item in ds:
    l1.append(item)

# The message previously opened a parenthesis that was never closed.
print(f'Taille du corpus : {len(l1)}')


In [None]:
# Peek at the first five lines of the corpus.
l1[:5]

In [None]:
# Only the first 1000 lines are used, otherwise processing is extremely slow.
# Placeholders for the train/validation features and labels; they are
# reassigned once create_tensors has been called.
X_train=[]
y_train=[]
X_valid=[]
y_valid=[]

def create_tensors(datalist):
    """Turn a list of corpus lines into feature and label string tensors.

    Every line is split with ``tensor_split``; the resulting rows of all lines
    are concatenated in order.

    Args:
        datalist: Iterable of string tensors, one per corpus line.

    Returns:
        A pair ``(X_tensor, y_tensor)`` of string tensors built from the
        accumulated feature rows and label rows. Both are empty when
        ``datalist`` is empty.
    """
    # Accumulators are local so that successive calls (train, validation)
    # do not leak rows into each other.
    X = []
    y = []
    for instance in datalist:
        # Split the current line (not a variable leaked from another cell).
        X_ds, y_ds = tensor_split(instance)
        X.extend(X_ds)
        y.extend(y_ds)

    # Convert once after the loop; the explicit dtype keeps an empty input
    # from producing a float tensor.
    X_tensor = tf.convert_to_tensor(X, dtype=tf.string)
    y_tensor = tf.convert_to_tensor(y, dtype=tf.string)
    return X_tensor, y_tensor
    


    

In [None]:
# First 1000 corpus lines for training, the following 200 for validation.
X_train, y_train = create_tensors(l1[:1000])
X_valid, y_valid = create_tensors(l1[1000:1200])

In [None]:
# Sanity check: print each training feature row next to its label row.
for x_sample, y_sample in zip(X_train, y_train):
    print(x_sample, y_sample)


In [None]:
# Show the types of the training tensors. `X` and `y` are not defined at
# notebook scope, and the former `print(...), print(...)` tuple displayed
# `(None, None)` as the cell output.
type(X_train), type(y_train)

In [None]:
#copy code

# ds_train, ds_valid = keras.utils.text_dataset_from_directory(
#     "Corpus",
#     seed=42,
#     validation_split=0.3,
#     subset='both')

tv = solution.get_text_vectorizer_from_config(solution.ExpeConfig("whitespace",None,1000))
# tv.adapt(ds_train.map(lambda x,y: x))
# Fit the vocabulary on the training tensor returned by create_tensors;
# `X_tensor` only exists as a local variable inside that function.
tv.adapt(X_train)
# ds_train.map(lambda x,y: (tv(x),y)).take(1).get_single_element()

# model = utils.PerceptronModelSparseCategorical(tv, list(range(7)))

# def preproc(x,y):
#     return tv(x),y


# model.fit(ds_train.map(preproc), validation_data=ds_valid.map(preproc), epochs=10)


In [None]:
# Display the size of the fitted vocabulary. `vocabulary_size` is a method and
# must be called, otherwise the cell only shows the bound method object.
tv.vocabulary_size()

In [None]:

# NOTE(review): this cell re-creates `model` and re-defines `preproc`, both of
# which already exist earlier in the notebook.
model = utils.PerceptronModelSparseCategorical(tv, list(range(7)))

def preproc(x,y):
    # Vectorize the text with `tv` and pass the label through unchanged.
    return tv(x),y


# NOTE(review): `X_tensor` is only a local variable of create_tensors, so it is
# not defined at notebook scope, and a tensor presumably has no `.map` method.
# `ds_valid` is the validation split of the "Corpus" dataset loaded at the top
# of the notebook, not of the wikiner data — confirm which data is intended.
model.fit(X_tensor.map(preproc), validation_data=ds_valid.map(preproc), epochs=10)
