# Projet pour le cours Sémantique computationnelle
### Fait par Mariya Borovikova

In [None]:
%%capture
!pip install sklearn-crfsuite
!pip3 install fasttext

In [None]:
!git clone https://github.com/project178/coref

In [None]:
import copy
import fasttext
import joblib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn_crfsuite
import torch
import torch.nn as nn
from torch import optim
from tqdm.auto import tqdm
from statistics import harmonic_mean

## Première partie : CRF (Champs aléatoires conditionnels)


Préparation des données

In [None]:
def _nearest_antecedent(item, position):
  """Return the "index" of the closest earlier word sharing a mention id, or "-1".

  The ``group*`` keys of ``item[position]`` are tried in insertion order; for
  each of them the earlier words are scanned from the nearest to the farthest.
  """
  for key, mention_id in item[position].items():
    if not key.startswith("group"):
      continue
    for candidate in reversed(item[:position]):
      shares_id = any(
          other_key.startswith("group") and other_id == mention_id
          for other_key, other_id in candidate.items())
      if shares_id:
        return candidate["index"]
  return "-1"


def get_data_for_crf(corpus="data/democrat.conll"):
  """Build CRF training sequences from a CoNLL-style coreference corpus.

  Each token line is split on whitespace; field 3 (0-based) is used as the
  word form, field 4 as the POS tag and the last field as the coreference
  annotation. Every run of 5 consecutive sentences (sliding by one sentence,
  restarted at each ``#begin`` line) that contains between 1 and 149 words
  becomes one sequence.

  Args:
    corpus: path of the corpus file (read as UTF-8).

  Returns:
    ``(x, y)`` where ``x[k]`` is a list of feature dicts with the keys
    ``prevword``, ``word``, ``pos``, ``nextword`` and ``index`` (position in
    the window, as a string), and ``y[k]`` is the parallel list of labels:
    the ``index`` of the nearest earlier word carrying the same mention id,
    or ``"-1"`` when there is none. The first word is always ``"-1"``.
  """
  x, y = [], []
  # Initialised up front so a file without a "#begin" line does not raise NameError.
  data, sentence = [], []
  with open(corpus, encoding="utf-8") as democrat:
    for line in democrat:
      if line.startswith("#begin"):
        data, sentence = [], []
      elif line.startswith("#end"):
        continue
      elif not line.strip():
        # Sentence boundary (whitespace-only lines count as blank).
        data.append(sentence)
        sentence = []
        if len(data) == 5:
          words = [w for s in data for w in s]
          if 0 < len(words) < 150:
            # Fresh dicts: the words in `data` are reused by later windows
            # and still receive "nextword" updates.
            item = [dict(w, index=str(i)) for i, w in enumerate(words)]
            y.append(["-1"] + [_nearest_antecedent(item, p) for p in range(1, len(item))])
            x.append([{k: v for k, v in w.items() if not k.startswith("group")} for w in item])
          del data[0]
      else:
        l = line.split()
        form = l[3]
        if sentence:
          previous = sentence[-1]
        elif data and data[-1]:
          previous = data[-1][-1]
        else:
          previous = None
        if previous is not None:
          previous["nextword"] = form
        word = {"prevword": previous["word"] if previous is not None else "",
                "word": form, "pos": l[4], "nextword": ""}
        # Only annotation items ending in ")" are kept, with the ")" removed.
        # NOTE(review): "(3)" therefore yields "(3" while "3)" yields "3", so a
        # one-token mention never matches a multi-token one — confirm intended.
        closing = [g[:-1] for g in l[-1].split("|") if g.endswith(")")]
        for i, group in enumerate(closing):
          word["group" + str(i)] = group
        sentence.append(word)
  return x, y

Construction du modèle

In [None]:
# CRF tagger used as the first coreference model: each word is labelled with
# the window index of its antecedent (or "-1").
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularisation coefficient (per the sklearn-crfsuite docs)
    c2=0.1,  # L2 regularisation coefficient (per the sklearn-crfsuite docs)
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True
)

In [None]:
# Build the 5-sentence windows and fit on all but the last 100 of them;
# those last 100 are the ones scored in the evaluation cell.
x, y = get_data_for_crf()
crf.fit(x[:-100], y[:-100])

Résultats

In [None]:
def normalize(y):
  """Convert antecedent labels into coreference clusters.

  Args:
    y: iterable of label sequences. Each label is "-1" (no antecedent) or the
      decimal string of the 0-based position of an antecedent in the same
      sequence (the "index" values produced by get_data_for_crf).

  Returns:
    One list per sequence, holding disjoint sets of 0-based word positions.
    Words that take part in no link are not listed.
  """
  new_y = []
  for labels in y:
    clusters = []
    # Positions are counted from 0, like the labels, so that a word and the
    # index that refers to it denote the same position.
    for position, label in enumerate(labels):
      if label == "-1":
        continue
      linked = {int(label), position}
      untouched = []
      for cluster in clusters:
        if cluster & linked:
          # The new link touches this cluster: absorb it (this also merges
          # two clusters that the link connects).
          linked |= cluster
        else:
          untouched.append(cluster)
      clusters = untouched + [linked]
    new_y.append(clusters)
  return new_y

In [None]:
def b_cubed(golds, preds):
  """Compute B-cubed recall, precision and their harmonic mean.

  Args:
    golds: reference clusters (sets of mention positions).
    preds: predicted clusters (sets of mention positions).

  Returns:
    ``(recall, precision, f)``. A side with no mention at all scores 0.0.
  """
  gold_mentions = sum(map(len, golds))
  pred_mentions = sum(map(len, preds))

  if gold_mentions:
    recall = sum(
        len(pred & gold) ** 2 / len(gold) for gold in golds for pred in preds
    ) / gold_mentions
  else:
    recall = 0.0

  if pred_mentions:
    precision = sum(
        len(pred & gold) ** 2 / len(pred) for gold in golds for pred in preds
    ) / pred_mentions
  else:
    precision = 0.0

  return recall, precision, harmonic_mean((recall, precision))

In [None]:
def b_cubed_for_batch(golds, preds):
  """Average the B-cubed scores over a batch of sequences.

  Args:
    golds: one list of reference clusters per sequence.
    preds: one list of predicted clusters per sequence, aligned with golds.

  Returns:
    ``(recall, precision, f)``, each the arithmetic mean of the per-sequence
    values returned by b_cubed. An empty batch yields ``(0.0, 0.0, 0.0)``.
  """
  scores = [b_cubed(gold, pred) for gold, pred in zip(golds, preds)]
  if not scores:
    # Nothing to average: avoid dividing by zero.
    return 0.0, 0.0, 0.0
  count = len(scores)
  recall, precision, f_score = (sum(column) / count for column in zip(*scores))
  return recall, precision, f_score

In [None]:
def normalize(y):
  """Convert antecedent labels into coreference clusters.

  Args:
    y: iterable of label sequences. Each label is "-1" (no antecedent) or the
      decimal string of the 0-based position of an antecedent in the same
      sequence (the "index" values produced by get_data_for_crf).

  Returns:
    One list per sequence, holding disjoint sets of 0-based word positions.
    Words that take part in no link are not listed.
  """
  new_y = []
  for labels in y:
    clusters = []
    # Positions are counted from 0, like the labels, so that a word and the
    # index that refers to it denote the same position.
    for position, label in enumerate(labels):
      if label == "-1":
        continue
      linked = {int(label), position}
      untouched = []
      for cluster in clusters:
        if cluster & linked:
          # The new link touches this cluster: absorb it (this also merges
          # two clusters that the link connects).
          linked |= cluster
        else:
          untouched.append(cluster)
      clusters = untouched + [linked]
    new_y.append(clusters)
  return new_y

In [None]:
# Score the CRF on the 100 held-out windows.
# CRF.predict takes a list of sequences and returns one label list per
# sequence, so the whole held-out slice is passed in a single call (calling it
# once per window would treat every word dict as a sequence of its own).
r, p, f = b_cubed_for_batch(normalize(y[-100:]), normalize(crf.predict(x[-100:])))
print(f"Rappel : {r}")
print(f"Précision : {p}")
print(f"F-micro : {f}")

Rappel : 0.5635911434020358
Précision : 0.6727939468088564
F-micro : 0.5977205882392462


#Conclusion

Ce modèle n'est pas de haute qualité. Il vaudrait mieux essayer des algorithmes plus efficaces.

## Deuxième partie : Réseau génératif

J'ai décidé de réaliser des expériences avec des réseaux de neurones et de comparer leurs résultats avec ceux du modèle précédent.

Préparation des données

In [None]:
def get_base(corpus="data/democrat.conll"):
  """Read the corpus into per-document sentence lists and cache them on disk.

  Documents are delimited by ``#begin`` / ``#end`` lines and sentences by
  blank lines. For each token line (split on whitespace) fields 2, 3, 4 and 6
  (0-based) are kept as features and the last field as the coreference
  annotation.

  Args:
    corpus: path of the corpus file (read as UTF-8).

  Returns:
    ``(x, y, pos)``: ``x[d][s][w]`` is the 4-item feature list of a word,
    ``y[d][s][w]`` its raw coreference annotation, and ``pos[d]`` the
    document's POS tags (field 4) as one string, each tag followed by a space.
    The three objects are also dumped with joblib to ``data/x_words``,
    ``data/y_words`` and ``data/pos``.
  """
  x, y, pos = [], [], []
  x_doc, y_doc, x_words, y_words, pos_tags = [], [], [], [], []
  with open(corpus, encoding="utf-8") as democrat:
    for line in democrat:
      if line.startswith("#begin"):
        x_doc, y_doc, x_words, y_words, pos_tags = [], [], [], [], []
        continue
      if line.startswith("#end"):
        if x_words:
          # Keep a last sentence that was not followed by a blank line.
          x_doc.append(x_words)
          y_doc.append(y_words)
          x_words, y_words = [], []
        x.append(x_doc)
        y.append(y_doc)
        pos.append("".join(tag + " " for tag in pos_tags))
        continue
      l = line.split()
      if not l:
        if x_words:
          x_doc.append(x_words)
          y_doc.append(y_words)
          x_words, y_words = [], []
      else:
        pos_tags.append(l[4])
        x_words.append(l[2:5] + [l[6]])
        y_words.append(l[-1])
  joblib.dump(x, "data/x_words")
  joblib.dump(y, "data/y_words")
  joblib.dump(pos, "data/pos")
  return x, y, pos

In [None]:
def _object_array(items):
  """Pack items into a 1-D object array, whatever their individual shapes."""
  packed = np.empty(len(items), dtype=object)
  for position, element in enumerate(items):
    packed[position] = element
  return packed


def get_vectors(x = "data/x_words", y="data/y_words", pos="data/pos", model='data/cc.fr.300.bin', path=True):
  """Vectorise the words and decode the coreference annotations.

  Args:
    x, y, pos: the outputs of get_base, or the joblib paths they were dumped
      to when ``path`` is true.
    model: path of the fastText binary model.
    path: whether ``x``, ``y`` and ``pos`` are file paths to load.

  Returns:
    ``(new_x, new_y)`` as nested 1-D object arrays (document > sentence >
    word). Each word of ``new_x`` is a pair: the fastText vector of
    ``word[1]`` and the TF-IDF vector of ``word[2]`` (vectoriser fitted on
    ``pos``). Each word of ``new_y`` is an integer array: ``[-1]`` when the
    annotation is "-" and no mention is open, otherwise the ids of the
    mentions closed on this word followed by the ids still open.
    Both results are dumped to ``data/x_vecs`` and ``data/y_vecs``.
  """
  if path:
    x = joblib.load(x)
    y = joblib.load(y)
    # `pos` is only a path when `path` is true; with path=False it already
    # holds the POS strings and must not be passed to joblib.load.
    pos = joblib.load(pos)
  tfidf = TfidfVectorizer().fit(pos)
  ft = fasttext.load_model(model)

  # The two vectors of a word have different lengths and sentences/documents
  # have different sizes, so every level is stored as an explicit object
  # array (current NumPy refuses to build ragged arrays implicitly).
  docs_x = []
  for doc in x:
    sentences_x = []
    for sentence in doc:
      words_x = [
          _object_array([ft.get_word_vector(word[1]), tfidf.transform([word[2]]).toarray()[0]])
          for word in sentence
      ]
      sentences_x.append(_object_array(words_x))
    docs_x.append(_object_array(sentences_x))
  new_x = _object_array(docs_x)

  new_y = []
  # Ids of the mentions currently open.
  # NOTE(review): this list is not reset between documents — confirm intended.
  indexes = []
  for doc in y:
    new_doc = []
    for sentence in doc:
      new_sentence = []
      for word in sentence:
        new_word = []
        if word == "-" and not indexes:
          new_word.append(-1)
        else:
          for index in word.split("|"):
            if index.startswith("(") and index.endswith(")"):
              # One-token mention "(N)": opened and closed on the same word.
              new_word.append(int(index[1:-1]))
            elif index.startswith("("):
              indexes.append(int(index[1:]))
            elif index.endswith(")"):
              indexes.remove(int(index[:-1]))
              new_word.append(int(index[:-1]))
          new_word += indexes
        new_sentence.append(np.array(new_word))
      new_doc.append(_object_array(new_sentence))
    new_y.append(_object_array(new_doc))
  new_y = _object_array(new_y)

  joblib.dump(new_x, "data/x_vecs")
  joblib.dump(new_y, "data/y_vecs")
  return new_x, new_y

In [None]:
def transform_data_to_torch(x="data/x_vecs", y="data/y_vecs", path=True):
  """Cut documents into 5-sentence windows and encode their chains as strings.

  Args:
    x, y: per-document sentence containers (or joblib paths to them when
      ``path`` is true); ``y`` holds, for every word, an array of chain ids
      or ``[-1]``.
    path: whether ``x`` and ``y`` are file paths to load with joblib.

  Returns:
    ``(x_trues, y_trues)``: ``x_trues[k]`` is a slice of 5 consecutive
    sentences and ``y_trues[k]`` a list with one string per retained chain,
    made of ``"<sentence>.<word>;"`` items (both numbers 1-based inside the
    window). Both lists are also dumped to ``data/x_exp`` and ``data/y_exp``.
  """
  if path:
    x = joblib.load(x)
    y = joblib.load(y)
  x_trues, y_trues = [], []
  for doc, y_doc in zip(x, y):
      # One window per start position; documents shorter than 5 sentences give none.
      for i in range(len(doc)-4):
          x_trues.append(doc[i:i+5])
          # (sentence number, word number, chain id) for every annotated word.
          answers = []
          for s, sentence in enumerate(y_doc[i:i+5], 1):
              for w, word in enumerate(sentence, 1):
                  if not(np.array_equal(word, [-1])):
                      for r in range(len(word)):
                          answers.append((s, w, word[r]))
          new_answers = []
          chains = [z[2] for z in answers]
          passed = []
          for a, answer in enumerate(answers):
              k=0
              # Each chain id is examined once, at its first occurrence.
              if answer[2] not in passed:
                  # Skip the uninterrupted run of this id starting at position a.
                  while answer[2] == chains[a+k]:
                      k+=1
                      if a+k>=len(chains): break
                  # Keep the chain only if its id shows up again after that run.
                  if answer[2] in chains[a+k:]: new_answers+=[i for i in answers if i[2] == answer[2]]
                  passed.append(answer[2])
          y_true = []
          # One "s.w;s.w;..." string per retained chain id.
          for ind in {z[2] for z in new_answers}:
              y_str = ""
              for new_answer in new_answers:
                  if new_answer[2] == ind: y_str+=str(new_answer[0])+"."+str(new_answer[1])+";"
              y_true.append(y_str) 
          y_trues.append(y_true)
  joblib.dump(x_trues, "data/x_exp")
  joblib.dump(y_trues, "data/y_exp")

  return x_trues, y_trues

Génération de batches de taille différente (selon la taille de la phrase)

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Serve 5-sentence windows with one (start, target) pair per chain.

  Args:
    texts: windows; each is a sequence of sentences whose words are
      sequences of arrays that are concatenated into one feature vector.
    labels: for each window, chain strings made of ``"s.w;"`` items.
    maxlen: padding length of the target sequences.
    out2ind: symbol -> index mapping; must contain 'PAD', 'EOS' and every
      character used in the labels.
    device: torch device the tensors are created on.
  """

  # Length the "start" sequence is padded to.
  start_len = 6

  def __init__(self, texts, labels, maxlen, out2ind, device):
    self.texts = texts
    self.labels = labels
    self.device = device
    self.maxlen = maxlen
    self.out2ind = out2ind

  def __getitem__(self, item):
    """Return ``(full, gold)`` for window ``item``.

    ``full`` holds one ``(1, n_words, n_features)`` tensor per chain and
    ``gold`` one ``(start, target)`` pair of index tensors per chain:
    ``start`` encodes the last ``s.w`` item of the chain padded to
    ``start_len``, ``target`` encodes the preceding items followed by 'EOS'
    and padded with 'PAD' (``maxlen + 1`` symbols in total).
    """
    # Read from the instance, not from module-level variables.
    texts, labels = self.texts[item], self.labels[item]
    if not labels:
      return [], []

    # The window features do not depend on the chain: build the tensor once
    # and reuse it for every chain.
    features = [np.concatenate(word) for sentence in texts for word in sentence]
    window = torch.tensor(np.array(features)).unsqueeze(0).to(self.device)
    full = [window for _ in labels]

    pad, eos = self.out2ind['PAD'], self.out2ind['EOS']
    gold = []
    for label in labels:
      # Position of the ";" that precedes the last "s.w" item.
      cut = label[:-1].rindex(";")
      start, target = label[cut:][1:-1], label[:cut]
      start_ids = [self.out2ind[c] for c in start] + [pad] * (self.start_len - len(start))
      target_ids = [self.out2ind[c] for c in target] + [eos] + [pad] * (self.maxlen - len(target))
      gold.append((torch.tensor(start_ids, dtype=torch.long, device=self.device),
                   torch.tensor(target_ids, dtype=torch.long, device=self.device)))
    return full, gold

  def __len__(self):
    return len(self.texts)

Construction de la partie Encoder

In [None]:
class EncoderRNN(nn.Module):
    """Single GRU layer whose input and hidden sizes are both ``hidden_size``."""

    def __init__(self, device, hidden_size=300):
        super().__init__()
        recurrent = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.gru = recurrent.to(device)

    def forward(self, input, hidden):
        """Run the GRU and return its ``(output, hidden)`` pair unchanged."""
        return self.gru(input, hidden)

Construction de la partie Decoder

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder fed with the previous symbol and an embedded start sequence.

    The start sequence is embedded with 20 dimensions per symbol and reshaped
    to 120 values, i.e. it must contain 6 symbols.
    """

    def __init__(self, output_size, hidden_size, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size).to(device)
        self.start_embedding = nn.Embedding(output_size, 20).to(device)
        self.gru = nn.GRU(hidden_size + 120, hidden_size).to(device)
        self.out = nn.Linear(hidden_size, output_size).to(device)

    def forward(self, prevsym, starting, hidden, *params):
        """Decode one step; extra positional arguments are accepted and ignored.

        Returns the scores over the output symbols for the last time step and
        the updated hidden state.
        """
        start_features = self.start_embedding(starting).flatten().view(1, 1, 120)
        symbol_features = self.embedding(prevsym)
        step_input = torch.cat((symbol_features, start_features), dim=2)
        step_output, hidden = self.gru(step_input, hidden)
        return self.out(step_output[:, -1]), hidden

Entrainement

In [None]:
def train(data, encoder, decoder, hidden_size, maxlen, iters=1, learning_rate=0.01,
          start_idx=7000, out2ind=None):
    """Train the encoder/decoder pair with SGD, one window per update.

    Decoding is greedy: the symbol fed at each step is the decoder's own
    previous prediction, and decoding of a chain stops once 'EOS' is predicted.

    Args:
        data: indexable dataset; ``data[idx]`` returns ``(inputs, golds)``
            with one input tensor and one ``(start, target)`` pair per chain.
        encoder: encoder module.
        decoder: decoder module exposing ``hidden_size``.
        hidden_size: unused; the state size is read from ``decoder.hidden_size``.
        maxlen: maximum number of decoding steps per chain.
        iters: number of passes over ``data``.
        learning_rate: SGD learning rate for both modules.
        start_idx: items with a smaller index are skipped in every pass
            (the default keeps the previously hard-coded 7000).
        out2ind: symbol -> index mapping containing 'SOS', 'EOS' and 'PAD';
            defaults to ``data.out2ind``.

    Raises:
        ValueError: if no symbol mapping is given and ``data`` has none.

    Side effects:
        Every 1000th item the whole encoder and decoder are saved to
        ``models/new_encoder.pt`` and ``models/new_decoder.pt``.
    """
    import os

    if out2ind is None:
        out2ind = getattr(data, "out2ind", None)
        if out2ind is None:
            raise ValueError("out2ind must be passed when data has no out2ind attribute")
    ind2out = {index: symbol for symbol, index in out2ind.items()}
    # Work on the device the decoder lives on instead of a module-level global.
    device = next(decoder.parameters()).device

    print("Training...")
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=out2ind['PAD'])
    os.makedirs("models", exist_ok=True)

    for _ in range(iters):
        t = tqdm(range(len(data)))
        for idx in t:
            if idx < start_idx:
                continue
            loss = 0
            word, gold = [], None
            b_sentence, b_gold = data[idx]
            for sentence, gold in zip(b_sentence, b_gold):
                enc_hidden = torch.zeros(1, 1, decoder.hidden_size, dtype=torch.float32, device=device)
                sentence = sentence.type(torch.float32)
                enc_outputs, enc_hidden = encoder(sentence, enc_hidden)
                dec_input = torch.tensor([[out2ind['SOS']]], dtype=torch.long, device=device)
                dec_hidden = enc_hidden
                word = []
                # Never step past the end of the target sequence.
                for symbol in range(min(maxlen, len(gold[1]))):
                    dec_output, dec_hidden = decoder(dec_input, gold[0], dec_hidden, enc_outputs)
                    values, ids = torch.max(dec_output, 1)
                    dec_input = ids.view(-1, 1)
                    loss += criterion(dec_output, gold[1][symbol].view(-1))
                    word.append(ind2out[dec_input.item()])
                    if dec_input.item() == out2ind['EOS']:
                        break
            if idx % 1000 == 0:
                if gold is not None:
                    # Show the last chain of this window: expected vs generated.
                    print(f'\n{idx}, new', "".join([ind2out[x.item()] for x in gold[1]]).replace('PAD', ''))
                    print("".join(word))
                # Save the modules that were passed in, not module-level globals.
                torch.save(encoder, 'models/new_encoder.pt')
                torch.save(decoder, 'models/new_decoder.pt')
            if not b_sentence or not torch.is_tensor(loss):
                # No chain in this window: nothing to back-propagate.
                continue
            loss = loss / len(b_sentence)
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            t.set_description(f"loss: {round(float(loss), 3)}")
            t.refresh()


In [None]:
# Run the whole preparation pipeline in memory and wrap the result in a Dataset.
x_base, y_base, pos = get_base()
new_x, new_y = get_vectors(x_base, y_base, pos, path=False)
# Bound to the names that the Dataset construction below actually uses.
x_trues, y_trues = transform_data_to_torch(new_x, new_y, path=False)
del x_base, new_x, y_base, new_y, pos
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
maxlen = 100  # same maximum target length as the one passed to train()
out2ind = {str(key):key for key in range(10)}
out2ind.update({'.':10, ',':11, ';':12, 'SOS': 13, 'EOS':14, 'PAD':15, '_':16})
ind2out = {key:value for value, key in out2ind.items()}
ds = Dataset(x_trues, y_trues, maxlen, out2ind, device)

Initialisation du modèle (voir l'architecture ci-dessous)

In [None]:
# 383 is both the per-word input size and the hidden size of the two GRUs
# (presumably 300 fastText dimensions + 83 TF-IDF dimensions — TODO confirm).
enc = EncoderRNN(device, 383)
dec = DecoderRNN(len(out2ind), 383, device)

In [None]:
# Put the encoder in training mode; the call returns the module, hence the repr shown below.
enc.train()

EncoderRNN(
  (gru): GRU(383, 383, batch_first=True)
)

In [None]:
# Put the decoder in training mode; the call returns the module, hence the repr shown below.
dec.train()

DecoderRNN(
  (embedding): Embedding(17, 383)
  (start_embedding): Embedding(17, 20)
  (gru): GRU(503, 383)
  (out): Linear(in_features=383, out_features=17, bias=True)
)

In [None]:
# One pass over the dataset (iters defaults to 1) with hidden_size=383 and maxlen=100.
train(ds, enc, dec, 383, 100)

Training...


HBox(children=(FloatProgress(value=0.0, max=25188.0), HTML(value='')))


0, new 5.24;5.25EOS
5.12;.;EOS

1000, new 1.15;4.1;4.9EOS
5.25;1.2;1.;4.11;4;..4;1;1.2;1.141;;1;;.522;.1;4;1;;.;4;11;;1;;1;;1;;1;;1;;1;;1;;1;;1;;1;;1;;1;;1;;1

2000, new 2.8;2.9;2.10;2.11;2.12;2.13EOS
2.12;.11.11;2..;;.12..;.12..12..12.11.11.11..11..11..11..1;..1;..1;..1..11..1..1..1..1..1..1..1..1..

3000, new 2.1;2.2;3.5;3.6;5.52;5.53;5.60EOS
5.11;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.;1.

4000, new 2.2;2.3;2.4;2.11;2.15;4.2;4.3;4.4;4.13;4.19;4.30;5.1;5.22;5.27;5.45EOS
5.2EOS

5000, new 4.21;4.25;4.35;4.38;4.41;4.46;5.4;5.5;5.18EOS
5.3;515;5..;;51152..;.1511;..............................................3.;.155145..3;...1;........

6000, new 3.32;3.33EOS
3.33;33.1333333..33;3333333333333333333333333;33333333333333333333333333333333333333333333.33;333333

7000, new 5.56EOS
5.45;55.55.5;55.5;55.113;;..555.555.55455.;555.555.555;5.555.55;55.555.55;55.555.55;55.55;55.55;55.5

8000, new 1.130;1.131;1.132;1.133;1.134EOS
1.1101.11.11111

#Conclusion
Au fil des itérations, le réseau de neurones semble générer des réponses plus proches de celles attendues. Pourtant, le résultat est encore très loin d'être acceptable.

Visiblement, des algorithmes de classification sont plus appropriés pour cette tâche. Cependant, il n'est pas possible de comparer directement ce modèle avec le CRF, car ses sorties ne sont pas toujours facilement interprétables.

Il faudrait sans doute poursuivre les expériences, car les réseaux seq2seq apprennent lentement. Faute de ressources et de temps, le modèle n'a même pas parcouru une seule fois tout le jeu de données. Il est donc difficile de tirer une conclusion définitive, mais, d'après les chiffres obtenus, le CRF marche mieux.