# Traitement Automatique du Langage Naturel 

Notebook lié à l'article 

In [3]:
import spacy
from spacy import displacy

nlp = spacy.load("fr_core_news_sm")

In [4]:
test = "Bouygues a eu une coupure de réseau à Marseille."

## 1. Tokenisation

In [5]:
def return_token(sentence):
    doc = nlp(sentence)
    return [X.text for X in doc]

In [6]:
return_token(test)

['Bouygues',
 'a',
 'eu',
 'une',
 'coupure',
 'de',
 'réseau',
 'à',
 'Marseille',
 '.']

In [8]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('french'))

clean_words = []
for token in return_token(test):
    if token not in stopWords:
        clean_words.append(token)

In [9]:
print(stopWords)

{'j', 'fûtes', 'c', 'étée', 'ta', 'eus', 'eût', 'avons', 'aurions', 'étante', 'par', 'ayants', 'ait', 'auraient', 'aie', 'auront', 'il', 'ai', 'nous', 'et', 'lui', 'eussiez', 'fus', 'mon', 'ses', 'étiez', 'qu', 'aurai', 'serais', 'fussent', 'suis', 'étés', 'seriez', 'es', 'avions', 'fussiez', 'eurent', 'nos', 'étant', 'eux', 'étantes', 's', 'son', 'une', 't', 'étais', 'en', 'aient', 'as', 'ayante', 'sont', 'eut', 'aurez', 'aurais', 'du', 'était', 'me', 'te', 'seras', 'fussions', 'tu', 'fûmes', 'eûmes', 'eûtes', 'fusse', 'aies', 'l', 'aurons', 'mes', 'même', 'ils', 'au', 'sommes', 'êtes', 'serions', 'votre', 'on', 'aurait', 'furent', 'vos', 'ayantes', 'soit', 'se', 'fût', 'qui', 'eusses', 'le', 'eussent', 'eu', 'ayant', 'la', 'ma', 'les', 'y', 'eue', 'sa', 'eusse', 'étaient', 'auras', 'étées', 'd', 'ne', 'ont', 'fut', 'des', 'seront', 'n', 'aux', 'soyons', 'été', 'dans', 'sois', 'serons', 'ce', 'ou', 'un', 'seraient', 'auriez', 'aviez', 'fusses', 'serait', 'soyez', 'aura', 'tes', 'de', 

In [10]:
print(clean_words)

['Bouygues', 'a', 'coupure', 'réseau', 'Marseille', '.']


## 2. Tokenisation de phrases

In [11]:
def return_token_sent(sentence):
    doc = nlp(sentence)
    return [X.text for X in doc.sents]

In [12]:
return_token_sent("Bouygues a eu une coupure de réseau à Marseille. La panne a affecté 300.000 utilisateurs.")

['Bouygues a eu une coupure de réseau à Marseille.',
 'La panne a affecté 300.000 utilisateurs.']

## 3. Lemmatisation

In [17]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='french')

def return_stem(sentence):
    doc = nlp(sentence)
    return [stemmer.stem(X.text) for X in doc]

return_stem(test)

['bouygu', 'a', 'eu', 'une', 'coupur', 'de', 'réseau', 'à', 'marseil', '.']

## 4. Reconnaissance d'entités nommées (NER)

In [18]:
def return_NER(sentence):
    doc = nlp(sentence)
    return [(X.text, X.label_) for X in doc.ents]

In [19]:
return_NER(test)

[('Bouygues', 'ORG'), ('Marseille', 'LOC')]

In [20]:
doc = nlp(test)
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["ORG"], "colors": colors}

displacy.serve(doc, style="ent", options=options)


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## 5. Part-of-Speech Tagging (POS)

In [21]:
def return_POS(sentence):
    doc = nlp(sentence)
    return [(X, X.pos_) for X in doc]

In [22]:
return_POS(test)

[(Bouygues, 'PROPN'),
 (a, 'AUX'),
 (eu, 'VERB'),
 (une, 'DET'),
 (coupure, 'NOUN'),
 (de, 'ADP'),
 (réseau, 'NOUN'),
 (à, 'ADP'),
 (Marseille, 'PROPN'),
 (., 'PUNCT')]

In [23]:
from spacy import displacy

doc = nlp(test)
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## 6. Embedding par mot

In [24]:
import numpy as np

def return_word_embedding(sentence):
    doc = nlp(sentence)
    return [(X.tensor) for X in doc]

In [25]:
return_word_embedding(test)

[array([-1.6346099 , -0.5649567 , -1.9408679 , -2.813146  , -0.10111822,
        -3.530174  ,  2.1561453 , -4.919579  ,  0.32698557,  1.0308661 ,
         2.2153058 ,  0.7172387 , -8.715832  , -1.0822319 , -0.29062498,
        -1.5783455 , -2.3651779 , -2.728358  , -0.86396784,  0.92594165,
        -2.9402206 ,  9.626809  ,  3.6982553 ,  2.3619318 , -3.4923077 ,
        -3.007491  ,  0.6268473 ,  0.74501777,  0.10165483, -2.073378  ,
        -6.022312  , -3.268891  ,  1.3297336 , -0.02227807,  6.210668  ,
         5.510391  , -3.3060744 ,  6.059071  ,  8.359106  ,  1.2666026 ,
        -2.4710221 ,  1.2002287 , -2.1460958 ,  1.1080242 , -0.47597623,
        -3.48144   , -0.3286002 , -2.6209486 ,  2.7624912 , -3.1821744 ,
         0.70762277, -5.4075885 , -0.43672955, -1.1972885 ,  1.9135561 ,
        -4.791294  ,  1.8234208 ,  2.8430328 ,  2.995939  , -3.0965562 ,
        -0.8349128 ,  0.7072923 , -3.1008275 ,  2.5546198 ,  5.942214  ,
        -1.4530498 ,  2.4120169 , -2.190457  , -1.3

## 7. Similarités entre phrases

In [26]:
def return_mean_embedding(sentence):
    doc = nlp(sentence)
    return np.mean([(X.vector) for X in doc], axis=0)

In [27]:
test_2 = "Le réseau sera bientot rétabli à Marseille"
test_3 = "La panne réseau affecte plusieurs utilisateurs de l'opérateur"
test_4 = "Il fait 18 degrés ici"

In [28]:
np.linalg.norm(return_mean_embedding(test)-return_mean_embedding(test_2))

11.860769

In [29]:
np.linalg.norm(return_mean_embedding(test)-return_mean_embedding(test_3))

12.083751

In [30]:
np.linalg.norm(return_mean_embedding(test)-return_mean_embedding(test_4))

19.62302

## 8. Transformers pour prédire la prochaine phrase

In [31]:
import torch

In [32]:
from transformers import BertTokenizer,BertForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-multilingual-cased')
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [33]:
text = "Comment ça va ? Bien merci, un peu stressé avant l'examen"

tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [34]:
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [35]:
predictions = model(tokens_tensor, segments_tensors)

In [36]:
import numpy as np
if np.argmax(predictions) == 0:
    print("Suite")
else:
    print("Pas la suite")

Suite
