# NLP

Natural Language Processing (NLP) - TAL (Traitement Automatique des Langues).

- package spaCy
- package nltk

In [1]:
import spacy

## Création de la classe

In [53]:
# Ici on crée notre classe principale

class NLPMachine:
  """Small text-analysis helper built on a spaCy pipeline.

  Call ``read_text`` first: it loads a text file into ``self.text`` and
  parses it into ``self.doc``. The other methods iterate over ``self.doc``,
  store their result on the instance and also return it.

  Attributes:
    model_name: Name of the spaCy pipeline passed to ``spacy.load``.
    text: Raw content of the last file read, or None.
    doc: Result of running the pipeline on ``text``, or None.
    words, sentences: Filled by ``tokenize``.
    lemmas: Filled by ``get_lemmas``.
    pos: Filled by ``get_pos``.
    nes: Filled by ``get_ne``.
  """

  # Pipelines already loaded, keyed by model name and shared by every
  # instance, so reading several files does not reload the model each time.
  _models = {}

  def __init__(self, model_name="en_core_web_sm"):
    """Create an empty machine; nothing is read or loaded yet.

    Args:
      model_name: Name of the spaCy pipeline to use when parsing text.
    """
    self.model_name = model_name
    self.text = None
    self.doc = None
    self.words = []
    self.sentences = []
    self.lemmas = []
    self.pos = []
    self.nes = []

  def _get_model(self):
    """Return the pipeline for ``self.model_name``, loading it on first use."""
    model = NLPMachine._models.get(self.model_name)
    if model is None:
      model = spacy.load(self.model_name)
      NLPMachine._models[self.model_name] = model
    return model

  def _content_tokens(self):
    """Yield the tokens of ``self.doc`` that are alphabetic and not stop words."""
    for token in self.doc:
      if not token.is_stop and token.is_alpha:
        yield token

  def read_text(self, path, encoding="utf-8"):
    """Read a text file and parse it.

    The file content is stored in ``self.text`` and the parsed document in
    ``self.doc``.

    Args:
      path: Path of the text file to read.
      encoding: Encoding used to decode the file. An explicit default avoids
        depending on the platform's locale encoding.
    """
    with open(path, encoding=encoding) as f:
      self.text = f.read()
    # Parse after the file is closed; the model comes from the shared cache.
    self.doc = self._get_model()(self.text)

  def tokenize(self):
    """Split the parsed document into words and sentences.

    Returns:
      A ``(words, sentences)`` tuple of lists of strings, also stored in
      ``self.words`` and ``self.sentences``.
    """
    self.words = [token.text for token in self.doc]
    self.sentences = [sentence.text for sentence in self.doc.sents]
    return self.words, self.sentences

  def get_lemmas(self):
    """Collect the lemma of every alphabetic, non-stop-word token.

    Returns:
      The list of lemmas, also stored in ``self.lemmas``.
    """
    self.lemmas = [token.lemma_ for token in self._content_tokens()]
    return self.lemmas

  def get_pos(self):
    """Collect part-of-speech information for alphabetic, non-stop-word tokens.

    Returns:
      A list of dicts with the keys ``"mot"`` (token text), ``"pos"``
      (``token.pos_``) and ``"tag"`` (``token.tag_``), also stored in
      ``self.pos``.
    """
    self.pos = [
        {
            "mot" : token.text,
            "pos" : token.pos_,
            "tag" : token.tag_
        }
        for token in self._content_tokens()
    ]
    return self.pos

  def get_ne(self):
    """Collect the named entities found in the parsed document.

    Returns:
      A list of dicts with the keys ``"mot"`` (entity text) and ``"label"``
      (entity label), also stored in ``self.nes``.
    """
    self.nes = [
        {
            "mot" : entity.text,
            "label" : entity.label_
        }
        for entity in self.doc.ents
    ]
    return self.nes


In [54]:
# Build the path to the text file we want to process.
import os
chemin_fichier_text = os.path.join(os.getcwd(), "sample_data", "README.md")

# Create an instance of the class and read the file.
machine = NLPMachine()
machine.read_text(chemin_fichier_text)

# Uncomment to print the raw text content.
# print(machine.text)

# Tokenisation

In [10]:
# Naive baseline: split the raw text on single space characters only.
# Punctuation and newlines stay glued to the neighbouring words.
split = machine.text.split(" ")
print(split)

['This', 'directory', 'includes', 'a', 'few', 'sample', 'datasets', 'to', 'get', 'you', 'started.\n\n*', '', '', '`california_housing_data*.csv`', 'is', 'California', 'housing', 'data', 'from', 'the', '1990', 'US\n', '', '', '', 'Census;', 'more', 'information', 'is', 'available', 'at:\n', '', '', '', 'https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub\n\n*', '', '', '`mnist_*.csv`', 'is', 'a', 'small', 'sample', 'of', 'the\n', '', '', '', '[MNIST', 'database](https://en.wikipedia.org/wiki/MNIST_database),', 'which', 'is\n', '', '', '', 'described', 'at:', 'http://yann.lecun.com/exdb/mnist/\n\n*', '', '', '`anscombe.json`', 'contains', 'a', 'copy', 'of\n', '', '', '', "[Anscombe's", 'quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet);', 'it\n', '', '', '', 'was', 'originally', 'described', 'in\n\n', '', '', '', 'Anscombe,', 'F.', 'J.', '(1973).', "'Graphs", 'in', 'Statistical', "Analysis'.", 'America

In [25]:
# Manual version of the tokenisation step: run the spaCy pipeline on the raw
# text, then collect the token texts and the sentence texts.
model = spacy.load("en_core_web_sm")
doc = model(machine.text)

print(type(doc))

words = [token.text for token in doc]
sentences = [sentence.text for sentence in doc.sents]

print(words)
print(sentences)

<class 'spacy.tokens.doc.Doc'>
['This', 'directory', 'includes', 'a', 'few', 'sample', 'datasets', 'to', 'get', 'you', 'started', '.', '\n\n', '*', '  ', '`', 'california_housing_data*.csv', '`', 'is', 'California', 'housing', 'data', 'from', 'the', '1990', 'US', '\n    ', 'Census', ';', 'more', 'information', 'is', 'available', 'at', ':', '\n    ', 'https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub', '\n\n', '*', '  ', '`', 'mnist_*.csv', '`', 'is', 'a', 'small', 'sample', 'of', 'the', '\n    ', '[', 'MNIST', 'database](https://en.wikipedia.org', '/', 'wiki', '/', 'MNIST_database', ')', ',', 'which', 'is', '\n    ', 'described', 'at', ':', 'http://yann.lecun.com/exdb/mnist/', '\n\n', '*', '  ', '`', 'anscombe.json', '`', 'contains', 'a', 'copy', 'of', '\n    ', '[', 'Anscombe', "'s", 'quartet](https://en.wikipedia.org', '/', 'wiki', '/', 'Anscombe%27s_quartet', ')', ';', 'it', '\n    ', 'was', 'originally', 

In [33]:
# Same tokenisation through the class: tokenize() fills machine.words and
# machine.sentences (its return value is not used here).
machine.tokenize()

print(machine.words)
print(machine.sentences)

['This', 'directory', 'includes', 'a', 'few', 'sample', 'datasets', 'to', 'get', 'you', 'started', '.', '\n\n', '*', '  ', '`', 'california_housing_data*.csv', '`', 'is', 'California', 'housing', 'data', 'from', 'the', '1990', 'US', '\n    ', 'Census', ';', 'more', 'information', 'is', 'available', 'at', ':', '\n    ', 'https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub', '\n\n', '*', '  ', '`', 'mnist_*.csv', '`', 'is', 'a', 'small', 'sample', 'of', 'the', '\n    ', '[', 'MNIST', 'database](https://en.wikipedia.org', '/', 'wiki', '/', 'MNIST_database', ')', ',', 'which', 'is', '\n    ', 'described', 'at', ':', 'http://yann.lecun.com/exdb/mnist/', '\n\n', '*', '  ', '`', 'anscombe.json', '`', 'contains', 'a', 'copy', 'of', '\n    ', '[', 'Anscombe', "'s", 'quartet](https://en.wikipedia.org', '/', 'wiki', '/', 'Anscombe%27s_quartet', ')', ';', 'it', '\n    ', 'was', 'originally', 'described', 'in', '\n\n    ', 

## Lemmatization et stop words

In [37]:
# Keep the lemma of every alphabetic token that is not a stop word.
lemmas = [
  token.lemma_
  for token in machine.doc
  if not token.is_stop and token.is_alpha
]

print(lemmas)

['directory', 'include', 'sample', 'dataset', 'start', 'California', 'housing', 'datum', 'Census', 'information', 'available', 'small', 'sample', 'MNIST', 'wiki', 'describe', 'contain', 'copy', 'Anscombe', 'wiki', 'originally', 'describe', 'Anscombe', 'graph', 'Statistical', 'Analysis', 'american', 'Statistician', 'JSTOR', 'copy', 'prepare', 'altair', 'viz']


In [42]:
# Same lemmatisation through the class method.
# NOTE(review): the saved output of this cell is None, while get_lemmas() as
# defined in this notebook returns the list — presumably the cell was last run
# against an earlier version of the class; re-run to refresh the output.
lemmas = machine.get_lemmas()

print(lemmas)

None


## POS tagging
POS = Parts of speech

In [47]:
# For every alphabetic token that is not a stop word, record its text, its
# coarse part of speech (pos_) and its fine-grained tag (tag_).
pos = [
  {
      "mot" : token.text,
      "pos" : token.pos_,
      "tag" : token.tag_
  }
  for token in machine.doc
  if not token.is_stop and token.is_alpha
]

print(pos)

[{'mot': 'directory', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'includes', 'pos': 'VERB', 'tag': 'VBZ'}, {'mot': 'sample', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'datasets', 'pos': 'NOUN', 'tag': 'NNS'}, {'mot': 'started', 'pos': 'VERB', 'tag': 'VBN'}, {'mot': 'California', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'housing', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'data', 'pos': 'NOUN', 'tag': 'NNS'}, {'mot': 'Census', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'information', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'available', 'pos': 'ADJ', 'tag': 'JJ'}, {'mot': 'small', 'pos': 'ADJ', 'tag': 'JJ'}, {'mot': 'sample', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'MNIST', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'wiki', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'described', 'pos': 'VERB', 'tag': 'VBN'}, {'mot': 'contains', 'pos': 'VERB', 'tag': 'VBZ'}, {'mot': 'copy', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'Anscombe', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'wiki', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'originally', 'po

In [50]:
# Same POS extraction through the class method.
pos = machine.get_pos()

print(pos)

[{'mot': 'directory', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'includes', 'pos': 'VERB', 'tag': 'VBZ'}, {'mot': 'sample', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'datasets', 'pos': 'NOUN', 'tag': 'NNS'}, {'mot': 'started', 'pos': 'VERB', 'tag': 'VBN'}, {'mot': 'California', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'housing', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'data', 'pos': 'NOUN', 'tag': 'NNS'}, {'mot': 'Census', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'information', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'available', 'pos': 'ADJ', 'tag': 'JJ'}, {'mot': 'small', 'pos': 'ADJ', 'tag': 'JJ'}, {'mot': 'sample', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'MNIST', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'wiki', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'described', 'pos': 'VERB', 'tag': 'VBN'}, {'mot': 'contains', 'pos': 'VERB', 'tag': 'VBZ'}, {'mot': 'copy', 'pos': 'NOUN', 'tag': 'NN'}, {'mot': 'Anscombe', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'wiki', 'pos': 'PROPN', 'tag': 'NNP'}, {'mot': 'originally', 'po

## NER
Named Entity Recognition

In [52]:
# Record the text and the label of every named entity spaCy detected.
nes = [
  {"mot" : entity.text, "label" : entity.label_}
  for entity in machine.doc.ents
]

print(nes)

[{'mot': 'California', 'label': 'GPE'}, {'mot': '1990', 'label': 'DATE'}, {'mot': 'US\n    Census', 'label': 'EVENT'}, {'mot': 'mnist_*.csv', 'label': 'PRODUCT'}, {'mot': "Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet", 'label': 'WORK_OF_ART'}, {'mot': 'Anscombe', 'label': 'PERSON'}, {'mot': 'F. J. (', 'label': 'PERSON'}, {'mot': '1973', 'label': 'DATE'}, {'mot': 'Graphs', 'label': 'PERSON'}, {'mot': 'American', 'label': 'NORP'}, {'mot': '27', 'label': 'CARDINAL'}, {'mot': '1', 'label': 'CARDINAL'}, {'mot': '17-21', 'label': 'DATE'}, {'mot': 'library](https://github.com', 'label': 'ORG'}]


In [55]:
# Same named-entity extraction through the class method.
nes = machine.get_ne()
print(nes)

[{'mot': 'California', 'label': 'GPE'}, {'mot': '1990', 'label': 'DATE'}, {'mot': 'US\n    Census', 'label': 'EVENT'}, {'mot': 'mnist_*.csv', 'label': 'PRODUCT'}, {'mot': "Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet", 'label': 'WORK_OF_ART'}, {'mot': 'Anscombe', 'label': 'PERSON'}, {'mot': 'F. J. (', 'label': 'PERSON'}, {'mot': '1973', 'label': 'DATE'}, {'mot': 'Graphs', 'label': 'PERSON'}, {'mot': 'American', 'label': 'NORP'}, {'mot': '27', 'label': 'CARDINAL'}, {'mot': '1', 'label': 'CARDINAL'}, {'mot': '17-21', 'label': 'DATE'}, {'mot': 'library](https://github.com', 'label': 'ORG'}]


## Sentiment analysis

In [67]:
# Load a second text (article.txt in the current working directory) into its
# own NLPMachine instance.
path = os.path.join(os.getcwd(), "article.txt")

machine2 = NLPMachine()
machine2.read_text(path)

print(machine2.text)

Greek fire was an incendiary weapon system used by the Eastern Roman Empire from the seventh to the fourteenth centuries. The recipe for Greek fire was a closely-guarded state secret; historians have variously speculated that it was based on saltpeter, sulfur, or quicklime, but most modern scholars agree that it was based on petroleum mixed with resins, comparable in composition to modern napalm. Byzantine sailors would toss grenades loaded with Greek fire onto enemy ships or spray it from tubes. Its ability to burn on water made it an effective and destructive naval incendiary weapon, and rival powers tried unsuccessfully to copy the material.


In [57]:
from textblob import TextBlob

In [68]:
# Sentiment of the whole article as computed by TextBlob.
blob = TextBlob(machine2.text)

print(blob.sentiment.polarity)  # polarity score, between -1 and 1
print(blob.sentiment.subjectivity) # subjectivity score, between 0 and 1

0.04545454545454546
0.35909090909090907


## Topic Modelling

In [63]:
from gensim import corpora
from gensim.models import LdaModel

In [69]:
# Keep the surface form of every alphabetic token that is not a stop word;
# these filtered tokens are the input of the topic model.
tokens_filtered = [
  token.text
  for token in machine2.doc
  if not token.is_stop and token.is_alpha
]

print(tokens_filtered)

['Greek', 'fire', 'incendiary', 'weapon', 'system', 'Eastern', 'Roman', 'Empire', 'seventh', 'fourteenth', 'centuries', 'recipe', 'Greek', 'fire', 'closely', 'guarded', 'state', 'secret', 'historians', 'variously', 'speculated', 'based', 'saltpeter', 'sulfur', 'quicklime', 'modern', 'scholars', 'agree', 'based', 'petroleum', 'mixed', 'resins', 'comparable', 'composition', 'modern', 'napalm', 'Byzantine', 'sailors', 'toss', 'grenades', 'loaded', 'Greek', 'fire', 'enemy', 'ships', 'spray', 'tubes', 'ability', 'burn', 'water', 'effective', 'destructive', 'naval', 'incendiary', 'weapon', 'rival', 'powers', 'tried', 'unsuccessfully', 'copy', 'material']


In [70]:
# Build the gensim dictionary from the single filtered document, then turn
# that document into its bag-of-words representation.
dictionary = corpora.Dictionary([tokens_filtered])

corpus = [dictionary.doc2bow(tokens) for tokens in [tokens_filtered]]

In [71]:
# LDA model. Training is stochastic, so a fixed random_state is passed to get
# the same topics on every run of the notebook.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Display the topics as (topic id, weighted word list) pairs.
topics = lda_model.print_topics()
print("Topics:")
for topic in topics:
    print(topic)

Topics:
(0, '0.019*"Greek" + 0.019*"fire" + 0.019*"based" + 0.019*"incendiary" + 0.019*"modern" + 0.019*"weapon" + 0.019*"saltpeter" + 0.019*"ships" + 0.019*"tried" + 0.019*"Eastern"')
(1, '0.040*"fire" + 0.040*"Greek" + 0.029*"weapon" + 0.029*"modern" + 0.029*"incendiary" + 0.029*"based" + 0.017*"unsuccessfully" + 0.017*"loaded" + 0.017*"Byzantine" + 0.017*"burn"')
