# IND6212 - Exploration de données industrielles
# NLP

Bruno Agard

Département de Mathématiques et de génie Industriel

École Polytechnique de Montréal

# Premières 45 minutes : préparation des textes

In [1]:
# Uncomment to install NLTK in the kernel's environment (the %pip magic is preferable to a bare pip call).
# NOTE(review): the cells below presumably also need NLTK data packages
# (tokenizer models, stopwords, wordnet, vader_lexicon) fetched once via nltk.download(...) — confirm.
#pip install nltk

In [2]:
# Natural Language Toolkit
import nltk
import pandas as pd
import numpy as np


# Analyse d'une phrase

In [3]:
# Sample sentence used for the single-sentence walkthrough below.
sentence = """At eight o'clock, on Thursday morning, the great Arthur didn't feel VERY good."""

In [4]:
# Length of the sentence in characters (it is still a plain string here).
len(sentence)

78

##   Tokenisation

### Couper les mots

In [5]:
# Split the sentence into word tokens, then show the tokens and how many there are.
words=nltk.word_tokenize(sentence)
print(words)
len(words)

['At', 'eight', "o'clock", ',', 'on', 'Thursday', 'morning', ',', 'the', 'great', 'Arthur', 'did', "n't", 'feel', 'VERY', 'good', '.']


17

### retirer la ponctuation

In [6]:
# Keep only the tokens made up entirely of alphanumeric characters
words = list(filter(str.isalnum, words))
print(words)
len(words)

['At', 'eight', 'on', 'Thursday', 'morning', 'the', 'great', 'Arthur', 'did', 'feel', 'VERY', 'good']


12

In [7]:
# Caveat: the isalnum() filter also dropped "o'clock" and "n't", not only the punctuation marks!

### passer en minuscules

In [8]:
# Case folding: convert every token to lowercase
words = list(map(str.lower, words))
print(words)

['at', 'eight', 'on', 'thursday', 'morning', 'the', 'great', 'arthur', 'did', 'feel', 'very', 'good']


In [9]:
# Caveat: lowercasing also removes the capitalization of proper nouns and acronyms
# (Arthur, UNESCO, ...)

### Stopwords

In [10]:
from nltk.corpus import stopwords
# Display NLTK's English stop-word list.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# Remove stop words.
# The English list is loaded once into a set: the original comprehension called
# stopwords.words("english") again for every token, and a set gives constant-time lookups.
english_stopwords = set(stopwords.words("english"))
words = [w for w in words if w not in english_stopwords]
print(words)
len(words)

['eight', 'thursday', 'morning', 'great', 'arthur', 'feel', 'good']


7

### Ramener les mots à leur racine

In [12]:
from nltk.stem.porter import PorterStemmer
# Reduce words to their stems.
# A single stemmer instance is reused for all words instead of building a new one per word.
porter_stemmer = PorterStemmer()
stemmed = [porter_stemmer.stem(w) for w in words]
print(stemmed)

['eight', 'thursday', 'morn', 'great', 'arthur', 'feel', 'good']


In [13]:
from nltk.stem import LancasterStemmer
# Reduce words to their stems with the Lancaster stemmer.
# A single stemmer instance is reused for all words instead of building a new one per word.
lancaster_stemmer = LancasterStemmer()
stemmed = [lancaster_stemmer.stem(w) for w in words]
print(stemmed)

['eight', 'thursday', 'morn', 'gre', 'arth', 'feel', 'good']


In [14]:
from nltk.stem.wordnet import WordNetLemmatizer
# Reduce words to their root form (lemma).
# A single lemmatizer instance is reused for all words instead of building a new one per word.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)

['eight', 'thursday', 'morning', 'great', 'arthur', 'feel', 'good']


# Analyse d'un texte

In [15]:
# Three-sentence sample text.
# The indentation of the continuation lines is inside the triple-quoted string,
# so those spaces (and the newlines) are counted by len(text).
text = """At eight o'clock, on Thursday morning, the great Arthur didn't feel VERY good.
          The following morning, at nine, Arthur felt better.
          A dog run in the street."""
len(text)

175

In [16]:
# Tokenize the whole text at once; punctuation marks come back as separate tokens.
words_in_text = nltk.word_tokenize(text)
print(words_in_text)
len(words_in_text)

['At', 'eight', "o'clock", ',', 'on', 'Thursday', 'morning', ',', 'the', 'great', 'Arthur', 'did', "n't", 'feel', 'VERY', 'good', '.', 'The', 'following', 'morning', ',', 'at', 'nine', ',', 'Arthur', 'felt', 'better', '.', 'A', 'dog', 'run', 'in', 'the', 'street', '.']


35

In [17]:
# Punctuation: keep only the purely alphanumeric tokens
words_in_text = list(filter(str.isalnum, words_in_text))
print(words_in_text)
len(words_in_text)

['At', 'eight', 'on', 'Thursday', 'morning', 'the', 'great', 'Arthur', 'did', 'feel', 'VERY', 'good', 'The', 'following', 'morning', 'at', 'nine', 'Arthur', 'felt', 'better', 'A', 'dog', 'run', 'in', 'the', 'street']


26

In [18]:
# Count how often each token occurs. The tokens have not been lowercased yet,
# so differently-cased forms are counted as separate keys.
freqs_in_text = nltk.FreqDist(words_in_text)
freqs_in_text

FreqDist({'morning': 2, 'the': 2, 'Arthur': 2, 'At': 1, 'eight': 1, 'on': 1, 'Thursday': 1, 'great': 1, 'did': 1, 'feel': 1, ...})

In [19]:
# Upper/lower case: merge variants such as The/the before counting again
words_in_text = list(map(str.lower, words_in_text))
freqs_in_text = nltk.FreqDist(words_in_text)
freqs_in_text

FreqDist({'the': 3, 'at': 2, 'morning': 2, 'arthur': 2, 'eight': 1, 'on': 1, 'thursday': 1, 'great': 1, 'did': 1, 'feel': 1, ...})

### Couper les Phrases

In [20]:
# Split the text into sentences.
# NOTE: `sentence` held a single string in the earlier cells; from here on it is a list of sentences.
sentence = nltk.sent_tokenize(text)
print(sentence)
len(sentence)

["At eight o'clock, on Thursday morning, the great Arthur didn't feel VERY good.", 'The following morning, at nine, Arthur felt better.', 'A dog run in the street.']


3

In [21]:
# First sentence and its length in characters.
print(sentence[0])
len(sentence[0])

At eight o'clock, on Thursday morning, the great Arthur didn't feel VERY good.


78

In [22]:
# Upper/lower case: lowercase each sentence
sentence = [s.lower() for s in sentence]
# Words per sentence: tokenize every sentence separately
words_in_sentence = list(map(nltk.word_tokenize, sentence))
print(words_in_sentence)

[['at', 'eight', "o'clock", ',', 'on', 'thursday', 'morning', ',', 'the', 'great', 'arthur', 'did', "n't", 'feel', 'very', 'good', '.'], ['the', 'following', 'morning', ',', 'at', 'nine', ',', 'arthur', 'felt', 'better', '.'], ['a', 'dog', 'run', 'in', 'the', 'street', '.']]


In [23]:
# Punctuation: within each sentence, keep only the alphanumeric tokens
words_in_sentence = [list(filter(str.isalnum, sent_tokens)) for sent_tokens in words_in_sentence]
print(words_in_sentence)

[['at', 'eight', 'on', 'thursday', 'morning', 'the', 'great', 'arthur', 'did', 'feel', 'very', 'good'], ['the', 'following', 'morning', 'at', 'nine', 'arthur', 'felt', 'better'], ['a', 'dog', 'run', 'in', 'the', 'street']]


In [24]:
# Stop words: the English list is loaded once into a set instead of calling
# stopwords.words("english") again for every token of every sentence.
english_stopwords = set(stopwords.words("english"))
words_in_sentence = [[w for w in words if w not in english_stopwords] for words in words_in_sentence]
print(words_in_sentence)

[['eight', 'thursday', 'morning', 'great', 'arthur', 'feel', 'good'], ['following', 'morning', 'nine', 'arthur', 'felt', 'better'], ['dog', 'run', 'street']]


On peut alors :
 - analyser la fréquence d'apparition de chaque terme dans chaque phrase,

In [25]:
# Build one frequency distribution per sentence.
freqs_in_sentence = [nltk.FreqDist(s) for s in words_in_sentence]
print(freqs_in_sentence)

[FreqDist({'eight': 1, 'thursday': 1, 'morning': 1, 'great': 1, 'arthur': 1, 'feel': 1, 'good': 1}), FreqDist({'following': 1, 'morning': 1, 'nine': 1, 'arthur': 1, 'felt': 1, 'better': 1}), FreqDist({'dog': 1, 'run': 1, 'street': 1})]


In [None]:
# TODO: word statistics (placeholder cell; the original free-text note "stats sur les mots"
# was not valid Python and made the cell fail when run).

# En français

In [33]:
# Short French sample text.
texte="Bonjour, les gentils étudiants. Comment allez vous ?"

In [34]:
# Sentence splitting, with "french" passed explicitly as the language.
phrases=nltk.sent_tokenize(texte,"french")

In [35]:
# Show the sentences that were detected.
print(phrases)

['Bonjour, les gentils étudiants.', 'Comment allez vous ?']


In [36]:
# Word tokenization of the French text, again passing "french" as the language.
mots=nltk.word_tokenize(texte,"french")
print(mots)

['Bonjour', ',', 'les', 'gentils', 'étudiants', '.', 'Comment', 'allez', 'vous', '?']


In [37]:
# Keep only the tokens made up entirely of alphanumeric characters
mots = list(filter(str.isalnum, mots))
print(mots)

['Bonjour', 'les', 'gentils', 'étudiants', 'Comment', 'allez', 'vous']


In [38]:
# Case folding: convert every French token to lowercase
mots = list(map(str.lower, mots))
print(mots)

['bonjour', 'les', 'gentils', 'étudiants', 'comment', 'allez', 'vous']


In [39]:
from nltk.corpus import stopwords
# Remove French stop words.
# The list is loaded once into a set: the original comprehension called
# stopwords.words("french") again for every token.
french_stopwords = set(stopwords.words("french"))
mots = [w for w in mots if w not in french_stopwords]
print(mots)

['bonjour', 'gentils', 'étudiants', 'comment', 'allez']


In [40]:
# Stemming: reduce each French word to its root
from nltk.stem import SnowballStemmer

# One French stemmer instance, reused for every word below.
stemmer = SnowballStemmer('french')

stemmed = [stemmer.stem(w) for w in mots]
print(stemmed)

['bonjour', 'gentil', 'étudi', 'comment', 'allez']


Actuellement, NLTK ne fournit pas d'étiqueteur morpho-syntaxique (POS tagger) pour le français, mais il en existe un ici :
https://nlp.stanford.edu/software/tagger.shtml

# Deuxièmes 45 minutes : apprentissage non supervisé et supervisé

In [None]:
# TODO: use a real collection of texts (placeholder cell; the original free-text note
# "vrai ensemble de textes" was not valid Python and made the cell fail when run).

# Data mining

In [26]:
#pip install wordcloud

In [27]:
#from wordcloud import WordCloud

In [28]:
# Disabled word-cloud example (kept commented out).
# NOTE(review): before re-enabling, check two things:
#   - `plt` is not imported in the cells shown here (matplotlib.pyplot is presumably needed);
#   - freqs_in_sentence is a list with one FreqDist per sentence, whereas the call below
#     presumably expects a single word -> frequency mapping — confirm against the wordcloud docs.
#from wordcloud import WordCloud

# Create and generate a word cloud image:
#wordcloud = WordCloud().generate_from_frequencies(freqs_in_sentence)
# Display the generated image:
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()

In [29]:
from mlxtend.preprocessing import TransactionEncoder
# Encoder instance; it is fitted on the tokenized sentences in the next cell.
te = TransactionEncoder()

In [30]:
# Encode the sentences as a term-presence matrix: one row per sentence, one column per distinct term.
te_ary = te.fit(words_in_sentence).transform(words_in_sentence)
# NOTE(review): df is not used again in the cells shown here.
df = pd.DataFrame(te_ary, columns=te.columns_)

# Expected shape: (number of sentences, number of distinct terms)
print(te_ary.shape)
# Cast to int so the matrix is displayed as 0/1 values.
te_ary.astype("int")

(3, 14)


array([[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]])

In [31]:
# TODO: TF-IDF weighting (placeholder cell, nothing is computed here yet)

on peut alors :
 
 - évaluer la similitude des phrases,
 - faire des regroupements de textes,
 - faire de la prédiction de contenu, de mots clefs, de thèmes ...

# Non supervisé

## distance entre mots

## distance entre phrases

## classifier des textes

## Règles d'association

# Supervisé

## classification de texte

## Sentiment analysis

In [51]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Build the analyzer once; the following cells reuse `sia`.
sia = SentimentIntensityAnalyzer()
# Baseline sentence; the result is a dict with 'neg', 'neu', 'pos' and 'compound' scores.
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [52]:
# Same sentence with "REALLY" in capitals, to compare against the baseline score.
sia.polarity_scores("Wow, NLTK is REALLY powerful!")

{'neg': 0.0, 'neu': 0.275, 'pos': 0.725, 'compound': 0.8367}

In [43]:
# Negated negative word ("not bad"): check how the negation affects the scores.
sia.polarity_scores("NLTK is not bad!")

{'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'compound': 0.484}

In [44]:
# Plain negative statement, for comparison with the negated version.
sia.polarity_scores("NLTK is bad!")

{'neg': 0.655, 'neu': 0.345, 'pos': 0.0, 'compound': -0.5848}

In [45]:
# Different negative word, written in capitals.
sia.polarity_scores("NLTK is AWFUL!")

{'neg': 0.668, 'neu': 0.332, 'pos': 0.0, 'compound': -0.6155}

### Structure d'une phrase (retirer pour le séminaire ?)

In [47]:
# Rebind `sentence` to the original string: earlier cells turned it into a list of sentences.
sentence = """At eight o'clock, on Thursday morning, the great Arthur didn't feel VERY good."""

from nltk import pos_tag, word_tokenize, RegexpParser

# Find all parts of speech in above sentence
tagged = pos_tag(word_tokenize(sentence))

# Chunk grammar applied to the POS-tagged tokens.
# NOTE(review): the VP rule mentions CLAUSE, but no CLAUSE rule is defined in this grammar.
grammar = r"""
  NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>} # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
"""

# Build the chunk parser from the grammar.
chunker = RegexpParser(grammar)

# Parse the tagged sentence into a chunk tree.
output = chunker.parse(tagged)

# NOTE(review): draw() presumably opens the tree in a separate GUI window rather than inline —
# confirm it works in the seminar environment.
output.draw()