# Atelier Python Session 6 - NLP

Aujourd'hui, nous allons aborder le [NLP](https://en.wikipedia.org/wiki/Natural_language_processing) (Natural Language Processing), ou TAL (traitement automatique des langues) en français. Il s'agit d'un ensemble de techniques qui permettent d'extraire des informations à partir de contenus textuels.

In [64]:
import os

def read_txt(path, encoding="utf-8"):
  """Return the contents of a text file as a single line.

  Args:
    path: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      that non-ASCII characters (accents, en dashes, ...) are decoded the
      same way on every platform instead of depending on the locale's
      default encoding.

  Returns:
    The file contents with every newline replaced by a single space.
  """
  with open(path, encoding=encoding) as f:
    return f.read().replace("\n", " ")

# Read sample_data/test.txt (resolved against the current working directory)
# into one string; read_txt replaces its newlines with spaces.
text = read_txt(
  os.path.join(os.getcwd(), "sample_data", "test.txt")
)

## Tokenization

In [13]:
import spacy

In [42]:
# Load the spaCy pipeline and run it once over the sample text; the
# resulting `doc` is reused by the following cells.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Text of every token, and text of every sentence span, in document order.
words = [token.text for token in doc]
sentences = [sent.text for sent in doc.sents]

print(words)
print(sentences)

['Edith', 'Roosevelt', '(', '1861–1948', ';', 'née', 'Carow', ')', 'was', 'the', 'second', 'wife', 'of', 'President', 'Theodore', 'Roosevelt', 'and', 'the', 'first', 'lady', 'of', 'the', 'United', 'States', 'from', '1901', 'to', '1909', '.', 'She', 'grew', 'up', 'alongside', 'the', 'Roosevelt', 'family', ',', 'and', 'married', 'Theodore', 'Roosevelt', 'in', '1886', ',', 'having', 'five', 'children', '.', 'She', 'became', 'a', 'public', 'figure', 'when', 'her', 'husband', 'became', 'a', 'war', 'hero', 'in', 'the', 'Spanish', '–', 'American', 'War', 'and', 'was', 'elected', 'governor', 'of', 'New', 'York', '.', 'Theodore', 'became', 'vice', 'president', 'in', 'March', '1901', ',', 'and', 'president', 'after', 'the', 'assassination', 'of', 'William', 'McKinley', 'in', 'September', '.', 'Edith', 'controlled', 'when', 'and', 'how', 'the', 'press', 'reported', 'on', 'the', 'Roosevelts', ',', 'and', 'regulated', 'Washington', 'social', 'life', ',', 'organizing', 'weekly', 'meetings', 'of', 't

## Lemmatization

In [43]:
# Pair each token's text with its lemma, keeping only tokens that are
# alphabetic and not stop words.
lemmas = [
  {"original": token.text, "lemma": token.lemma_}
  for token in doc
  if not token.is_stop and token.is_alpha
]

print(lemmas)

[{'original': 'Edith', 'lemma': 'Edith'}, {'original': 'Roosevelt', 'lemma': 'Roosevelt'}, {'original': 'née', 'lemma': 'née'}, {'original': 'Carow', 'lemma': 'Carow'}, {'original': 'second', 'lemma': 'second'}, {'original': 'wife', 'lemma': 'wife'}, {'original': 'President', 'lemma': 'President'}, {'original': 'Theodore', 'lemma': 'Theodore'}, {'original': 'Roosevelt', 'lemma': 'Roosevelt'}, {'original': 'lady', 'lemma': 'lady'}, {'original': 'United', 'lemma': 'United'}, {'original': 'States', 'lemma': 'States'}, {'original': 'grew', 'lemma': 'grow'}, {'original': 'alongside', 'lemma': 'alongside'}, {'original': 'Roosevelt', 'lemma': 'Roosevelt'}, {'original': 'family', 'lemma': 'family'}, {'original': 'married', 'lemma': 'marry'}, {'original': 'Theodore', 'lemma': 'Theodore'}, {'original': 'Roosevelt', 'lemma': 'Roosevelt'}, {'original': 'having', 'lemma': 'have'}, {'original': 'children', 'lemma': 'child'}, {'original': 'public', 'lemma': 'public'}, {'original': 'figure', 'lemma': 

## POS

In [44]:
# For each alphabetic, non-stop-word token, record its text together with
# its `pos_` and `tag_` part-of-speech labels.
pos_tags = [
  {"original": token.text, "pos": token.pos_, "tag": token.tag_}
  for token in doc
  if not token.is_stop and token.is_alpha
]

print(pos_tags)

[{'original': 'Edith', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'Roosevelt', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'née', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'Carow', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'second', 'pos': 'ADJ', 'tag': 'JJ'}, {'original': 'wife', 'pos': 'NOUN', 'tag': 'NN'}, {'original': 'President', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'Theodore', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'Roosevelt', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'lady', 'pos': 'NOUN', 'tag': 'NN'}, {'original': 'United', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'States', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'grew', 'pos': 'VERB', 'tag': 'VBD'}, {'original': 'alongside', 'pos': 'ADP', 'tag': 'IN'}, {'original': 'Roosevelt', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'family', 'pos': 'NOUN', 'tag': 'NN'}, {'original': 'married', 'pos': 'VERB', 'tag': 'VBD'}, {'original': 'Theodore', 'pos': 'PROPN', 'tag': 'NNP'}, {'original': 'Roosevelt', 'pos':

## NER

In [45]:
# One record per named entity found in the document: its text and its label.
ents = [{"word": ent.text, "label": ent.label_} for ent in doc.ents]

print(ents)

[{'word': 'Edith Roosevelt', 'label': 'PERSON'}, {'word': '1861–1948', 'label': 'DATE'}, {'word': 'Carow', 'label': 'PRODUCT'}, {'word': 'second', 'label': 'ORDINAL'}, {'word': 'Theodore Roosevelt', 'label': 'PERSON'}, {'word': 'first', 'label': 'ORDINAL'}, {'word': 'the United States', 'label': 'GPE'}, {'word': '1901 to 1909', 'label': 'DATE'}, {'word': 'Roosevelt', 'label': 'PERSON'}, {'word': 'Theodore Roosevelt', 'label': 'PERSON'}, {'word': '1886', 'label': 'DATE'}, {'word': 'five', 'label': 'CARDINAL'}, {'word': 'Spanish', 'label': 'NORP'}, {'word': 'New York', 'label': 'GPE'}, {'word': 'March 1901', 'label': 'DATE'}, {'word': 'William McKinley', 'label': 'PERSON'}, {'word': 'September', 'label': 'DATE'}, {'word': 'Edith', 'label': 'PERSON'}, {'word': 'Roosevelts', 'label': 'WORK_OF_ART'}, {'word': 'Washington', 'label': 'GPE'}, {'word': 'weekly', 'label': 'DATE'}, {'word': '1902', 'label': 'DATE'}, {'word': 'White House', 'label': 'ORG'}, {'word': 'first', 'label': 'ORDINAL'}, {

## Sentiment analysis

In [61]:
from textblob import TextBlob

['Edith', 'Roosevelt', 'née', 'Carow', 'second', 'wife', 'President', 'Theodore', 'Roosevelt', 'lady', 'United', 'States', 'grew', 'alongside', 'Roosevelt', 'family', 'married', 'Theodore', 'Roosevelt', 'having', 'children', 'public', 'figure', 'husband', 'war', 'hero', 'Spanish', 'American', 'War', 'elected', 'governor', 'New', 'York', 'Theodore', 'vice', 'president', 'March', 'president', 'assassination', 'William', 'McKinley', 'September', 'Edith', 'controlled', 'press', 'reported', 'Roosevelts', 'regulated', 'Washington', 'social', 'life', 'organizing', 'weekly', 'meetings', 'cabinet', 'members', 'wives', 'gatekeeper', 'attend', 'formal', 'events', 'oversight', 'White', 'House', 'renovations', 'hiring', 'social', 'secretary', 'lady', 'Belle', 'Hagner', 'considered', 'enduring', 'legacies', 'remained', 'politically', 'active', 'despite', 'poor', 'health']


In [73]:
# Wrap the raw text in a TextBlob; `blob` is reused by the next cell.
blob = TextBlob(text)
# Print only the polarity component of the sentiment score.
print("Sentiment Polarity:", blob.sentiment.polarity) # Range: [-1, 1]

Sentiment Polarity: 0.08613053613053613


In [78]:
# blob.sentiment exposes two scores:
#   polarity     -- from -1 to 1
#   subjectivity -- from 0 to 1
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity
print(f"Polarity: {polarity}, Subjectivity: {subjectivity}")

Polarity: 0.08613053613053613, Subjectivity: 0.3041958041958043


## Topic modelling

In [46]:
from gensim import corpora
from gensim.models import LdaModel

In [66]:
# Keep the text of alphabetic, non-stop-word tokens only; this list is the
# single document fed to the topic model below.
tokens_filtered = [
  token.text
  for token in doc
  if not token.is_stop and token.is_alpha
]

print(tokens_filtered)

['Edith', 'Roosevelt', 'née', 'Carow', 'second', 'wife', 'President', 'Theodore', 'Roosevelt', 'lady', 'United', 'States', 'grew', 'alongside', 'Roosevelt', 'family', 'married', 'Theodore', 'Roosevelt', 'having', 'children', 'public', 'figure', 'husband', 'war', 'hero', 'Spanish', 'American', 'War', 'elected', 'governor', 'New', 'York', 'Theodore', 'vice', 'president', 'March', 'president', 'assassination', 'William', 'McKinley', 'September', 'Edith', 'controlled', 'press', 'reported', 'Roosevelts', 'regulated', 'Washington', 'social', 'life', 'organizing', 'weekly', 'meetings', 'cabinet', 'members', 'wives', 'gatekeeper', 'attend', 'formal', 'events', 'oversight', 'White', 'House', 'renovations', 'hiring', 'social', 'secretary', 'lady', 'Belle', 'Hagner', 'considered', 'enduring', 'legacies', 'remained', 'politically', 'active', 'despite', 'poor', 'health']


In [67]:
# Build the gensim dictionary from the one tokenized document, then convert
# that same document with doc2bow to form a one-entry corpus.
dictionary = corpora.Dictionary([tokens_filtered])

corpus = [dictionary.doc2bow(item) for item in [tokens_filtered]]

In [68]:
# Fit an LDA model with two topics, making ten passes over the corpus.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
)

# Print the header followed by each topic entry on its own line.
topics = lda_model.print_topics()
print("Topics:", *topics, sep="\n")

Topics:
(0, '0.015*"Roosevelt" + 0.014*"Theodore" + 0.014*"Edith" + 0.014*"president" + 0.014*"lady" + 0.014*"social" + 0.014*"meetings" + 0.014*"alongside" + 0.014*"politically" + 0.014*"public"')
(1, '0.039*"Roosevelt" + 0.030*"Theodore" + 0.022*"social" + 0.022*"lady" + 0.022*"president" + 0.022*"Edith" + 0.013*"Washington" + 0.013*"cabinet" + 0.013*"poor" + 0.013*"Hagner"')


In [69]:
num_topics = lda_model.num_topics
num_words = 10

# For every topic id, fetch its top `num_words` (word, probability) pairs.
topics = [
    lda_model.show_topic(topic_id, topn=num_words)
    for topic_id in range(num_topics)
]

print(topics)

[[('Roosevelt', 0.014721414), ('Theodore', 0.014415038), ('Edith', 0.014213961), ('president', 0.014197828), ('lady', 0.014182164), ('social', 0.014154064), ('meetings', 0.01413088), ('alongside', 0.014119037), ('politically', 0.014118592), ('public', 0.014111702)], [('Roosevelt', 0.038886614), ('Theodore', 0.0302811), ('social', 0.021661336), ('lady', 0.02165251), ('president', 0.021647595), ('Edith', 0.021642528), ('Washington', 0.013004153), ('cabinet', 0.013002196), ('poor', 0.013000809), ('Hagner', 0.012999)]]
