## TP2: Gensim – Vectorizing Text, Transformations, and n-grams

In [53]:
import nltk
# Fetch the NLTK data packages this notebook relies on.
# 'reuters' is the corpus read in a later cell through nltk.corpus.reuters.
nltk.download('reuters')
# 'punkt' is NLTK's sentence tokenizer data — presumably needed by
# reuters.sents() to split the raw text into sentences; TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
import spacy
import gensim
from gensim import corpora
from gensim import models
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import reuters
# Name of the spaCy pipeline used for stop-word / punctuation flags and lemmas.
SPACY_MODEL_NAME = "en_core_web_sm"
# Reuters categories whose sentences make up the working corpus.
REUTERS_CATEGORIES = ["cpu"]

nlp = spacy.load(SPACY_MODEL_NAME)
# Tokenized sentences (one list of tokens per sentence) from the selected categories.
corpus = reuters.sents(categories=REUTERS_CATEGORIES)

In [55]:
# Re-join each tokenized Reuters sentence into one string so that spaCy can
# run its own tokenization and analysis on it.
documents = [" ".join(sent) for sent in corpus]

# One list of lemmas per document, keeping only tokens that are not stop
# words, punctuation, or number-like. Lemmas keep spaCy's casing, so e.g.
# "CAPACITY" and "capacity" stay distinct entries.
# nlp.pipe streams the documents through the pipeline in batches instead of
# issuing a separate nlp() call per document.
texts = [
    [
        token.lemma_
        for token in parsed
        if not (token.is_stop or token.is_punct or token.like_num)
    ]
    for parsed in nlp.pipe(documents)
]

In [56]:
# Build the vocabulary: each distinct token found in `texts` is assigned a
# unique integer id.
dictionary = corpora.Dictionary(documents=texts)

# Display the complete token -> id mapping.
token_ids = dictionary.token2id
print(token_ids)


{'AUGUST': 0, 'CAPACITY': 1, 'INDUSTRIAL': 2, 'PCT': 3, 'RATE': 4, 'SEPTEMBER': 5, 'UNCHANGED': 6, 'USE': 7, 's': 8, 'u': 9, 'CANADA': 10, 'Canada': 11, 'MANUFACTURING': 12, 'Statistics': 13, 'UTILIZATION': 14, 'canadian': 15, 'capacity': 16, 'manufacturing': 17, 'pct': 18, 'quarter': 19, 'rate': 20, 'rise': 21, 'say': 22, 'utilization': 23, 'agency': 24, 'change': 25, 'federal': 26, 'mark': 27, 'small': 28, 'building': 29, 'construction': 30, 'increase': 31, 'industry': 32, 'lead': 33, 'material': 34, 'metallic': 35, 'mineral': 36, 'non': 37, 'residential': 38, 'sector': 39, 'strong': 40, 'FEB': 41, 'JAN': 42, 'ROSE': 43, 'Board': 44, 'December': 45, 'FEBRUARY': 46, 'February': 47, 'Federal': 48, 'January': 49, 'U': 50, 'compare': 51, 'factory': 52, 'mine': 53, 'operate': 54, 'reserve': 55, 'revise': 56, 'utility': 57, 'Fed': 58, 'previously': 59, 'assembly': 60, 'automobile': 61, 'gain': 62, 'help': 63, 'metal': 64, 'primary': 65, 'production': 66, 'raise': 67, 'surge': 68, 'durable'

In [57]:
# Build the bag-of-words representation: every text becomes a list of
# (token id, count) pairs.
# NOTE: this rebinds `corpus`, which until now held the raw Reuters sentences;
# re-running the earlier `documents = ...` cell after this one will not work.
corpus = list(map(dictionary.doc2bow, texts))
print(corpus)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2), (19, 2), (20, 1), (21, 2), (22, 1), (23, 1)], [(17, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)], [(18, 1), (29, 1), (30, 1), (31, 3), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)], [(1, 1), (2, 1), (3, 2), (4, 1), (7, 1), (8, 1), (9, 1), (41, 1), (42, 1), (43, 1)], [(1, 1), (2, 1), (3, 2), (4, 1), (7, 1), (8, 1), (9, 1), (41, 1), (42, 1), (43, 1)], [(1, 1), (3, 1), (4, 1), (7, 1), (8, 2), (9, 1), (16, 1), (18, 2), (22, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)], [(18, 2), (20, 1), (22, 1), (45, 1), (49, 1), (58, 1), (59, 1)], [(16, 1), (17, 1), (18, 2), (47, 1), (49, 1), (60

In [58]:
# TF-IDF: a numerical statistic that reflects how important a word is to a
# document relative to the whole collection of documents.
tfidf = models.TfidfModel(corpus)

# Apply the fitted model to the bag-of-words corpus and print each document's
# (token id, weight) pairs on its own line.
weighted_corpus = tfidf[corpus]
for weighted_doc in weighted_corpus:
    print(weighted_doc)


[(0, 0.4235485299204986), (1, 0.25500162425090656), (2, 0.29604771779504524), (3, 0.25500162425090656), (4, 0.25500162425090656), (5, 0.4235485299204986), (6, 0.4235485299204986), (7, 0.25500162425090656), (8, 0.2214645238901618), (9, 0.25500162425090656)]
[(0, 0.4235485299204986), (1, 0.25500162425090656), (2, 0.29604771779504524), (3, 0.25500162425090656), (4, 0.25500162425090656), (5, 0.4235485299204986), (6, 0.4235485299204986), (7, 0.25500162425090656), (8, 0.2214645238901618), (9, 0.25500162425090656)]
[(10, 0.3061219610608839), (11, 0.3061219610608839), (12, 0.3061219610608839), (13, 0.3061219610608839), (14, 0.3061219610608839), (15, 0.3061219610608839), (16, 0.12302919028441175), (17, 0.1644620563424265), (18, 0.07289438663373786), (19, 0.47058401740331046), (20, 0.10727714567609761), (21, 0.328924112684853), (22, 0.10727714567609761), (23, 0.19385914264364043)]
[(17, 0.1833368319997831), (19, 0.5245914155470592), (20, 0.11958899500371396), (21, 0.1833368319997831), (22, 0.119

In [59]:
# Bigram and trigram extraction with gensim's Phrases.
# Both models share the same settings (min_count=1, threshold=1).
phrase_settings = {"min_count": 1, "threshold": 1}

# First pass learns token pairs; the second pass is trained on the
# bigram-transformed texts so that it can extend pairs into longer phrases.
bigram = Phrases(texts, **phrase_settings)
trigram = Phrases(bigram[texts], **phrase_settings)


def merge_phrases(tokens):
    """Apply the bigram model, then the trigram model, to one token list."""
    return trigram[bigram[tokens]]


# NOTE: `texts` is rebound from itself here, so re-running this cell applies
# the phrase models to already-merged texts.
texts = [merge_phrases(tokens) for tokens in texts]


In [60]:
from prettytable import PrettyTable

# Side-by-side table: the original sentence next to its token list after
# bigram/trigram merging, numbered from 1.
table = PrettyTable(["Document", "Original Text", "Bigrams and Trigrams"])

for number, (doc, text) in enumerate(zip(documents, texts), start=1):
    table.add_row([number, doc, text])

print(table)

+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Document |                                                                                                              Original Text                                                                                                              |                                                                                                Bigrams and Trigrams                                                                                               |
+----------+----------------------------------------------------------------------