# Introduction to Natural Language Processing in Python

## Chapter 2

In [42]:
import glob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
import itertools

In [43]:
# Toy sentence used to illustrate a bag-of-words count
text = "The cat is in the box. The cat box."

# Tokenize first, then tally each token; the Counter is the cell's displayed value
sample_tokens = word_tokenize(text)
Counter(sample_tokens)

Counter({'The': 2, 'cat': 2, 'is': 1, 'in': 1, 'the': 1, 'box': 2, '.': 2})

In [44]:
# Read the debugging article into memory: article
# The encoding is given explicitly so the text is decoded the same way on
# every platform (the default depends on the locale); this matches how the
# wiki files are opened when the whole collection is loaded.
with open("wiki_text_debugging.txt", "r", encoding="utf8") as f_article:
    article = f_article.read()

# print(article)

In [45]:
# Split the article text into tokens: tokens
tokens = word_tokenize(article)

# Lowercase every token so differently-cased forms are counted together: lower_tokens
lower_tokens = list(map(str.lower, tokens))

# Simple bag-of-words (token -> number of occurrences): bow_simple
bow_simple = Counter(lower_tokens)

# Show the ten most frequent tokens with their counts
top_tokens = bow_simple.most_common(10)
print(top_tokens)

[(',', 151), ('the', 150), ('.', 89), ('of', 81), ("''", 69), ('to', 63), ('a', 60), ('``', 47), ('in', 44), ('and', 41)]


In [46]:
# Load the English stop-word list (presumably one word per line): english_stops
# The encoding is explicit so the file is decoded identically on every platform.
with open("english_stopwords.txt", "r", encoding="utf8") as f_english:
    text_english = f_english.read()

# splitlines() handles both "\n" and "\r\n" line endings and, unlike
# split("\n"), does not produce a spurious "" entry when the file ends
# with a newline.
english_stops = text_english.splitlines()

In [47]:
# Retain alphabetic words (tokens containing digits or punctuation are dropped): alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
# Membership tests against a set are constant-time; testing against the
# english_stops list directly would rescan the list for every token.
english_stop_set = set(english_stops)
no_stops = [t for t in alpha_only if t not in english_stop_set]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
# lemmatize() is called without a part-of-speech argument, so the
# lemmatizer's default is used for every token.
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# Create the bag-of-words (lemma -> number of occurrences): bow
bow = Counter(lemmatized)

# Print the 10 most common tokens
print(bow.most_common(10))

[('debugging', 39), ('system', 25), ('bug', 17), ('software', 16), ('problem', 15), ('tool', 15), ('computer', 14), ('process', 13), ('term', 13), ('debugger', 13)]


In [48]:
# Get all tokens for all Wiki Articles: one list of cleaned, lemmatized
# tokens per file, collected in wiki_all / articles.

# glob.glob() returns matches in arbitrary, OS-dependent order. Later cells
# select "the fifth document" by position (corpus[4]), so the file list is
# sorted to make document positions stable across machines and runs.
list_wiki_files = sorted(glob.glob("wiki*.txt"))
print(list_wiki_files)

# Loop-invariant helpers, built once instead of once per file.
wiki_wordnet_lemmatizer = WordNetLemmatizer()
wiki_stop_set = set(english_stops)  # set gives constant-time stop-word lookups

wiki_all = []

for wiki_file in list_wiki_files:
    print(wiki_file)
    with open(wiki_file, encoding="utf8") as f_wiki:
        wiki_article = f_wiki.read()

    # Tokenize the article: wiki_tokens
    wiki_tokens = word_tokenize(wiki_article)

    # Convert the tokens into lowercase: wiki_lower_tokens
    wiki_lower_tokens = [t.lower() for t in wiki_tokens]

    # Retain alphabetic words: wiki_alpha_only
    wiki_alpha_only = [t for t in wiki_lower_tokens if t.isalpha()]

    # Remove all stop words: wiki_no_stops
    wiki_no_stops = [t for t in wiki_alpha_only if t not in wiki_stop_set]

    # Lemmatize all tokens into a new list: wiki_lemmatized
    wiki_lemmatized = [wiki_wordnet_lemmatizer.lemmatize(t) for t in wiki_no_stops]

    wiki_all.append(wiki_lemmatized)

# Number of articles processed
print(len(wiki_all))
articles = wiki_all

['wiki_text_bug.txt', 'wiki_text_computer.txt', 'wiki_text_crash.txt', 'wiki_text_debugger.txt', 'wiki_text_debugging.txt', 'wiki_text_exception.txt', 'wiki_text_hopper.txt', 'wiki_text_language.txt', 'wiki_text_malware.txt', 'wiki_text_program.txt', 'wiki_text_reversing.txt', 'wiki_text_software.txt']
wiki_text_bug.txt
wiki_text_computer.txt
wiki_text_crash.txt
wiki_text_debugger.txt
wiki_text_debugging.txt
wiki_text_exception.txt
wiki_text_hopper.txt
wiki_text_language.txt
wiki_text_malware.txt
wiki_text_program.txt
wiki_text_reversing.txt
wiki_text_software.txt
12


In [49]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Build the token <-> integer id mapping from the tokenized articles: dictionary
dictionary = Dictionary(articles)

# Look up the id assigned to the token "computer" (None if absent): computer_id
computer_id = dictionary.token2id.get("computer")

# Map the id back to its token to confirm the round trip
print(dictionary.get(computer_id))

# Convert every article into a bag-of-words list of (word_id, count) pairs: corpus
# (a plain Python list, one entry per article)
corpus = list(map(dictionary.doc2bow, articles))

# Show the first 10 (word_id, count) pairs of the fifth document
fifth_document_head = corpus[4][:10]
print(fifth_document_head)

computer
[(1, 1), (13, 1), (14, 1), (17, 1), (24, 1), (27, 1), (33, 1), (34, 4), (42, 2), (43, 7)]


In [50]:
# Fifth document of the corpus: doc
doc = corpus[4]

# Order its (word_id, count) pairs from most to least frequent: bow_doc
bow_doc = sorted(doc, key=lambda pair: pair[1], reverse=True)

# Show the five most frequent words of the document with their counts
for word_id, word_count in itertools.islice(bow_doc, 5):
    print(dictionary.get(word_id), word_count)

# Accumulate each word's count over every document: total_word_count
# (defaultdict(int) starts unseen ids at 0)
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain(*corpus):
    total_word_count[word_id] += word_count

debugging 39
system 25
bug 17
software 16
problem 15


In [51]:
# Fifth document of the corpus: doc
doc = corpus[4]

# Rank the document's (word_id, count) pairs by descending count: bow_doc
bow_doc = sorted(doc, key=lambda pair: pair[1], reverse=True)

# Top 5 words of this one document, with counts
for word_id, word_count in itertools.islice(bow_doc, 5):
    print(dictionary.get(word_id), word_count)

# Sum every word's count across the whole corpus: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain(*corpus):
    total_word_count[word_id] += word_count

# Rank the corpus-wide totals by descending count: sorted_word_count
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda item: item[1], reverse=True)

# Top 5 words across all documents, with counts
for word_id, word_count in itertools.islice(sorted_word_count, 5):
    print(dictionary.get(word_id), word_count)

debugging 39
system 25
bug 17
software 16
problem 15
computer 753
software 451
program 341
cite 322
language 320


In [52]:
from gensim.models.tfidfmodel import TfidfModel

In [53]:
# Fit a tf-idf model on the bag-of-words corpus: tfidf
tfidf = TfidfModel(corpus)

# Apply the model to doc to get its (term_id, weight) pairs: tfidf_weights
tfidf_weights = tfidf[doc]

# Show the first five (term_id, weight) pairs
print(tfidf_weights[:5])

# Rank the pairs from highest to lowest weight: sorted_tfidf_weights
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key=lambda pair: pair[1], reverse=True)

# Show the five highest-weighted words of the document
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

[(1, 0.012844137985779271), (13, 0.012844137985779271), (14, 0.012844137985779271), (17, 0.012844137985779271), (24, 0.02035747706154831)]
wolf 0.23022876516553425
debugging 0.20790115749039909
fence 0.1841830121324274
squeeze 0.13813725909932056
tron 0.13813725909932056
