### Building a Counter with bag-of-words


In [1]:
from nltk.tokenize import word_tokenize
# Import Counter
from collections import Counter

In [2]:
f = open('articles.txt', 'r', encoding='utf-8')
article = f.read()
print(article)

﻿Image copyright EPA Image caption Uber has been criticised many times over the way it runs its business


Ride-sharing firm Uber is facing a criminal investigation by the US government.


The scrutiny has started because the firm is accused of using "secret" software that let it operate in regions where it was banned or restricted.


The software, called "greyball", helped it identify officials seeking to stop the service running.


A spokesman for Uber declined to comment on the investigation, reported the Reuters news agency.


It is claimed greyball was used in several areas, including Portland, Oregon, where the ride service was still seeking official approval to operate.


Bookings blocked


In those areas, transport regulation officials posed as passengers in a bid to prove that the company was operating illegally. Greyball worked out who the officials were and blocked them from booking rides with the company's drivers.


In a letter sent last week to transport regulators in Por

In [3]:
# Tokenize the article: tokens
tokens = word_tokenize(article)

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [t.lower() for t in tokens]

# Create a Counter with the lowercase tokens: bow_simple
bow_simple = Counter(lower_tokens)

# Print the 10 most common tokens
print(bow_simple.most_common(10))

[('the', 274), (',', 269), ('.', 189), ('to', 131), ('of', 119), ('a', 100), ('in', 99), ('and', 80), ('that', 67), ('’', 54)]


### Text preprocessing practice


In [4]:
f = open('english_stopwords.txt', 'r', encoding='utf-8')
english_stops = f.read()
print(english_stops)

i
me
my
myself
we
our
ours
ourselves
you
your
yours
yourself
yourselves
he
him
his
himself
she
her
hers
herself
it
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
should
now
d
ll
m
o
re
ve
y
ain
aren
couldn
didn
doesn
hadn
hasn
haven
isn
ma
mightn
mustn
needn
shan
shouldn
wasn
weren
won
wouldn


In [5]:
import nltk
nltk.download('wordnet')
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sclau\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sclau\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

# Retain alphabetic words: alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
no_stops = [t for t in alpha_only if t not in english_stops]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# Create the bag-of-words: bow
bow = Counter(lemmatized)

# Print the 10 most common tokens
print(bow.most_common(10))

[('said', 29), ('robot', 28), ('population', 22), ('news', 19), ('human', 16), ('growth', 16), ('fake', 15), ('country', 14), ('united', 14), ('machine', 13)]


### Creating and querying a corpus with gensim

In [46]:
f = open('wiki_text_computer.txt', 'r', encoding='utf-8')
articles = f.read()
print(articles)

{{redirect|Computer system||Computer (disambiguation)|and|Computer system (disambiguation)}}
{{pp-semi-indef}}
{{pp-move-indef}}
{{Infobox|title = Computer
|image = <div style="white-space:nowrap;">
File:Acer Aspire 8920 Gemstone by Georgy.JPG|x81pxFile:Columbia Supercomputer - NASA Advanced Supercomputing Facility.jpg|x81pxFile:Intertec Superbrain.jpg|x81px<br />File:2010-01-26-technikkrempel-by-RalfR-05.jpg|x79pxFile:Thinking Machines Connection Machine CM-5 Frostburg 2.jpg|x79pxFile:G5 supplying Wikipedia via Gigabit at the Lange Nacht der Wissenschaften 2006 in Dresden.JPG|x79px<br />File:DM IBM S360.jpg|x77pxFile:Acorn BBC Master Series Microcomputer.jpg|x77pxFile:Dell PowerEdge Servers.jpg|x77px 
|caption = Computers and computing devices from different eras}}

<!-- This paragraph is currently under discussion in talk. -->A '''computer''' is a device that can be Computer programming|instructed to carry out an arbitrary set of arithmetic or Boolean algebra|logical operations autom

In [47]:
# Tokenize the article: tokens
tokens = word_tokenize(articles)

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [t.lower() for t in tokens]

In [48]:
# Remove all stop words: no_stops
articles = [t for t in lower_tokens if t not in english_stops]
print(articles)



f = open('wiki_articles.txt', 'r', encoding='utf-8')
articles = f.read()
print(articles)

In [49]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary 

In [50]:
# Create a Dictionary from the articles: dictionary
articles = [d.split() for d in articles]
dictionary = Dictionary(articles)

In [56]:
print(dictionary)

Dictionary(3452 unique tokens: ['{', 'redirect|computer', 'system||computer', '(', 'disambiguation']...)


In [51]:
# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")
computer_id

13

In [52]:
# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

computer


In [53]:
# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]
corpus

[[(0, 1)],
 [(0, 1)],
 [(1, 1)],
 [(2, 1)],
 [(3, 1)],
 [(4, 1)],
 [(5, 1)],
 [(6, 1)],
 [(7, 1)],
 [(3, 1)],
 [(4, 1)],
 [(5, 1)],
 [(8, 1)],
 [(8, 1)],
 [(0, 1)],
 [(0, 1)],
 [(9, 1)],
 [(8, 1)],
 [(8, 1)],
 [(0, 1)],
 [(0, 1)],
 [(10, 1)],
 [(8, 1)],
 [(8, 1)],
 [(0, 1)],
 [(0, 1)],
 [(11, 1)],
 [(12, 1)],
 [(13, 1)],
 [(14, 1)],
 [(12, 1)],
 [(15, 1)],
 [(16, 1)],
 [(17, 1)],
 [(18, 1)],
 [(19, 1)],
 [(20, 1)],
 [(21, 1)],
 [(22, 1)],
 [(18, 1)],
 [(23, 1)],
 [(24, 1)],
 [(20, 1)],
 [(25, 1)],
 [(26, 1)],
 [(27, 1)],
 [(28, 1)],
 [(29, 1)],
 [(20, 1)],
 [(30, 1)],
 [(31, 1)],
 [(32, 1)],
 [(33, 1)],
 [(34, 1)],
 [(35, 1)],
 [(36, 1)],
 [(20, 1)],
 [(37, 1)],
 [(38, 1)],
 [(15, 1)],
 [(39, 1)],
 [(40, 1)],
 [(23, 1)],
 [(41, 1)],
 [(20, 1)],
 [(42, 1)],
 [(43, 1)],
 [(44, 1)],
 [(45, 1)],
 [(46, 1)],
 [(47, 1)],
 [(48, 1)],
 [(20, 1)],
 [(49, 1)],
 [(50, 1)],
 [(51, 1)],
 [(52, 1)],
 [(53, 1)],
 [(54, 1)],
 [(55, 1)],
 [(56, 1)],
 [(57, 1)],
 [(58, 1)],
 [(15, 1)],
 [(39, 1)],
 [(40

In [54]:
# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

[(3, 1)]


### Gensim bag-of-words

In [64]:
from collections import defaultdict
import itertools

In [65]:
# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
    
# Create the defaultdict: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

( 1


In [None]:
# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
    
# Create the defaultdict: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count
    
# Create a sorted list from the defaultdict: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 

# Print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)

### Tf-idf with Wikipedia

In [None]:
# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]

# Print the first five weights
print(tfidf_weights[:5])

In [1]:
# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]

# Print the first five weights
print(tfidf_weights[:5])

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

NameError: name 'TfidfModel' is not defined