In [3]:
# Import Counter
from collections import Counter
from nltk.tokenize import word_tokenize

In [1]:
# Import data
f= open('articles.txt','r')
articles = f.read()
f.close()

In [4]:
# Tokenize the article: tokens
tokens = word_tokenize(articles)

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [t.lower() for t in tokens]

In [4]:
# Create a Counter with the lowercase tokens: bow_simple
bow_simple = Counter(lower_tokens)

# Print the 10 most common tokens
print(bow_simple.most_common(10))

[('the', 274), (',', 269), ('.', 188), ('to', 131), ('of', 119), ('a', 100), ('in', 99), ('and', 80), ('that', 67), ('’', 54)]


# Text preprocessing practice

In [5]:
f= open('english_stopwords.txt','r')
english_stops = f.read()
f.close()

In [6]:
# Retain alphabetic words: alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
no_stops = [t for t in alpha_only if t not in english_stops]

In [7]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
# Instantiate the WordNetLemmatizer
wordnet_lemmatizer =WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t)  for t in no_stops]

In [8]:
# Create the bag-of-words: bow
bow =Counter(lemmatized)

In [9]:
# Print the 10 most common tokens
print(bow.most_common(10))

[('said', 29), ('robot', 28), ('population', 22), ('news', 19), ('human', 16), ('growth', 16), ('fake', 15), ('country', 14), ('united', 14), ('machine', 13)]


# named-entity recognition with NLTK

In [10]:
import nltk 
from nltk.tokenize import word_tokenize,sent_tokenize

# Tokenize the article into sentences: sentences
sentences = nltk.sent_tokenize(articles)

# Tokenize each sentence into words: token_sentences
token_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [11]:
# Tag each tokenized sentence into parts of speech: pos_sentences
pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences] 

# Create the named entity chunks: chunked_sentences
chunked_sentences = nltk.ne_chunk_sents(pos_sentences,binary=True)

# Test for stems of the tree with 'NE' tags
for sent in chunked_sentences:
    for chunk in sent:
        if hasattr(chunk, "label") and chunk.label() == "NE" :
            print(chunk)

(NE Image/NN)
(NE EPA/NNP Image/NNP)
(NE US/JJ)
(NE Reuters/NNPS)
(NE Portland/NNP)
(NE Oregon/NNP)
(NE Greyball/NNP)
(NE Portland/NNP)
(NE Uber/NNP)
(NE Uber/NNP)
(NE New/NNP York/NNP Times/NNP)
(NE Uber/NNP)
(NE Alexa/NNP)
(NE Siri/NNP)
(NE Rachael/NNP Tatman/NNP)
(NE South/NNP)
(NE New/NNP Zealand/NNP)
(NE Tatman/NNP)
(NE San/NNP Francisco/NNP)
(NE University/NNP)
(NE Washington/NNP)
(NE Virtual/NNP Assistant/NNP Summit/NNP)
(NE Park/NNP Central/NNP Hotel/NNP)
(NE Apple/NNP)
(NE Siri/NNP)
(NE Amazon/NNP)
(NE Echo/NNP)
(NE Tatman/NNP)
(NE Tatman/NNP)
(NE YouTube/NNP)
(NE Southern/JJ)
(NE California/NNP)
(NE South/NNP)
(NE United/NNP States/NNPS)
(NE Southern/NNP)
(NE Stephen/NNP Scarr/NNP)
(NE CEO/NNP)
(NE eContext/NN)
(NE YouTube/NNP)
(NE Amazon/NNP)
(NE Alexa/NNP)
(NE Alonso/NNP Martinez/NNP)
(NE Pixar/NNP Animation/NNP Studios/NNP)
(NE Martinez/NNP)
(NE Emeryville/NNP)
(NE Elena/NNP Corina/NNP Grigore/NNP)
(NE Yale/NNP University/NNP)
(NE Social/NNP Robotics/NNP Lab/NNP)
(NE Grigo

# Charting practice

In [12]:
from collections import defaultdict            
from matplotlib import pyplot as plt 
%matplotlib notebook

In [13]:
# Create the named entity chunks: chunked_sentences
chunked_sentences1 = nltk.ne_chunk_sents(pos_sentences)

In [14]:
# Create the defaultdict: ner_categories
ner_categories = defaultdict(int)
# Create the nested for loop
for sent in chunked_sentences1:
    for chunk in sent:
        if hasattr(chunk, 'label'):
           ner_categories[chunk.label()]+= 1

In [15]:
# Create a list from the dictionary keys for the chart labels: labels
labels = list(ner_categories.keys())

In [16]:
# Create a list of the values: values
values = [ner_categories.get(l) for l in labels]

In [17]:
# Create the pie chart
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=140)

# Display the chart
plt.show()

<IPython.core.display.Javascript object>

# Spacy NER

In [2]:
# Import spacy
import spacy

# Instantiate the English model: nlp
nlp = spacy.load('en',tagger=False, parser=False, matcher=False)

# Create a new document: doc
doc = nlp(articles)

# Print all of the found entities and their labels
for ent in doc.ents:
    print(ent.label_, ent.text)


ORG EPA
ORG Uber
GPE 

ORG Uber
GPE US
GPE 

GPE 

GPE 

ORG Uber
ORG Reuters
GPE 

GPE Portland
GPE Oregon
GPE 

PERSON Bookings
GPE 

GPE 

DATE last week
GPE Portland
ORG Uber
DATE April 2015
ORG Uber's
ORG the New York Times
DATE earlier this year
ORG Uber
GPE 

ORG Uber
GPE 

ORG Uber
ORG Google
ORG Waymo
ORG Alexa
PERSON Siri
PERSON Rachael Tatman
GPE 

LOC South
GPE New Zealand
ORG Tatman
DATE last week
GPE San Francisco
GPE 

GPE Twitter
EVENT ”

ORG the University of Washington’s
DATE two-day
PERSON Summit
DATE Friday
FAC the Park Central Hotel
WORK_OF_ART Deep Learning Summit
CARDINAL about 600
GPE 

ORG Apple
ORG Amazon
GPE Echo
GPE 

CARDINAL as little as two
GPE 

NORP Tatman
ORG YouTube
NORP Southern
GPE California
GPE 

LOC South
GPE the United States
NORP Southern
GPE 

PERSON Stephen Scarr
ORG Info.com
ORG eContext
GPE 

PERCENT 20 percent
CARDINAL 1
GPE Scarr
GPE 

NORP YouTube
ORG Amazon
ORG Alexa
GPE 

PERSON Martinez
ORG Pixar Animation Studios
GPE 

PERSON Martine