In [1]:
# Import the dependencies
import spacy
from nltk.corpus import reuters
from spacy import displacy
from collections import Counter
import pandas as pd

# Load the small English language model for spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
# Analyze a sentence using spacy
doc = nlp(u"""Patrick Mahomes is a quarterback for the Kansas City Chiefs in the American Conference, 
which is one of two conferences in the National Football League.""")

# Access the tagged entities with .text and .label_
for word in doc.ents:
    print(word.text + " --> " + word.label_)

Patrick Mahomes --> PERSON
the Kansas City Chiefs --> ORG
the American Conference --> ORG
one --> CARDINAL
two --> CARDINAL
the National Football League --> ORG


In [5]:
# Get all the categories in the Reuters corpus. 
categories = reuters.categories()
print(categories)

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [8]:
# Locate and store a single article from the Reuters stories with the category "coffee".

article = reuters.raw(reuters.fileids(categories="coffee")[0])

print(article)

INDONESIAN COMMODITY EXCHANGE MAY EXPAND
  The Indonesian Commodity Exchange is
  likely to start trading in at least one new commodity, and
  possibly two, during calendar 1987, exchange chairman Paian
  Nainggolan said.
      He told Reuters in a telephone interview that trading in
  palm oil, sawn timber, pepper or tobacco was being considered.
      Trading in either crude palm oil (CPO) or refined palm oil
  may also be introduced. But he said the question was still
  being considered by Trade Minister Rachmat Saleh and no
  decision on when to go ahead had been made.
      The fledgling exchange currently trades coffee and rubber
  physicals on an open outcry system four days a week.
      "Several factors make us move cautiously," Nainggolan said.
  "We want to move slowly and safely so that we do not make a
  mistake and undermine confidence in the exchange."
      Physical rubber trading was launched in 1985, with coffee
  added in January 1986. Rubber contracts are traded FOB

In [9]:
# Analyze the article with spacy
doc = nlp(article)

# Render NER visualization with displacy to determine entities for extraction
displacy.render(doc, style='ent')

In [10]:
articles = reuters.raw(categories = "coffee")

In [14]:
len(articles)

'I'

In [15]:
# Store all reuters articles with category "coffee".
articles = reuters.raw(categories = "coffee")

# Set articles to be analyzed with spacy
doc = nlp(articles)

In [16]:
# Extract geopolitical "GPE" and organizational entities "ORG" using a list comprehension.
geo_org_entities = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'ORG']]


# Print the first 20 entities.
geo_org_entities[:20]

['INDONESIAN COMMODITY',
 'The Indonesian Commodity Exchange',
 'Reuters',
 'CPO',
 'Trade',
 'Indonesia',
 'Trade',
 'Indonesia',
 'Nainggolan',
 'South Korea',
 'Taiwan',
 'Mexico',
 'Colombia',
 'the Foreign Trade\n  Institute',
 'Colombia',
 'The National Planning Department',
 'Colombia',
 "National Coffee Growers' Federation",
 'New York',
 'Colombia']

In [17]:
# Using a list comprehension convert each entity to lowercase and remove the newline character. 
entities = [i.lower().replace('\n', '') for i in geo_org_entities]

# Print the entities
entities[:20]

['indonesian commodity',
 'the indonesian commodity exchange',
 'reuters',
 'cpo',
 'trade',
 'indonesia',
 'trade',
 'indonesia',
 'nainggolan',
 'south korea',
 'taiwan',
 'mexico',
 'colombia',
 'the foreign trade  institute',
 'colombia',
 'the national planning department',
 'colombia',
 "national coffee growers' federation",
 'new york',
 'colombia']

In [18]:
# Create a variable, most_freq_entities, that stores the most frequent entities 
# using the most_common() function from the Counter module.
entities = Counter(entities).most_common()

# Print the first 10 most frequent entities
entities[:10]

[('brazil', 172),
 ('ico', 125),
 ('u.s.', 84),
 ('colombia', 81),
 ('london', 59),
 ('ibc', 41),
 ('reuters', 32),
 ('indonesia', 28),
 ('india', 25),
 ('uganda', 21)]

In [21]:
# Use list comprehensions to retrieve each entity and the number of occurrences for each entity in separate lists.
entity = [entities[i][0] for i, _ in enumerate(entities)]
frequency = [entities[i][1] for i, _ in enumerate(entities)]

In [22]:
# Create a DataFrame called, adjs_df, that has columns to hold the
# inaugural addresses, the common adjective, and the number of times each adjective appears.


# Sort the DataFrame


# Display the first ten rows. 


common_entities_df = pd.DataFrame(
    {
        'entity':entity,
        'frequency':frequency
    }
)

# Sort the DataFrame
common_entities_df.sort_values(by=['frequency'], ascending=False).reset_index(drop=True)

# Display the first ten rows. 
common_entities_df.head(10)

Unnamed: 0,entity,frequency
0,brazil,172
1,ico,125
2,u.s.,84
3,colombia,81
4,london,59
5,ibc,41
6,reuters,32
7,indonesia,28
8,india,25
9,uganda,21


In [23]:
# Display the last ten rows. 
common_entities_df.tail(10)

Unnamed: 0,entity,frequency
410,the international coffee agreement,1
411,quotas,1
412,the national coffee growers' federation,1
413,back,1
414,syndarma,1
415,resettle,1
416,farms,1
417,the federal statistics office,1
418,attend ico executive board,1
419,dutch coffee roasters' association,1
