In [2]:
import spacy
import textacy

# Load the 'en_core_web_sm' spaCy pipeline once; later cells call nlp(text) to build a Doc.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample passage parsed by every extraction cell below via nlp(text).
# NOTE(review): reads like an earnings-call excerpt; its provenance is not recorded here.
text = """At the start of 2019, free cash flow, our inventory and lease operating expense on a unit basis was initially forecasted to be flat year-over-year. However, we now expect
full year 2019 LOE to be 4% lower than 2018.
At the midpoint of our 2019 DD&A guidance range of $12.65, we were on track to deliver the lowest rate since EOG's transition from a
natural gas company to oil. Our permanent switch to premium drilling continues to transform the company, driving down finding and
development costs, reducing D&A and enabling EOG to deliver double-digit returns throughout commodity price cycles.
Also of note during the third quarter, we entered into long-term gas supply arrangements with Cheniere Energy. Consistent with our
strategy of having flexibility and diversity in marketing our products, these arrangements provide additional markets for offtake and
pricing diversity for up to 440 million BTU per day starting in 2020, with the ultimate goal of maximizing the realized price for our
growing production of low-cost natural gas.
Now I would like to provide some color on the Bakken and other Rockies plays. In the Bakken this quarter, we completed 15 wells with an
average IP-30 of 2,150 barrels of oil per day, 300 barrels of NGLs and 2 million cubic feet a day of natural gas. Our strong well results
reflect the impact of EOG precision targeting and our new completion techniques. The highlight for the quarter was the Clarks Creek 18
well that was completed in the Three Forks with an IP-30 of 3,800 barrels of oil per day. This well is our best well to date in the Bakken,
which along with strong performance from other wells completed this quarter, are a testament to the continued improvements we see
across our entire inventory."""

# Noun phrases using Textacy

### Potential issues:
* Some one-word noun chunks aren't that useful (fixed by keeping only those whose root is a NOUN)
* But others like 'inventory' will be lost if limited to bi/tri-grams

In [378]:
#Ryan on Feb 18, 2020
# This approach uses the textacy library to extract noun chunks
# and cross-references them with a list of bi-grams / tri-grams.
#
# A noun chunk becomes a "theme" when it is NOT a named entity and either
#   * matches a filtered bi-gram or tri-gram from the same sentence, or
#   * is a single token whose part of speech is NOUN.

doc = nlp(text)

# Extract a list of entities; any noun chunk that is already an entity is excluded
ents = list(textacy.extract.entities(doc))

# Filters shared by the bi-gram and tri-gram extraction:
# drop stop words, punctuation and numbers, and keep only the listed parts of speech
ngram_filters = dict(
    filter_stops=True,
    filter_punct=True,
    filter_nums=True,
    include_pos=["NOUN", "VERB", "ADJ"],
)

themes = []
for sent in doc.sents:

    # Extract a list of noun chunks, filtering out any leading determiners
    noun_chunks = list(textacy.extract.noun_chunks(sent, drop_determiners=True))

    # Extract the bi/tri-grams of this sentence using the shared filters
    bi_grams = list(textacy.extract.ngrams(sent, 2, **ngram_filters))
    tri_grams = list(textacy.extract.ngrams(sent, 3, **ngram_filters))

    for chunk in noun_chunks:
        # The entity exclusion must apply to every branch. The previous condition
        # `a in bi or a in tri and a not in ents` parsed as `a in bi or (a in tri and ...)`,
        # so bi-gram matches bypassed the entity check.
        if chunk in ents:
            continue

        is_ngram = chunk in bi_grams or chunk in tri_grams

        # For one-word chunks, ensure that the word is a noun
        is_single_noun = len(chunk) == 1 and chunk.root.pos_ == "NOUN"

        if is_ngram or is_single_noun:
            themes.append(chunk)

for theme in themes:
    print(theme)

print("Total themes:", len(themes))

free cash flow
operating expense
unit basis
lowest rate
premium drilling
development costs
commodity price cycles
additional markets
pricing diversity
ultimate goal
realized price
natural gas
strong performance
continued improvements
Total themes: 14


# N-grams using Textacy

### Potential Issues:
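* Bigrams split longer phrases, so 'free cash flow' gets split into 'free cash' and 'cash flow'. Tri-grams are also extracted below, but they are appended after the bigrams, so the first 10 results printed are all bigrams.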

In [225]:
doc = nlp(text)

# One set of filters for both n-gram sizes: no stop words, punctuation or numbers,
# and only tokens tagged NOUN, VERB or ADJ.
ngram_options = {
    "filter_stops": True,
    "filter_punct": True,
    "filter_nums": True,
    "include_pos": ["NOUN", "VERB", "ADJ"],
}

bi_grams = list(textacy.extract.ngrams(doc, 2, **ngram_options))
tri_grams = list(textacy.extract.ngrams(doc, 3, **ngram_options))

# Bigrams first, then tri-grams
ngrams = [*bi_grams, *tri_grams]

# Preview only the first ten n-grams
for gram in ngrams[:10]:
    print(gram)

lease operating
operating expense
unit basis
flat year
guidance range
lowest rate
natural gas
gas company
permanent switch
premium drilling
51


# Key Terms using Textacy

In [288]:
from textacy import ke

doc = nlp(text)

# Ask YAKE for the ten best single-word key terms, restricted to NOUN tokens.
# Each result prints as a (term, score) tuple.
yake_results = ke.yake(doc, ngrams=1, include_pos=["NOUN"], topn=10)
terms = list(yake_results)

for term in terms:
    print(term)

('gas', 0.34029095892656264)
('quarter', 0.34894857737527923)
('year', 0.35748024872718487)
('day', 0.3598592519105591)
('oil', 0.39596369193405345)
('barrel', 0.41454674227485694)
('cost', 0.5229074238363266)
('company', 0.524398786307582)
('start', 0.5306848412269063)
('price', 0.5359993208856335)


# NER using Textacy

In [240]:
doc = nlp(text)

# Every named entity in the document, with leading determiners dropped
found = textacy.extract.entities(doc, min_freq=0, drop_determiners=True)
entities = list(found)

# One entity per line, formatted as: text ( LABEL )
for ent in entities:
    print(f"{ent.text} ( {ent.label_} )")
    

start of 2019 ( DATE )
year-over-year ( DATE )
LOE ( ORG )
4% ( PERCENT )
2018 ( DATE )
2019 ( DATE )
DD&A ( ORG )
12.65 ( MONEY )
EOG ( ORG )
D&A ( ORG )
EOG ( ORG )
third quarter ( DATE )
Cheniere Energy. ( ORG )
440 million ( MONEY )
2020 ( DATE )
Bakken ( ORG )
Rockies ( ORG )
Bakken ( ORG )
quarter ( DATE )
15 ( CARDINAL )
2,150 barrels ( QUANTITY )
300 barrels ( QUANTITY )
2 million cubic feet ( QUANTITY )
EOG ( ORG )
quarter ( DATE )
Clarks ( NORP )
18
 ( DATE )
Three Forks with an IP-30 ( ORG )
3,800 barrels ( QUANTITY )
Bakken ( GPE )
quarter ( DATE )


# NER+ using Textacy

### We can extract entities from the text using our language model and classify those entities into 18 different types including:
* PERSON	People, including fictional.
* NORP	Nationalities or religious or political groups.
* FAC	Buildings, airports, highways, bridges, etc.
* ORG	Companies, agencies, institutions, etc.
* GPE	Countries, cities, states.
* LOC	Non-GPE locations, mountain ranges, bodies of water.
* PRODUCT	Objects, vehicles, foods, etc. (Not services.)
* EVENT	Named hurricanes, battles, wars, sports events, etc.
* WORK_OF_ART	Titles of books, songs, etc.
* LAW	Named documents made into laws.
* LANGUAGE	Any named language.
* **DATE	Absolute or relative dates or periods.**
* TIME	Times smaller than a day.
* PERCENT	Percentage, including “%”.
* **MONEY	Monetary values, including unit.**
* **QUANTITY	Measurements, as of weight or distance.**
* ORDINAL	“first”, “second”, etc.
* CARDINAL	Numerals that do not fall under another type.

In [373]:
# EXTRACTING DATES FROM THE TEXT

doc = nlp(text)

# Keep only entities labelled DATE, with leading determiners dropped
date_entities = textacy.extract.entities(
    doc,
    include_types=["DATE"],
    drop_determiners=True,
    min_freq=0,
)
dates = list(date_entities)

print("EXTRACTED ENTITIES:")
# Show only the first six dates
for date in dates[:6]:
    print(f"  {date.text} ({date.label_})")

EXTRACTED ENTITIES:
  start of 2019 (DATE)
  year-over-year (DATE)
  2018 (DATE)
  2019 (DATE)
  third quarter (DATE)
  2020 (DATE)


In [374]:
# EXTRACTING MONEY VALUES FROM THE TEXT

doc = nlp(text)

# Keep only entities labelled MONEY, with leading determiners dropped
money_entities = textacy.extract.entities(
    doc,
    include_types=["MONEY"],
    drop_determiners=True,
    min_freq=0,
)
monies = list(money_entities)

print("EXTRACTED ENTITIES:")
for money in monies:
    print(f"  {money.text} ({money.label_})")

EXTRACTED ENTITIES:
  12.65 (MONEY)
  440 million (MONEY)


In [4]:
# EXTRACTING NUMERIC ENTITIES FROM THE TEXT
# The "NUMERIC" filter is broader than QUANTITY alone: the output recorded for this
# cell contains DATE, PERCENT, MONEY, CARDINAL and QUANTITY labels.

doc = nlp(text)

numeric_entities = textacy.extract.entities(
    doc,
    include_types=["NUMERIC"],
    drop_determiners=True,
    min_freq=0,
)
figures = list(numeric_entities)

print("EXTRACTED ENTITIES:")
for figure in figures:
    print(f"  {figure.text} ({figure.label_})")

EXTRACTED ENTITIES:
  start of 2019 (DATE)
  year-over-year (DATE)
  4% (PERCENT)
  2018 (DATE)
  2019 (DATE)
  12.65 (MONEY)
  third quarter (DATE)
  440 million (MONEY)
  2020 (DATE)
  quarter (DATE)
  15 (CARDINAL)
  2,150 barrels (QUANTITY)
  300 barrels (QUANTITY)
  2 million cubic feet (QUANTITY)
  quarter (DATE)
  18
 (DATE)
  3,800 barrels (QUANTITY)
  quarter (DATE)
