# Extracting Information from Text Data Assignment

In [1]:
#!pip install spacy
#!pip install textacy --quiet
#!pip install rake_nltk --quiet
#!conda install -c conda-forge textacy
#!pip install gensim

In [2]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [3]:
def corpus_stats(corpus):
    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(len(corpus.sents())))
    print("Number of words: " + str(len(corpus.words())))
    print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
    print("Avg chars per word: " + str(round(len(corpus.raw())/len(corpus.words()),1)))
    print("Avg words per sentence: " + str(round(len(corpus.words())/len(corpus.sents()),1)))

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [4]:
PATH = 'cnn_lite/'
DOC_PATTERN = r'articles_text.*\.p'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 14
Number of paragraphs: 14
Number of sentences: 427
Number of words: 13668
Vocabulary: 2927
Avg chars per word: 5.0
Avg words per sentence: 32.0


### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [6]:
for doc in docs:
    keys = keywords(doc, words=5, lemmatize=True, split=True)
    print(keys)

['pink', 'carey', 'tour', 'year', 'pretty']
['patrick', 'primary', 'telling', 'democrats', 'cnn']
['narwhal', 'tail', 'puppy', 'unicorn', 'dogs']
['states', 'democratic', 'bloomberg', 'told', 'running']
['republican', 'taylor', 'rep', 'presidents', 'ukraine']
['muslimness', 'people', 'skin', 'white', 'religion']
['trump', 'new', 'said', 'republican', 'media']
['said', 'police', 'brown', 'jones', 'roanoke']
['trump', 'hotels', 'office', 'profit', 'owned']
['keys', 'grammys', 'award', 'alicia', 'power']
['americans', 'republican', 'trump', 'ukrainians', 'investigate']
['student', 'said', 'told', 'pence', 'schools']
['crows', 'disney', 'american', 'old', 'movie']
['protester', 'police', 'chinese', 'new', 'kong']


### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [7]:
r = Rake()
for doc in docs:
    key_phrases = r.extract_keywords_from_text(doc)
    key_phrases = r.get_ranked_phrases_with_scores()
    print(key_phrases[0:3])

[(32.5, 'country music association awards red carpet'), (17.166666666666664, 'school soon ," pink said'), (16.5, 'country star chris stapleton')]
[(94.03333333333333, 'former new york city mayor michael bloomberg stepped forward last week'), (30.61111111111111, 'patrick could seize upon potential advantages'), (29.0, 'elections process would ultimately splash back')]
[(28.166666666666668, 'little magical furry unicorn ," according'), (17.666666666666664, 'dog rescue nonprofit organization mac'), (13.666666666666666, 'rescue workers speculate may')]
[(40.45, 'current 2020 democrats -- clinton told bbc radio'), (37.666666666666664, 'former new york mayor michael bloomberg made'), (29.0, '2016 democratic nominee playfully tweeted back')]
[(69.41666666666667, 'money ," tweeted white house press secretary stephanie grisham'), (67.75, 'former white house homeland security adviser tom bossert summed'), (63.416666666666664, 'former national security council russia expert fiona hill said')]
[(2

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
entities = []
num_types = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
for doc in docs:
    temp = []
    spacy_doc = nlp(doc)
    for entity in spacy_doc.ents:
        if entity.label_ not in num_types:
            temp.append(entity)
            
    entities.append(temp)

for entity_list in entities:
    print(entity_list) 

[Entertainment Tonight, the Country Music Association Awards, Carey Hart, Willow, Jameson, Love Me Anyway, Chris Stapleton, Willow, Jameson, Pink, "Carey, Hart]
[Deval Patrick, Massachusetts, Deval Patrick, CNN, Patrick, New Hampshire, Concord, CBS This Morning, Patrick, Patrick, Alabama, Arkansas, New Hampshire, Patrick, Massachusetts, New Hampshire, African American, South Carolina, Democratic, CNN, Patrick, the Democratic Party's, New York City, Michael Bloomberg, Patrick, The New York Times, Patrick, Boston, Barack Obama, Facebook, Diane, Patrick, WBUR, Diane, Bain Capital, Boston, Republican, Massachusetts, US, Utah, Patrick, Romney, Democratic, Democrats, Patrick, Bain Capital, CNN, "Patrick, South Carolina, Texas, New Jersey]
[Meet Narwhal, Missouri, Narwhal, Narwhal the Little Magical Furry Unicorn, Mac, Narwhal, Mac's Mission, Facebook, Narwhal, Facebook, Poppa Smurf, Daschund Terrier, Narwhal, Daschund, Mac, CNN Narwhal, Missouri, Midwest, Mac, Narwhal]
[Democrats, Democratic

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [10]:
for doc in docs:
    results = []
    spacy_doc = nlp(doc)
    for sent in spacy_doc.sents:
        svo = textacy.extract.subject_verb_object_triples(sent)
        results += svo
    
    print(list(set(results)))

[(star, praised, husband), (she, will celebrate, years), (he, follows, me), (she, will be taking, step), (Jameson, 's going, to start)]
[(Patrick, defended, work), (he, has made, decision), (Patrick, had built, team), (Patrick, has missed, to appear), (he, was going, to jump), (that, became, liability), (Patrick, told, friends), (source, tells, CNN), (Patrick, told, WBUR), (frontrunner Democrats, have cast, interests), (Gov. Deval Patrick, told, allies), (he, wanted, to put), (he, could make, minute entry), (He, cited, it), (I, 've left, conscience), (he, planned, bid), (he, seeks, nomination), (Patrick, has missed, deadline), (he, would be entering, race), (he, ruled, bid), (I, 've never taken, job), (Deval Patrick, tells, allies), (Gov. Deval Patrick, told, friends), (Patrick, entered, sector)]
[(it, doesn't cause, him), (face tail, does not bother, Narwhal), (He, seems, other), (organization, received, adoption applications), (spokesperson, told, CNN), (who, has been named, Smurf), 

[(parents, should meet, them), (School officials, are sending, students), (President, has directed, enforcement agencies), (President Mike Pence, gave, to), (she, heard, shot), (gunman, shot, classmates), (sheriff, told, Cooper), (President, has directed, to support), (neighbor, told, him), (student, told, CNN affiliate), (we, 're going, figure), (others, took, cover), (suspect, may have posted, threats), (parents, were told, to meet), (Officials, shepherded, students), (It, scared, us), (I, got, them), (President Mike Pence, gave, condolences), (gunman, shot, himself), (Students, are trained, to take), (we, end, evil), (we, 're going, have), (suspect, shooting, classmates), (I, hate, have), (groups, walked, file), (student, told, KCBS), (enforcement sources, told, CNN.Authorities), (Kent Wegener, told, reporters), (we, 're going, to move), (students, fled, building), (Pence, called, it), (I, wanted, to make), (he, was leaving, house), (that, affects, us), (Emergency workers, took, peo