# Extracting Information from Text Data Assignment

In [1]:
#!pip install spacy
#!pip install textacy --quiet
#!pip install rake_nltk --quiet
#!conda install -c conda-forge textacy
#!pip install gensim

In [2]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [3]:
def corpus_stats(corpus):
    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(len(corpus.sents())))
    print("Number of words: " + str(len(corpus.words())))
    print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
    print("Avg chars per word: " + str(round(len(corpus.raw())/len(corpus.words()),1)))
    print("Avg words per sentence: " + str(round(len(corpus.words())/len(corpus.sents()),1)))

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [4]:
PATH = 'lite_cnn/'
DOC_PATTERN = r'articles_text.*\.p'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 14
Number of paragraphs: 14
Number of sentences: 427
Number of words: 13668
Vocabulary: 2927
Avg chars per word: 5.0
Avg words per sentence: 32.0


### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [6]:
for doc in docs:
    keys = keywords(doc, words=5, lemmatize=True, split=True)
    print(keys)

['pink', 'carey', 'tour', 'year', 'pretty']
['patrick', 'primary', 'telling', 'democrats', 'cnn']
['narwhal', 'tail', 'puppy', 'unicorn', 'dogs']
['states', 'democratic', 'bloomberg', 'told', 'running']
['republican', 'taylor', 'rep', 'presidents', 'ukraine']
['muslimness', 'people', 'skin', 'white', 'religion']
['trump', 'new', 'said', 'republican', 'media']
['said', 'police', 'brown', 'jones', 'roanoke']
['trump', 'hotels', 'office', 'profit', 'owned']
['keys', 'grammys', 'award', 'alicia', 'power']
['americans', 'republican', 'trump', 'ukrainians', 'investigate']
['student', 'said', 'told', 'pence', 'schools']
['crows', 'disney', 'american', 'old', 'movie']
['protester', 'police', 'chinese', 'new', 'kong']


### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [7]:
r = Rake()
for doc in docs:
    key_phrases = r.extract_keywords_from_text(doc)
    key_phrases = r.get_ranked_phrases_with_scores()
    print(key_phrases[0:3])

[(32.5, 'country music association awards red carpet'), (17.166666666666664, 'school soon ," pink said'), (16.5, 'country star chris stapleton')]
[(94.03333333333333, 'former new york city mayor michael bloomberg stepped forward last week'), (30.61111111111111, 'patrick could seize upon potential advantages'), (29.0, 'elections process would ultimately splash back')]
[(28.166666666666668, 'little magical furry unicorn ," according'), (17.666666666666664, 'dog rescue nonprofit organization mac'), (13.666666666666666, 'rescue workers speculate may')]
[(40.45, 'current 2020 democrats -- clinton told bbc radio'), (37.666666666666664, 'former new york mayor michael bloomberg made'), (29.0, '2016 democratic nominee playfully tweeted back')]
[(69.41666666666667, 'money ," tweeted white house press secretary stephanie grisham'), (67.75, 'former white house homeland security adviser tom bossert summed'), (63.416666666666664, 'former national security council russia expert fiona hill said')]
[(2

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [8]:
nlp = spacy.load('en_core_web_sm')


In [9]:
entities = []
for doc in docs:
    spacy_doc = nlp(doc)
    entities.append([[entity.text, entity.label_] for entity in spacy_doc.ents])

entities[0]

[['Entertainment Tonight', 'WORK_OF_ART'],
 ['the Country Music Association Awards', 'ORG'],
 ['Carey Hart', 'PERSON'],
 ['Willow', 'PERSON'],
 ['8', 'DATE'],
 ['Jameson', 'PERSON'],
 ['2.Pink', 'CARDINAL'],
 ['Love Me Anyway', 'WORK_OF_ART'],
 ['Chris Stapleton', 'PERSON'],
 ['two and a half years', 'DATE'],
 ['Willow', 'PERSON'],
 ['Jameson', 'PERSON'],
 ['Pink', 'ORG'],
 ['the year', 'DATE'],
 ['14 years', 'DATE'],
 ['January', 'DATE'],
 ['"Carey', 'ORG'],
 ['Hart', 'PERSON'],
 ['10th', 'ORDINAL'],
 ['more than $397 million', 'MONEY']]

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.