In [96]:
import requests
import re
import nltk
import random
import sys
import string
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from textblob import TextBlob

"""config data"""
url = 'http://www.latimes.com/local/lanow/la-me-ln-volcano-california-20181025-story.html'
nouns = ['NN', 'NNS', 'NNP', 'NNPS']
stop = stopwords.words('english')
email = """
PI team,

The Global Advisory Solutions Conference is tomorrow, Friday October 26th at 12:00-2:00pm EST.  
This is exclusive virtual conference will give you insights on why are we moving to solutions, 
how to activate solutions in the market and what the move to solutions means to you.  
Please use the link below to register for the event.

Regards,
Geoff

"""

def download_document(url):
    """Extracts title and all text stored in paragraph tags"""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find('title').get_text()
    document = ' '.join([p.get_text() for p in soup.find_all('p')])
    
    print "\n html title:"
    print title
    return document

def clean_document(document):
    """Remove enronious characters, whitespace and stop words"""
    document = re.sub('[^A-Za-z .-]+', ' ', document)
    document = ' '.join(document.split())
    document = ' '.join([i for i in document.split() if i not in stop])
    return document

def tokenize_sentences(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    return sentences

def word_freq_dist(document):
    """Returns a word count frequency distribution"""
    words = nltk.tokenize.word_tokenize(document)
    words = [word.lower() for word in words if word not in stop]
    fdist = nltk.FreqDist(words)
    return fdist

def get_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]

def get_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

def get_entities(document,entity_label):
    """Returns Named Entities using NLTK Chunking"""
    entities = []
    sentences = tokenize_sentences(document)

    # Part of Speech Tagging
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            
            # If a chunk has been classified as a named entity, it will be of type nltk.tree.Tree
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == entity_label:
                    entities.append(' '.join([c[0] for c in chunk]).lower())
    return entities

def get_subject(document,n):
    """NER + Frequent Nouns to give subject"""
    # labels: PERSON, LOCATION, ORGANIZATION, MISC, MONEY, NUMBER, ORDINAL, PERCENT, DATE, TIME, DURATION, SET
    entity_label = 'PERSON'
    
    # Get n most frequent Nouns
    fdist = word_freq_dist(document)
    most_freq_nouns = [w for w, c in fdist.most_common(n)
                       if nltk.pos_tag([w])[0][1] in nouns]

    # Get Top n name entities
    entities = get_entities(str(document), entity_label)
    top_n_entities = [w for w, c in nltk.FreqDist(entities).most_common(n)]

    # Get the subject noun by looking at the intersection of top n entities
    # and most frequent nouns.
    subject_nouns = [entity for entity in top_n_entities
                    if entity.split()[0] in most_freq_nouns]
    
    print "\n top_n_entities:"
    print top_n_entities
    print "\n most_freq_nouns:"
    print most_freq_nouns
    
    return subject_nouns

def get_symbols_numbers_density(document):
    symbols = set(string.punctuation)
    num_symbols = sum([1 for elem in document if elem in symbols])
    num_numbers = 0
    for elem in document:
        num_numbers += sum([c.isdigit() for c in elem])
    print [num_symbols, num_numbers]


def get_short_sentences(document, sentence_size):
    blob = TextBlob(str(document))
    short_sentences = list()
    for sentence in blob.sentences:
          if len(sentence.words) <= sentence_size:
            short_sentences.append(sentence.replace("\n", " "))

    # for item in random.sample(short_sentences, 10):
    print short_sentences

if __name__ == '__main__':
    document = download_document(url)
    top_n_threshold = 20
    sentence_size = 7
    
    #print document
    document = clean_document(document)
    numbers = get_phone_numbers(str(document))
    emails = get_email_addresses(str(document))
    subject = get_subject(document,top_n_threshold)
    
    print"\nsymbols_numbers_density:"
    get_symbols_numbers_density(str(document))
    print"\nshort sentences:"
    get_short_sentences(document, sentence_size)
    print"\nnumbers:"
    print numbers
    print"\nemails:"
    print emails
    print"\nsubjects:"
    print subject


 html title:
3 California volcanoes are at the top of federal volcano threat list - Los Angeles Times

 top_n_entities:
['siskiyou modoc', 'helens', 'mammoth lakes', 'medicine lake', 'long valley', 'lassen peak', 'elko nev', 'salton buttes imperial county clear lake volcanic field lake county', 'shasta lassen', 'mauna loa', 'mount rainier mt', 'san francisco', 'pacific northwest', 'hawaii big island kilauea', 'salton buttes', 'iceland eyjafjallaj', 'mount hood crater', 'shasta siskiyou county lassen volcanic center shasta county long valley caldera', 'ash', 'eureka sacramento']
20

 most_freq_nouns:
[u'volcanic', u'california', u'eruption', u'volcanoes', u'volcano', u'ash', u'lake', u'areas', u'shasta', u'lassen', u'miles', u'county', u'valley', u'mt', u'helens', u'years', u'risk']

symbols_numbers_density:
[50, 0]

short sentences:
[Sentence("Shasta towns Mount Shasta Weed."), Sentence("St. Helens erupted ."), Sentence("California volcanoes prolific prehistoric times."), Sentence("Mt

In [94]:
type(sentence)

textblob.blob.Sentence

In [84]:

def symbols_and_numbers_density(mentions):
    symbols = set(string.punctuation)
    num_symbols = sum([1 for elem in mentions if elem in symbols])
    num_numbers = 0
    for elem in mentions:
        num_numbers += sum([c.isdigit() for c in elem])
    print [num_symbols, num_numbers]

In [86]:
symbols_and_numbers_density(str(document))

[10, 9]
