In [11]:
import requests
import re
import nltk
import sys
import string
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from textblob import TextBlob

# Module-level accumulator: build_document_title_features() appends one
# feature dict per processed line. Re-running that cell without re-running
# this one keeps the earlier entries and adds duplicates.
feature_data = []
# POS tags counted as nouns when build_title_features() filters the
# nltk.pos_tag output for its "top_nouns" feature.
nouns = ['NN', 'NNS', 'NNP', 'NNPS']
# NLTK English stopword list; word_freq_dist() uses it to drop common words.
stop = stopwords.words('english')
# Page downloaded into file_text at the bottom of this cell.
url = 'http://www.latimes.com/local/lanow/la-me-ln-volcano-california-20181025-story.html'
# Sample multi-line text; a later cell passes it to
# build_document_title_features(), which processes it one line at a time.
email = """This is the title

A table is a collection of related da0987098709870987234523452345234509870987023452345ta held in a structured format within a database. It consists of columns, and rows. 
In relational databases. and flat John file databases, CONTRACT. a table is a set of data elements (values) using a model of vertical columns (identifiable by name) and horizontal rows, the cell being the unit where a row and column intersect. A table has a specified number of columns, but can have any number of rows.Each row is identified by one or more values appearing in a particular column subset. A specific choice of columns which uniquely identify rows is called the primary key. 
"Table" is another term for "relation". although there is the difference in that. a table is usually a multiset (bag) of rows where a relation is a set and does not allow duplicates. Besides the actual data rows, tables generally have associated with them some metadata, such as constraints on the table or on the values within particular columns.dubious"""
# NOTE(review): performs a network request every time this cell runs (no
# timeout, no status check). file_text is not referenced by any of the other
# cells visible here -- confirm whether it is still needed.
file_text = requests.get(url).text

In [2]:
def tokenize_sentences(document):
    """Split `document` into sentences, then each sentence into word tokens.

    Returns a list with one entry per sentence produced by
    ``nltk.sent_tokenize``; each entry is the token list that
    ``nltk.word_tokenize`` returns for that sentence.
    """
    tokenized = []
    for sentence in nltk.sent_tokenize(document):
        tokenized.append(nltk.word_tokenize(sentence))
    return tokenized

In [3]:
def word_freq_dist(document):
    """Returns a word count frequency distribution

    The document is tokenized with ``nltk.tokenize.word_tokenize``, every
    token is lower-cased, and tokens found in the module-level ``stop`` list
    are dropped before counting.

    Args:
        document: text to analyse.

    Returns:
        An ``nltk.FreqDist`` keyed by lower-cased token.
    """
    # Lower-case BEFORE the stopword test: the stop list is compared as-is,
    # so testing the raw token lets capitalised stopwords ("The", "A", "In")
    # through and they end up counted under their lower-cased form.
    stop_words = set(stop)  # set membership instead of scanning the list per token
    lowered = [token.lower() for token in nltk.tokenize.word_tokenize(document)]
    words = [word for word in lowered if word not in stop_words]
    return nltk.FreqDist(words)

In [4]:
def get_entities(document):
    """Returns Named Entities using NLTK Chunking

    Each sentence from ``tokenize_sentences(document)`` is POS-tagged with
    ``nltk.pos_tag`` and passed to ``nltk.ne_chunk``. Every chunk that is an
    ``nltk.tree.Tree`` is treated as a named entity.

    Args:
        document: text to analyse.

    Returns:
        A list of lower-cased entity strings, one per Tree chunk, in the
        order they are encountered (duplicates are kept). Each string is the
        chunk's tokens joined with single spaces.
    """
    entities = []
    for tokens in tokenize_sentences(document):
        # Part of Speech Tagging
        tagged_sentence = nltk.pos_tag(tokens)
        for chunk in nltk.ne_chunk(tagged_sentence):
            # Chunks classified as named entities are Tree nodes; the other
            # chunks are skipped. isinstance (rather than an exact type()
            # comparison) also accepts Tree subclasses.
            if isinstance(chunk, nltk.tree.Tree):
                # Each element of the chunk is indexed with [0] to get its token.
                entities.append(' '.join(leaf[0] for leaf in chunk).lower())
    return entities

In [5]:
def build_title_features(lines, line_id):
    """Build the feature vector for a single line of a document.

    Args:
        lines: sequence of text lines (e.g. from ``str.splitlines()``).
        line_id: index into ``lines`` of the line to featurize.

    Returns:
        A dict with these keys:
          * "short_sentence": number of TextBlob sentences in the line with
            at most 7 words.
          * "symbols": count of characters that are in ``string.punctuation``.
          * "numbers": count of characters for which ``isdigit()`` is true.
          * "top_entities": up to 10 most frequent entities returned by
            ``get_entities`` for the line.
          * "top_nouns": the words among the 10 most frequent in
            ``word_freq_dist`` whose ``nltk.pos_tag`` tag is listed in the
            module-level ``nouns``.
          * "agreement" / "contract" / "amendment": 1 if the keyword occurs
            anywhere in the line, ignoring case, else 0.

    Raises:
        IndexError: if ``line_id`` is outside ``lines``.

    Side effects:
        Prints the short sentences, the top entities, the top nouns and a
        blank separator to stdout. The single-argument ``print(...)`` form
        produces the same output under Python 2's print statement and
        Python 3's print function.
    """
    line = lines[line_id]
    text = str(line)
    line_strip_lower = line.strip().lower()

    # Feature vector
    feature_vector = {}

    #-------Short Sentence-------#
    short_sentences = [sentence.replace("\n", " ")
                       for sentence in TextBlob(text).sentences
                       if len(sentence.words) <= 7]
    feature_vector["short_sentence"] = len(short_sentences)
    print(short_sentences)

    #-------Symbols and Numbers-------#
    symbols = set(string.punctuation)
    feature_vector["symbols"] = sum(1 for char in text if char in symbols)
    feature_vector["numbers"] = sum(1 for char in text if char.isdigit())

    #--------Entities------------#
    top_n_entities = [w for w, c in nltk.FreqDist(get_entities(text)).most_common(10)]
    feature_vector["top_entities"] = top_n_entities
    print(top_n_entities)

    #--------Nouns------------#
    fdist = word_freq_dist(text)
    # Each candidate word is tagged on its own (a one-element list), so the
    # tagger sees it without the surrounding sentence.
    most_freq_nouns = [w for w, c in fdist.most_common(10)
                       if nltk.pos_tag([w])[0][1] in nouns]
    feature_vector["top_nouns"] = most_freq_nouns
    print(most_freq_nouns)
    print("\n")

    # -----String match------#
    # Match against the lower-cased line so every casing of the keyword is
    # detected, not only the all-lower / Capitalised / ALL-CAPS spellings.
    for keyword in ("agreement", "contract", "amendment"):
        feature_vector[keyword] = 1 if keyword in line_strip_lower else 0

    return feature_vector

In [6]:
def build_document_title_features(text, window_pre=3, window_post=3):
    """Featurize every line of `text` and append the results to `feature_data`.

    `text` is split with ``splitlines()``; for each resulting line,
    ``build_title_features`` is called and its return value is appended to the
    module-level ``feature_data`` list. Nothing is returned.

    `window_pre` and `window_post` are accepted but not used by this function.
    """
    # Parse all lines
    lines = text.splitlines()

    for line_id, _ in enumerate(lines):
        features = build_title_features(lines, line_id)
        feature_data.append(features)


In [9]:
# Featurize the sample `email` text line by line. Results are appended to the
# module-level `feature_data` list, so running this cell again without
# resetting that list adds a second copy of every entry.
build_document_title_features(email)

[Sentence("This is the title")]
[]
['title']


[]
[]
[]


[Sentence("It consists of columns, and rows.")]
[]
['database', 'da0987098709870987234523452345234509870987023452345ta', 'format', 'held']


[Sentence("In relational databases."), Sentence("and flat John file databases, CONTRACT.")]
['john', 'contract']
['columns', 'number', 'table', 'row', 'rows']


[Sentence(""Table" is another term for "relation"."), Sentence("although there is the difference in that.")]
[]
['table', 'relation', 'rows', 'set']




In [10]:
# Display the accumulated per-line feature dicts.
feature_data

[{'agreement': 0,
  'amendment': 0,
  'contract': 0,
  'numbers': 0,
  'short_sentence': 1,
  'symbols': 0,
  'top_entities': [],
  'top_nouns': ['title']},
 {'agreement': 0,
  'amendment': 0,
  'contract': 0,
  'numbers': 0,
  'short_sentence': 0,
  'symbols': 0,
  'top_entities': [],
  'top_nouns': []},
 {'agreement': 0,
  'amendment': 0,
  'contract': 0,
  'numbers': 49,
  'short_sentence': 1,
  'symbols': 3,
  'top_entities': [],
  'top_nouns': ['database',
   'da0987098709870987234523452345234509870987023452345ta',
   'format',
   'held']},
 {'agreement': 0,
  'amendment': 0,
  'contract': 1,
  'numbers': 0,
  'short_sentence': 2,
  'symbols': 13,
  'top_entities': ['john', 'contract'],
  'top_nouns': ['columns', 'number', 'table', 'row', 'rows']},
 {'agreement': 0,
  'amendment': 0,
  'contract': 0,
  'numbers': 0,
  'short_sentence': 2,
  'symbols': 12,
  'top_entities': [],
  'top_nouns': ['table', 'relation', 'rows', 'set']},
 {'agreement': 0,
  'amendment': 0,
  'contract': 0