# TOC

  __Chapter 5 - NLP_applications__

1. [Import](#Import)
1. [News article summarization](#News-article-summarization)


# Import

<a id = 'Import'></a>

In [3]:
# Standard libary and settings
import os
import sys
import importlib
import itertools
import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# Data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# Modeling extensions
import nltk

# Visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_style("whitegrid")

# News article summarization

Summarizing a news article is an interesting application of NLP that requires a process that can attain deep understanding of not only individual sentences, but also how sentences relate to each other to form an overall message. We also want to be able to understand genre and theme.

This can be attained through an approach that ranks individual sentences based on their importance. Generally speaking, a sentence that has a higher number of entities and nouns has greater importance. We can use this rule of thumb to create an importance score.

This is a simplistic but powerful rule that can be expanded upon by weighting sentences at the beginning of the article higher than the sentences at the end of the article. This is based on the assumption that an article will typically frontload the details that tend to summarize the article and the sentences following the introduction will provide the details that expand upon the summary.

Another possibility is to evaluate the term frequency - inverse document frequency (TF-IDF) of each and every word. This enables us to identify discriminative words, on the assumption that sentences that include discriminative words are especially important. By calculating the TF-IDF for each word, we can determine the average score of each sentence and prioritize accordingly.

<a id = 'News-article-summarization'></a>

In [None]:
# News article summarization example: score every sentence of the article by
# the share of its tokens that are nouns or named entities.

# Read the whole article; the context manager closes the file handle.
with open("nyt.txt", "r") as f:
    news_content = f.read()

# One tuple per sentence:
# (sent_no, no_of_tokens, no_of_nouns, no_of_ners, score, sentence)
results = []
for sent_no, sentence in enumerate(nltk.sent_tokenize(news_content)):
    # Tokenize and POS-tag once, then reuse the results below.
    tokens = nltk.word_tokenize(sentence)
    no_of_tokens = len(tokens)
    print(no_of_tokens)

    tagged = nltk.pos_tag(tokens)

    # count the number of nouns (only the NN and NNP tags are counted)
    no_of_nouns = len([word for word, pos in tagged if pos in ["NN", "NNP"]])

    # count named entities: in the chunked result each entity is a nested
    # nltk.Tree, while every other token stays a plain (word, tag) tuple.
    # NLTK 3 replaced the Tree `node` attribute with `label()`, so testing for
    # `node` no longer identifies the entity subtrees.
    ners = nltk.ne_chunk(tagged, binary=False)
    no_of_ners = len([chunk for chunk in ners if isinstance(chunk, nltk.Tree)])
    score = (no_of_ners + no_of_nouns) / float(no_of_tokens)

    results.append((sent_no, no_of_tokens, no_of_nouns, no_of_ners, score, sentence))

In [None]:
# Show the sentences ordered from the highest to the lowest score
# (index 4 of each result tuple is the score, index 5 the sentence text).
ranked = sorted(results, key=lambda row: row[4], reverse=True)
for row in ranked:
    print(row[5])

In [7]:
# TF-IDF example: treat each sentence as a document, compute TF-IDF weights
# per term, and score each sentence by the mean weight of the terms it contains.
from sklearn.feature_extraction.text import TfidfVectorizer

results = []
news_content = """
Mr. Obama planned to promote the effort on Monday during a visit to Camden, N.J. The ban is
part of Mr. Obama's push to ease tensions between law enforcement and minority communities
in reaction to the crises in Baltimore; Ferguson, Mo. We are, without a doubt, sitting at a 
defining moment in American policing, Ronald L. Davis, the director of the Office of 
Community Oriented Policing Services at the Department of Justice, told reporters in a 
conference call organized by the White House
"""

sentences = nltk.sent_tokenize(news_content)
# min_df=1 keeps every term that occurs in at least one sentence, i.e. the
# same vocabulary as min_df=0, which recent scikit-learn releases reject.
vectorizer = TfidfVectorizer(
    norm="l2", min_df=1, use_idf=True, smooth_idf=False, sublinear_tf=True
)

sklearn_binary = vectorizer.fit_transform(sentences)

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn; fall back to the old name when the new one is unavailable.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(list(feature_names))

# Densify once and reuse for both printing and scoring.
dense_weights = sklearn_binary.toarray()
print(dense_weights)

# Sentence score = sum of its TF-IDF weights / number of non-zero weights.
for row in dense_weights:
    results.append(row.sum() / float(len(row.nonzero()[0])))

['american', 'and', 'are', 'at', 'baltimore', 'ban', 'between', 'by', 'call', 'camden', 'communities', 'community', 'conference', 'crises', 'davis', 'defining', 'department', 'director', 'doubt', 'during', 'ease', 'effort', 'enforcement', 'ferguson', 'house', 'in', 'is', 'justice', 'law', 'minority', 'mo', 'moment', 'monday', 'mr', 'obama', 'of', 'office', 'on', 'organized', 'oriented', 'part', 'planned', 'policing', 'promote', 'push', 'reaction', 'reporters', 'ronald', 'services', 'sitting', 'tensions', 'the', 'to', 'told', 'visit', 'we', 'white', 'without']
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.30993994 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.30993994 0.         0.30993994 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.30993994 0.20757039 0.20757039 0.
  0.         0.30993994 0.         0.         0.         0.30993994
  0.  