### Keyword Extraction using TextRank

In [14]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [16]:
# nlp = spacy.load('en_core_web_sm')

import en_core_web_sm
nlp = en_core_web_sm.load()

In [17]:
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [18]:
keyphrase_extractor = TextRank4Keyword()


In [24]:
import json
json_data=open("anly610_amazon.json").readlines()
feeds_read_from_file = []
for line in json_data:
    feeds_read_from_file.append(json.loads(line))

In [27]:
text=feeds_read_from_file[1]['text']

In [31]:
feeds_read_from_file[1]['text']

'Gotta buy ‘em all! Photo illustration by Slate. Photos by Mark Wilson/Getty Images, Pablo Blazquez Dominguez/Getty Images, and Amazon. Once a year, the National Football League canvasses every business, political campaign, and private equity bust-out operation in the country to ask them one important question: Do you love professional football so much that you are willing to pay $10 million a minute for access to the personal eyes and ears of your fellow football fans? The result is a comprehensive list of the products and services true football fans should purchase in 2020, both to demonstrate their love of the sport and to reward the corporations who helped fund the Big Game. It can be a daunting task, so to help you out, this year Slate is building a shopping list of every product advertised during Super Bowl LIV, complete with the estimated price and a running total to buy one of everything. Are you a big enough football fan to collect the whole set? A few notes on the methodology

In [28]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN',"ADP"], window_size=8, lower=False)
tr4w.get_keywords(10)

football - 4.91591746553
Bowl - 3.44579789004
Super - 3.30522113069
year - 2.81920886959
game - 2.4719117412
price - 2.44153959913
ads - 1.95888473352
Slate - 1.70751795612
question - 1.68280329404
list - 1.46968282256
fans - 1.40249057954
- - 1.38811884021


### Extractive Text Summarization

In [29]:
!pip install sumy

Collecting sumy
  Downloading https://files.pythonhosted.org/packages/61/20/8abf92617ec80a2ebaec8dc1646a790fc9656a4a4377ddb9f0cc90bc9326/sumy-0.8.1-py2.py3-none-any.whl (83kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading https://files.pythonhosted.org/packages/a2/55/8f8cab2afd404cf578136ef2cc5dfb50baa1761b68c9da1fb1e4eed343c9/docopt-0.6.2.tar.gz
Collecting breadability>=0.1.20 (from sumy)
  Downloading https://files.pythonhosted.org/packages/ad/2d/bb6c9b381e6b6a432aa2ffa8f4afdb2204f1ff97cfcc0766a5b7683fec43/breadability-0.1.20.tar.gz
Collecting pycountry>=18.2.23 (from sumy)
  Downloading https://files.pythonhosted.org/packages/16/b6/154fe93072051d8ce7bf197690957b6d0ac9a21d51c9a1d05bd7c6fdb16f/pycountry-19.8.18.tar.gz (10.0MB)
Building wheels for collected packages: docopt, breadability, pycountry
  Running setup.py bdist_wheel for docopt: started
  Running setup.py bdist_wheel for docopt: finished with status 'done'
  Stored in directory: C:\Users\subha\AppData\Local\pip\Ca

You are using pip version 9.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [30]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

class TextSummary(object):

    def __init__(self, feeds_str, num_sents):
        self.summary = str()
        
        parser = PlaintextParser.from_string(feeds_str, Tokenizer("english"))
        summarizer = LexRankSummarizer()

        sentences = summarizer(parser.document, num_sents)  # Summarize the document with 5 sentences
        for sentence in sentences:
            self.summary += (sentence.__unicode__())

    def output(self):
        return self.summary

In [32]:
feeds_read_from_file[1]['text']

'Gotta buy ‘em all! Photo illustration by Slate. Photos by Mark Wilson/Getty Images, Pablo Blazquez Dominguez/Getty Images, and Amazon. Once a year, the National Football League canvasses every business, political campaign, and private equity bust-out operation in the country to ask them one important question: Do you love professional football so much that you are willing to pay $10 million a minute for access to the personal eyes and ears of your fellow football fans? The result is a comprehensive list of the products and services true football fans should purchase in 2020, both to demonstrate their love of the sport and to reward the corporations who helped fund the Big Game. It can be a daunting task, so to help you out, this year Slate is building a shopping list of every product advertised during Super Bowl LIV, complete with the estimated price and a running total to buy one of everything. Are you a big enough football fan to collect the whole set? A few notes on the methodology

In [34]:
text_to_sum = TextSummary(text,5)
print(text_to_sum.output())

Once a year, the National Football League canvasses every business, political campaign, and private equity bust-out operation in the country to ask them one important question: Do you love professional football so much that you are willing to pay $10 million a minute for access to the personal eyes and ears of your fellow football fans?It can be a daunting task, so to help you out, this year Slate is building a shopping list of every product advertised during Super Bowl LIV, complete with the estimated price and a running total to buy one of everything.All you have to do is leave your TV on after the game, and you’ve already started your journey toward true fooball greatness!And you’ve got plenty of those, right?That’s a running total of $90.40, plus your soul, your future, and your children’s lives, which seems like a lot at first, but ask yourself this: is any price too high to pay to support football?
