In [21]:
from math import log
import re

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as stpwds
from nltk.cluster.util import cosine_distance
import numpy as np

In [22]:
# Raw passage that the rest of the notebook summarises.
text = """Just A Rather Very Intelligent System a.k.a JARVIS is created by Tony Stark natural-language and a sophisticated artificial intelligence user interface computer system, named after Edwin Jarvis, the butler who worked for Howard Stark. Though its primary duty is to automate Stark’s Malibu estate, the lifelike program fulfills many other needs for Stark, like being an information source for him, a diagnostic tool, a consultant and a voice of reason in Stark’s life. It was also responsible to provide security for Tony Stark's Mansion and Stark Tower. After creating the Mark II armor, Stark uploaded JARVIS into all of the Iron Man Armors, as well as allowing him to interact with the other Avengers, giving them valuable information during combat. JARVIS may be the one intellect Stark feels most comfortable opening up to. JARVIS can object to Stark’s commands if necessary. JARVIS speaks with a refined British accent, and is capable of back talk, sarcasm and condescension. During the Ultron Offensive, JARVIS was destroyed by Ultron, although his remaining programming codes unknowingly continued to thwart Ultron's plans of gaining access to nuclear missiles. His remains were found by Stark, who uploaded them into a synthetic body made of vibranium and, in conjunction with Ultron's personality and an Infinity Stone. JARVIS' duties were then taken over by FRIDAY."""
# Letters-only, lowercase copy of the passage: every non-letter character
# becomes a space (inner call), then each whitespace run is collapsed to a
# single space (outer call).
formatted_text = re.sub(
    r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', text)
).lower()

In [23]:
# English stop-word list from NLTK; the bag-building loop uses it to drop
# stop words from each sentence's word bag.
stopwords = stpwds.words('english')

# Split the raw passage into sentences. N is the sentence count.
sentence_list = sent_tokenize(text)
N = len(sentence_list)

In [24]:
# We compute TF-IDF vectors with custom functions instead of using
# a library such as scikit-learn, so that the word bags can be re-used
# across repeated important-sentence selections rather than being
# re-computed from scratch each time.

# Get words' idf scores
def get_idf(documents, N):
    """Compute an IDF score for every word in ``documents[N]``.

    Args:
        documents: mapping of index -> collection of words supporting ``in``.
        N: key of the entry whose words are scored.

    Returns:
        dict mapping each word of ``documents[N]`` to
        ``log(len(documents) / df)``, where ``df`` is the number of entries
        of ``documents`` (the entry ``N`` included) that contain the word.

    Raises:
        KeyError: if ``N`` is not a key of ``documents``.
    """
    return {
        term: log(
            len(documents)
            / sum(1 for bag in documents.values() if term in bag)
        )
        for term in documents[N]
    }

In [25]:
# Get sentence vectors
def get_vectors(documents, words_idf, N, token_lists=None):
    """Build a TF-IDF vector for every entry of ``documents``.

    The vector space has one dimension per word of ``documents[N]``. For
    each entry, the component for a word is the number of times that word
    occurs in the entry's token list multiplied by the word's IDF score.
    Tokens that are not in ``documents[N]`` are ignored.

    Args:
        documents: mapping of index -> collection of words; only its keys
            and ``documents[N]`` are read here.
        words_idf: mapping of word -> IDF score; it must contain every
            word of ``documents[N]``.
        N: key of the entry that defines the vector dimensions.
        token_lists: indexable (list or dict) giving the token sequence for
            each key of ``documents``. When omitted, the notebook-level
            ``formatted_sent_list`` is used, which matches the previous
            behaviour of this function.

    Returns:
        dict mapping each key of ``documents`` to its TF-IDF vector (a list
        with ``len(documents[N])`` components).
    """
    if token_lists is None:
        # Backward-compatible default: fall back to the notebook global
        # that this function used to read implicitly.
        token_lists = formatted_sent_list

    word_list = list(documents[N])
    word_to_idx = {word: idx for idx, word in enumerate(word_list)}
    # The IDF weight of each dimension is the same for every entry, so it
    # is looked up once instead of once per entry.
    idf_by_dim = [words_idf[word] for word in word_list]

    vectors = {}
    for doc_idx in documents:
        counts = [0] * len(word_list)
        for word in token_lists[doc_idx]:
            idx = word_to_idx.get(word)
            if idx is not None:
                counts[idx] += 1
        vectors[doc_idx] = [tf * idf for tf, idf in zip(counts, idf_by_dim)]
    return vectors

In [26]:
# Get next important sentence
def get_most_imp_sent(sent_vectors, N):
    """Return the key of the vector with the largest dot product with
    ``sent_vectors[N]``.

    Args:
        sent_vectors: mapping of index -> vector (sequence of numbers).
        N: key of the reference vector; it is never returned as a
            candidate itself.

    Returns:
        The key of the best-scoring candidate. Only scores strictly greater
        than 0 qualify, and on ties the first candidate in iteration order
        wins. Returns -1 when no candidate has a positive score.
    """
    doc_vec = sent_vectors[N]
    best_idx, best_score = -1, 0
    for idx, vec in sent_vectors.items():
        if idx != N:
            # Dot product over the dimensions of the reference vector.
            score = sum(vec[i] * doc_vec[i] for i, _ in enumerate(doc_vec))
            if score > best_score:
                best_idx, best_score = idx, score
    return best_idx

In [27]:
# Update documents
def update_docs(documents, idx):
    """Remove entry ``idx`` and strip its words from every remaining entry.

    ``documents`` is modified in place: the entry is deleted and its bag is
    subtracted (``-=``) from each bag that is left.

    Args:
        documents: mapping of index -> set of words.
        idx: key of the entry to remove.

    Returns:
        The same ``documents`` mapping that was passed in.

    Raises:
        KeyError: if ``idx`` is not a key of ``documents``.
    """
    removed_bag = documents.pop(idx)
    for key in documents:
        documents[key] -= removed_bag
    return documents

In [28]:
# Construct documents dict
# `documents` maps an index to the bag (set) of non-stopword tokens of that
# entry: indices 0..N-1 are the sentences and index N is the whole text.
# `formatted_sent_list[i]` keeps the full token list (stop words included)
# and `sentence_lengths[i]` its length.
stopword_set = set(stopwords)  # set lookup instead of scanning the list
documents = {}
formatted_sent_list = []
sentence_lengths = {}
for i, sent in enumerate([*sentence_list, text]):
    # Keep letters only, collapse whitespace runs, lowercase.
    formatted_sent = re.sub('[^a-zA-Z]', ' ', sent)
    formatted_sent = re.sub(r'\s+', ' ', formatted_sent).lower()
    tokens = word_tokenize(formatted_sent)
    formatted_sent_list.append(tokens)
    sentence_lengths[i] = len(tokens)
    documents[i] = set(tokens) - stopword_set

# Evaluate summary
# Target summary length, as a fraction of the whole text's token count.
summary_size = 0.5
num_words = sentence_lengths[N]
summary_length = int(summary_size * num_words)

summary_sent_idxs = []
length = 0
n = 0

# We find the most relevant sentence, add it to the list of summary
# sentences, then remove the sentence and its tokens from the documents,
# and compute the next most important sentence in the same way.

# Eliminating the selected sentence from the documents ensures that the
# following selections pick sentences with minimal overlap with the
# sentences already chosen. This leads to concise summaries without much
# redundancy.

while n < len(sentence_list) and length < summary_length:
    words_idf = get_idf(documents, N)
    sent_vectors = get_vectors(documents, words_idf, N)
    most_imp_sent_idx = get_most_imp_sent(sent_vectors, N)
    if most_imp_sent_idx == -1:
        # get_most_imp_sent returns -1 when no remaining sentence scores
        # above zero. -1 is not a key of sentence_lengths or documents, so
        # stop here instead of failing with a KeyError.
        break

    summary_sent_idxs.append(most_imp_sent_idx)
    length += sentence_lengths[most_imp_sent_idx]
    documents = update_docs(documents, most_imp_sent_idx)
    n += 1

# Present the selected sentences in their original order.
summary_sent_idxs.sort()

summary = '\n'.join(
    [sentence_list[idx] for idx in summary_sent_idxs]
)
print(summary)

Just A Rather Very Intelligent System a.k.a JARVIS is created by Tony Stark natural-language and a sophisticated artificial intelligence user interface computer system, named after Edwin Jarvis, the butler who worked for Howard Stark.
Though its primary duty is to automate Stark’s Malibu estate, the lifelike program fulfills many other needs for Stark, like being an information source for him, a diagnostic tool, a consultant and a voice of reason in Stark’s life.
After creating the Mark II armor, Stark uploaded JARVIS into all of the Iron Man Armors, as well as allowing him to interact with the other Avengers, giving them valuable information during combat.
During the Ultron Offensive, JARVIS was destroyed by Ultron, although his remaining programming codes unknowingly continued to thwart Ultron's plans of gaining access to nuclear missiles.
