In [81]:
import re
import math
import nltk

# Fetch the NLTK data packages this notebook relies on. Packages that are
# already installed are reported as up-to-date instead of being fetched again.
nltk.download('punkt')      # presumably backs nltk.word_tokenize / nltk.sent_tokenize
nltk.download('stopwords')  # read through nltk.corpus.stopwords when counting words
nltk.download('wordnet')    # NOTE(review): not referenced by the code visible here
nltk.download('omw-1.4')    # NOTE(review): not referenced by the code visible here
nltk.download('inaugural')  # corpus of inaugural addresses summarized in the last cell

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package inaugural to
[nltk_data]     C:\Users\qmok9\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\inaugural.zip.


True

In [71]:
def get_word_frequencies(text):
    """Get word frequencies from a text.

    Every run of non-letter characters is replaced by a single space, the
    text is lower-cased and tokenized, English stopwords are dropped, and
    the remaining counts are divided by the highest count, so the most
    frequent word ends up with a relative frequency of 1.0.

    Parameters
    ----------
    text : str
        The text to be analyzed

    Returns
    -------
    result : dict
        A dictionary with relative word frequencies. Empty when the text
        contains no words other than stopwords (including empty text).
    """
    clean_text = re.sub('[^A-Za-z]+', ' ', text)
    tokenized = nltk.word_tokenize(clean_text.lower())
    # The corpus reader hands back a list; a set makes the per-token
    # membership test O(1) instead of a scan over all stopwords.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    counts = {}
    for word in tokenized:
        if word not in stopwords:
            counts[word] = counts.get(word, 0) + 1

    if not counts:
        # max() on an empty sequence raises ValueError; there is simply
        # nothing to normalize in that case.
        return {}

    max_frequency = max(counts.values())
    return {word: count / max_frequency for word, count in counts.items()}


def get_sentence_scores(text, word_frequencies):
    """Score every sentence by summing the frequencies of its known words.

    The text is split into sentences; each sentence is lower-cased and
    tokenized, and the frequency of every token found in
    ``word_frequencies`` is added to that sentence's score.

    Parameters
    ----------
    text : str
        The text to be analyzed

    word_frequencies : dict
        A dictionary with relative word frequencies

    Returns
    -------
    result : dict
        A dictionary mapping sentence text to its score. Sentences without
        any token in ``word_frequencies`` do not appear; identical sentence
        strings share one entry and their scores accumulate.
    """
    scores = {}
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence.lower()):
            if token not in word_frequencies:
                continue
            # An entry is created on the first matching token only.
            scores[sentence] = scores.get(sentence, 0) + word_frequencies[token]

    return scores


def get_summary(text, max_length):
    """Build an extractive summary from the highest-scoring sentences.

    Sentences are taken in descending score order until the next one
    would push the summary past ``max_length`` tokens. The best-scoring
    sentence is always taken, even when it alone exceeds the limit.

    Parameters
    ----------
    text : str
        The text to be summarized

    max_length : int
        Maximum length of the summary measured in number of tokens

    Returns
    -------
    result : str
        The selected sentences joined by single spaces, in score order
    """
    scores = get_sentence_scores(text, get_word_frequencies(text))
    # Best score first. sorted() is stable even with reverse=True, so
    # sentences with equal scores keep the order in which they were scored.
    ranked = sorted(scores, key=scores.get, reverse=True)

    chosen = []
    used_tokens = 0
    for candidate in ranked:
        if used_tokens > max_length:
            break
        candidate_tokens = len(nltk.word_tokenize(candidate.lower()))
        # Stop at the first sentence that does not fit, unless nothing has
        # been selected yet.
        if used_tokens != 0 and used_tokens + candidate_tokens > max_length:
            break
        chosen.append(candidate)
        used_tokens += candidate_tokens

    return ' '.join(chosen)

In [91]:
def slice_text(text, max_length):
    """Group consecutive sentences into slices of at most max_length tokens.

    Sentences are never split: a sentence that is longer than
    ``max_length`` on its own becomes a slice by itself.

    Parameters
    ----------
    text : str
        The text to be sliced

    max_length : int
        Maximum length of each slice measured in number of tokens

    Returns
    -------
    result : list
        A list with text slices, each one its sentences joined by spaces
    """
    slices = []
    pending = []
    pending_tokens = 0
    for sentence in nltk.sent_tokenize(text):
        sentence_tokens = len(nltk.word_tokenize(sentence))
        # Close the running slice when this sentence would overflow it;
        # an empty slice always accepts the sentence.
        if pending_tokens and pending_tokens + sentence_tokens > max_length:
            slices.append(' '.join(pending))
            pending, pending_tokens = [], 0
        pending.append(sentence)
        pending_tokens += sentence_tokens

    if pending:
        slices.append(' '.join(pending))
    return slices

def summarize_text(text, max_length):
    """Summarize a long text hierarchically by first summarizing slices
    of the text, then putting the summaries together

    The text is cut into the smallest number of roughly equal slices that
    each fit ``max_length``; every slice is summarized to about half its
    length and the partial summaries are joined. The procedure is repeated
    on the joined result while it is still longer than ``max_length``.

    Parameters
    ----------
    text : str
        The text to be summarized

    max_length : int
        Maximum length of the summary measured in number of tokens

    Returns
    -------
    result : str
        The summarized text. ``text`` itself when it already fits. May
        still exceed ``max_length`` when a pass cannot shorten the text
        any further (e.g. one sentence longer than the limit).

    Raises
    ------
    ValueError
        If ``max_length`` is not positive.
    """
    if max_length <= 0:
        # A non-positive budget previously ended in an incidental
        # ZeroDivisionError further down.
        raise ValueError("max_length must be a positive number of tokens")

    text_length = len(nltk.word_tokenize(text))
    # Inclusive bound, consistent with the check on the joined summary
    # below: a text of exactly max_length tokens already fits.
    if text_length <= max_length:
        return text

    slice_count = math.ceil(text_length / max_length)
    slice_length = math.ceil(text_length / slice_count)
    summary_length = slice_length / 2
    summaries = [
        get_summary(sliced_text, summary_length)
        for sliced_text in slice_text(text, slice_length)
    ]
    result = ' '.join(summaries)

    result_length = len(nltk.word_tokenize(result))
    if result_length <= max_length:
        return result
    if result_length >= text_length:
        # No progress was made: get_summary always keeps its top sentence,
        # so an over-long sentence survives every pass and recursing would
        # never terminate. Return the best effort instead.
        return result
    return summarize_text(result, max_length)

def write_document(text, title):
    """Write text to the file ``<title>.txt``, replacing any existing file.

    Parameters
    ----------
    text : str
        The content to be written

    title : str
        File name (or path) without the ``.txt`` extension
    """
    # The context manager closes the file even if write() raises. An explicit
    # UTF-8 encoding keeps non-ASCII characters from failing or being
    # mis-encoded under a platform-dependent default such as cp1252.
    with open(title + ".txt", "w", encoding="utf-8") as f:
        f.write(text)

In [92]:
from nltk.corpus import inaugural
# Token budget for each final summary (kept small for testing).
CONTEXT_WINDOW_LENGTH = 400
# Load both of Washington's inaugural addresses, in chronological order.
text1, text2 = map(inaugural.raw, ('1789-Washington.txt', '1793-Washington.txt'))
# Summarize each address and store the result in its own text file.
write_document(
    title="summary1",
    text=summarize_text(text1, max_length=CONTEXT_WINDOW_LENGTH),
)
write_document(
    title="summary2",
    text=summarize_text(text2, max_length=CONTEXT_WINDOW_LENGTH),
)