Reference: https://machinelearningmastery.com/prepare-news-articles-text-summarization/

In [4]:
directory = 'dataset/cnn/stories2/'

In [5]:
import os
import string
from os import listdir
 
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: path of the file to read; it is decoded as UTF-8.

    Returns:
        The full text of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the contents are not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, encoding='utf-8') as file:
        return file.read()
 
# split a document into news story and highlights
def split_story(doc):
    """Split a raw document into its story text and its list of highlights.

    Everything before the first '@highlight' marker is the story; each chunk
    that follows a marker is one highlight.

    Args:
        doc: full text of one document.

    Returns:
        A tuple (story, highlights): story is the text before the first
        marker (the whole document when there is no marker), highlights is a
        list of whitespace-stripped, non-empty highlight strings.
    """
    marker = '@highlight'
    # find first highlight
    index = doc.find(marker)
    if index == -1:
        # no marker: slicing with -1 would cut the last character off the
        # story and turn it into a bogus highlight
        return doc, []
    # split into story and highlights
    story = doc[:index]
    # strip first, then filter, so whitespace-only chunks are dropped too
    stripped = (h.strip() for h in doc[index:].split(marker))
    highlights = [h for h in stripped if h]
    return story, highlights
 
# load all stories in a directory
def load_stories(directory, skip_suffixes=('.pkl',)):
    """Load and split every story file found directly inside a directory.

    Args:
        directory: path of the folder holding the story files; a trailing
            slash is accepted.
        skip_suffixes: file-name endings to ignore. Defaults to '.pkl'
            because this notebook saves its pickle into the same directory,
            and reading that file as UTF-8 text would fail on a re-run.

    Returns:
        A list of dicts, one per file, with keys 'story' (text before the
        first highlight marker) and 'highlights' (list of highlight strings).
    """
    stories = list()
    ignored = tuple(skip_suffixes)
    # sorted() makes the story order reproducible; listdir() order is arbitrary
    for name in sorted(listdir(directory)):
        # os.path.join avoids a doubled separator when directory ends in '/'
        filename = os.path.join(directory, name)
        # skip sub-directories and files that are not stories
        if not os.path.isfile(filename) or name.endswith(ignored):
            continue
        # load document
        doc = load_doc(filename)
        # split into story and highlights
        story, highlights = split_story(doc)
        # store
        stories.append({'story':story, 'highlights':highlights})
    return stories
 
# clean a list of lines
def clean_lines(lines):
    """Normalise a list of text lines into lower-case, letters-only strings.

    For each line: drop everything up to and including a leading '(CNN)'
    source tag when the line contains '(CNN) -- ', split on white space,
    lower-case each token, delete ASCII punctuation, and keep only purely
    alphabetic tokens. Lines left with no tokens are omitted.

    Args:
        lines: iterable of strings.

    Returns:
        A list of cleaned, space-joined strings (never containing '').
    """
    # translation table that deletes every ASCII punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    result = []
    for raw in lines:
        # cut off the CNN office prefix when present
        pos = raw.find('(CNN) -- ')
        if pos != -1:
            raw = raw[pos + len('(CNN)'):]
        # lower-case and de-punctuate each white-space separated token
        tokens = (tok.lower().translate(strip_punct) for tok in raw.split())
        # isalpha() rejects tokens with digits as well as emptied tokens
        text = ' '.join(tok for tok in tokens if tok.isalpha())
        # keep only lines that still have content
        if text:
            result.append(text)
    return result
 
# read every story file under `directory` and report how many were loaded
stories = load_stories(directory)
print('Loaded Stories %d' % len(stories))

# replace each story's raw text and highlights with their cleaned line lists
for example in stories:
    story_lines = example['story'].split('\n')
    example.update({
        'story': clean_lines(story_lines),
        'highlights': clean_lines(example['highlights']),
    })

Loaded Stories 6


In [10]:
# save the cleaned stories to a pickle file inside `directory`
from pickle import dump
# the context manager flushes and closes the file as soon as the dump is done
with open(directory+'cnn_dataset.pkl', 'wb') as pkl_file:
    dump(stories, pkl_file)

In [11]:
# load from file
from pickle import load
# read the pickle from the location the save cell writes to
# (directory+'cnn_dataset.pkl'), not from the current working directory
# NOTE: unpickling can execute arbitrary code; only load files you created
with open(directory+'cnn_dataset.pkl', 'rb') as pkl_file:
    stories = load(pkl_file)
print('Loaded Stories %d' % len(stories))

Loaded Stories 6


In [12]:
# bare expression: the notebook renders the whole `stories` list as this cell's output
stories

[{'story': ['one of the most cosmopolitan cities in africa johannesburg is multicultural and multifaceted make the most of your time there with these tips',
   'the apartheid museum is an often disturbing view of south africas troubled past',
   'apartheid museum its a disturbing experience but a visit to apartheid museum is essential if you want to understand the citys troubled past',
   'on arrival visitors are randomly allocated a pass labeling them as white or nonwhite referencing the apartheid pass laws that obliged black south africans to carry identity passes at all times',
   'harrowing multimedia displays and news footage recreate the brutality of the apartheid era while nooses hanging from the ceiling represent the political prisoners executed under the system',
   'less bleak are the tributes to those who fought against the injustice of that era paving the way for equality and democracy in todays south africa the museum is open between am and pm gmt tuesday to sunday admissi