In [1]:
from os import listdir
import os as os
import shutil
import string
import pickle
# Helper libraries
import collections
import hashlib
import nltk
import json 
import numpy as np



In [2]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
      filename (str): path of the file to read.

    Returns:
      str: the full text of the file.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, 'r') as file:
        return file.read()

In [3]:
def write_story_highlight_diff_files(filename, story, highlights):
    """Write a story and its highlights to two sibling files.

    The story is written to ``filename`` (overwriting it) behind an
    "@story: " header line.  The highlights are written to the path obtained
    by replacing ".story" with ".highlights" in ``filename``, behind an
    "@abstract: " header line, each wrapped in sentence tags and joined by
    single spaces.

    Args:
      filename (str): destination path for the story text.
      story (str): the story body.
      highlights (iterable of str): the highlight sentences.
    """
    h_filename = filename.replace(".story", ".highlights")
    print(h_filename)
    # context managers guarantee both handles are closed even if a write fails
    with open(filename, 'w') as file:
        file.write("@story: \n" + story)
    with open(h_filename, 'w') as h_file:
        h_file.write("@abstract: \n")
        # The closing tag is emitted exactly as before: '<', a backslash, 's', '>'.
        # The backslash is now escaped explicitly because "\s" is not a valid
        # string escape and triggers a warning on current Python versions.
        h_file.write(' '.join("<s>" + s + "<\\s>" for s in highlights))

In [4]:
def write_story_highlight(filename, story, highlights):
    """Append one abstract/story record to ``filename``.

    The record consists of an "@abstract: " header line, the highlights
    (each wrapped in sentence tags and joined by single spaces), an
    "@story: " header line, the story text and a trailing blank line.
    The file is opened in append mode, so repeated calls accumulate records.

    Args:
      filename (str): path of the file to append to (created if missing).
      story (str): the story body.
      highlights (iterable of str): the highlight sentences.
    """
    # The closing tag is emitted exactly as before: '<', a backslash, 's', '>'.
    # The backslash is escaped explicitly because "\s" is not a valid string
    # escape and triggers a warning on current Python versions.
    tagged = ' '.join("<s>" + s + "<\\s>" for s in highlights)
    # the context manager closes the handle even if one of the writes fails
    with open(filename, 'a+') as file:
        file.write("@abstract: \n")
        file.write(tagged)
        file.write("\n@story: \n" + story)
        file.write("\n\n")

In [5]:
# split a document into news story and highlights
def split_story(doc):
    """Split a raw document into its story text and its list of highlights.

    Everything before the first '@highlight' marker is the story; the text
    following each marker is one highlight.

    Args:
      doc (str): full text of a document.

    Returns:
      (str, list of str): the story text (not stripped) and the highlights,
      each stripped of surrounding whitespace, with empty ones dropped.
    """
    marker = '@highlight'
    index = doc.find(marker)
    if index == -1:
        # No marker at all: slicing with -1 would chop the last character off
        # the story and report that character as a highlight.
        return doc, []
    story = doc[:index]
    # strip first, then filter, so whitespace-only pieces do not survive as ''
    stripped = (h.strip() for h in doc[index:].split(marker))
    highlights = [h for h in stripped if h]
    return story, highlights

In [6]:

# load all stories in a directory
def load_stories(directory):
    """Load every '.story' file in ``directory`` and split it into parts.

    Side effect: the './processed' directory is deleted (if present) and
    recreated empty on every call.

    Args:
      directory (str): directory containing the '.story' files.

    Returns:
      list of dict: one record per non-empty story, with keys 'hash' (the
      file name without its extension), 'story' (text before the first
      highlight marker) and 'highlights' (list of highlight strings).
    """
    stories = list()
    rs_dir = "./processed"
    # NOTE(review): this wipes any previous contents of rs_dir; kept because
    # the (currently disabled) batch writer below targets this directory.
    if os.path.exists(rs_dir):
        shutil.rmtree(rs_dir)
    os.makedirs(rs_dir)

    # listdir() returns entries in arbitrary order; sorting makes the result
    # (and any seeded shuffle applied to it later) reproducible across runs.
    for name in sorted(listdir(directory)):
        # match on the extension, not on a substring anywhere in the name
        if not name.endswith(".story"):
            continue
        urlhash = os.path.splitext(name)[0]
        filename = directory + '/' + name
        # load document
        doc = load_doc(filename)
        # split into story and highlights
        story, highlights = split_story(doc)
        if not story or story.isspace():
            print(story+" is empty!!!! "+filename)
            continue
        # Batch name advances every 1000 *accepted* stories.  Deriving it from
        # len(stories) means it is always defined, even when the first files
        # in the directory are skipped.
        trainName = 'train' + str(len(stories) // 1000).zfill(3)
        fn = rs_dir + '/' + trainName
        #write_story_highlight(fn, story, highlights)
        # store
        stories.append({'hash': urlhash, 'story': story, 'highlights': highlights})
    return stories


In [7]:
# clean a list of lines
def clean_lines(lines):
    """Normalise a list of text lines into lower-case, letters-only strings.

    For each line: drop everything up to and including the first '(CNN)'
    dateline marker if present, split on whitespace, lower-case, remove
    punctuation characters, and keep only purely alphabetic tokens.  Lines
    that end up empty are discarded.

    Args:
      lines (iterable of str): raw lines.

    Returns:
      list of str: the cleaned, non-empty lines, tokens joined by one space.
    """
    cleaned = list()
    # prepare a translation table to remove punctuation
    table = str.maketrans('', '', string.punctuation)
    marker = '(CNN)'
    for line in lines:
        # Strip the source office prefix (e.g. "LONDON, England (CNN) -- ").
        # Only the parenthesised marker is matched: matching a bare "CNN"
        # would throw away the first half of any sentence mentioning CNN.
        index = line.find(marker)
        if index > -1:
            line = line[index+len(marker):]
        # tokenize on white space, lower-case, and remove punctuation
        tokens = [word.lower().translate(table) for word in line.split()]
        # keep alphabetic tokens only (drops numbers and emptied tokens)
        tokens = [word for word in tokens if word.isalpha()]
        # store as string, skipping lines with nothing left
        if tokens:
            cleaned.append(' '.join(tokens))
    return cleaned

First we need to load the data, which can be found at the CNN data link. We untar it in the same directory by running `tar xvf cnn_stories.tgz` on the command line. After loading the data, we separate each document into its story and highlight portions and store the results as a list of dictionaries.

In [9]:
# Show the current working directory; the relative 'cnn/stories' paths in this
# notebook are resolved against it.
os.getcwd()

'/Users/smanmoha/Desktop/project_summarization'

In [10]:
# load stories
# Reuse the cached, already-cleaned stories when the JSON cache is readable;
# otherwise rebuild them from the raw '.story' files and write the cache.
try:
    with open('cnn/stories/stories.json') as file:
        stories = json.load(file)
except (OSError, ValueError):
    # OSError: cache missing or unreadable.  ValueError: cache is not valid
    # JSON (json.JSONDecodeError is a ValueError).  Anything else, such as a
    # KeyboardInterrupt or a NameError, is a real problem and must surface.
    directory = 'cnn/stories'
    stories = load_stories(directory)
    print('Loaded Stories %d' % len(stories))
    # clean stories (records are updated in place)
    for example in stories:
        example['story'] = clean_lines(example['story'].split('\n'))
        example['highlights'] = clean_lines(example['highlights'])
    with open('cnn/stories/stories.json', 'w') as outfile:
        json.dump(stories, outfile)


In [16]:
# Re-read the cache from disk to confirm how many stories were stored.
# The context manager closes the handle even if parsing fails.
with open('cnn/stories/stories.json') as file:
    stories1 = json.load(file)
len(stories1)

92465

Splitting these into a list of story sentences and the corresponding list of highlight sentences, we get:

In [12]:
# Pull the two parallel fields out of every story record, keeping record order.
stories_list = [record['story'] for record in stories]
highlights_list = [record['highlights'] for record in stories]

In [13]:
# Average number of sentences per story.
mean_story_length = np.mean([len(story) for story in stories_list])

In [14]:
# Average length, in characters, of a sentence taken over all stories.
mean_sentence_length_of_every_story = np.mean(
    [len(sentence) for story in stories_list for sentence in story]
)

In [15]:
# Report both corpus statistics with three decimal places.
print(
    "Mean length of story is {:.3f} sentences and the Mean sentence length of every story is {:.3f} charachters".format(
        mean_story_length,
        mean_sentence_length_of_every_story,
    )
)

Mean length of story is 21.241 sentences and the Mean sentence length of every story is 171.595 charachters


Now we separate the data into training, dev and test sets:

In [17]:
def get_train_test_stories(stories, highlights, split=0.8, shuffle=False):
    """Generate a train/dev/test split of stories and their highlights.

    The first ``split`` fraction of the (optionally shuffled) data becomes the
    training set; the remainder is divided in half into dev and test sets.
    Stories and highlights are shuffled together so they stay paired.

    Args:
      stories (iterable): stories, each a list of sentences.
      highlights (iterable): highlights, one entry per story, same order.
      split (double): fraction to use as training set.
      shuffle (int or bool): seed for shuffle of input data (0 is a valid
        seed), or False/None to just take the training data as the first
        xx% contiguously.

    Returns:
      train_stories, dev_stories, test_stories, train_highlights,
      dev_highlights, test_highlights: the six aligned splits.

    Raises:
      ValueError: if ``stories`` and ``highlights`` differ in length.
    """
    story_items = list(stories)
    highlights = list(highlights)
    if len(story_items) != len(highlights):
        raise ValueError(
            "stories and highlights must have the same length "
            "(got {} and {})".format(len(story_items), len(highlights)))

    # Fill a 1-D object array element by element.  Letting np.array infer the
    # shape would build a 2-D array whenever all stories happen to have the
    # same number of sentences, turning each story into an ndarray row.
    sentences = np.empty(len(story_items), dtype=object)
    for pos, story in enumerate(story_items):
        sentences[pos] = story

    fmt = (len(sentences), sum(map(len, sentences)))
    # {:,} rather than {:g}: the latter prints large counts as 1.96402e+06
    print("Loaded {:,} stories ({:,} sentences)".format(*fmt))

    # identity checks so that a seed of 0 still shuffles
    if shuffle is not False and shuffle is not None:
        rng = np.random.RandomState(shuffle)
        all_sents = list(zip(sentences, highlights))
        rng.shuffle(all_sents)
        if all_sents:  # zip(*[]) cannot be unpacked into two names
            sentences, highlights = zip(*all_sents)

    split_idx = int(split * len(sentences))
    # the part after the training set is halved: first dev, then test
    test_dev_split_idx = int((len(sentences) - split_idx)/2) + split_idx
    print(split_idx, test_dev_split_idx)
    train_stories = sentences[:split_idx]
    dev_stories = sentences[split_idx:test_dev_split_idx]
    test_stories = sentences[test_dev_split_idx:]
    train_highlights = highlights[:split_idx]
    dev_highlights = highlights[split_idx:test_dev_split_idx]
    test_highlights = highlights[test_dev_split_idx:]

    fmt = (len(train_stories), sum(map(len, train_stories)))
    print("Training set: {:,} stories ({:,} sentences)".format(*fmt))
    fmt = (len(dev_stories), sum(map(len, dev_stories)))
    print("Dev set: {:,} stories ({:,} sentences)".format(*fmt))
    fmt = (len(test_stories), sum(map(len, test_stories)))
    print("Test set: {:,} stories ({:,} sentences)".format(*fmt))

    return train_stories, dev_stories, test_stories, train_highlights, dev_highlights, test_highlights


In [18]:
# 90% train, 5% dev, 5% test, shuffled with a fixed seed for reproducibility.
(
    train_stories_list,
    dev_stories_list,
    test_stories_list,
    train_highlights_list,
    dev_highlights_list,
    test_highlights_list,
) = get_train_test_stories(stories_list, highlights_list, split=0.9, shuffle=42)

Loaded 92,465 stories (1.96402e+06 sentences)
83218 87841
Training set: 83,218 stories (1,766,718 sentences)
Dev set: 4,623 stories (99,117 sentences)
Test set: 4,624 stories (98,180 sentences)


In [19]:
# Peek at the first training story to sanity-check the cleaned sentences.
train_stories_list[0]

['the end is closer than the beginning',
 'ron rupert grint left harry daniel radcliffe second from right and hermione emma watson in the new film',
 'harry potter and his friends at hogwarts are now in their sixth year of seven at the school theyve seen a lot of changes particularly as the influence of the reawakened voldemort that is he who must not be named has made itself known',
 'and the movie series itself is now nearing its conclusion harry potter and the halfblood prince which comes out wednesday is likewise the sixth movie in the series based on jk rowlings seven harry potter books',
 'there is one benefit to having such history director david yates says pretty much everybody going to see halfblood prince is familiar with the characters whether through the books or the movies',
 'we made a decision we kind of crossed a line actually i think on this movie where we said you know this is the sixth one in the series its the most popular franchise probably in history do we stop an