In [3]:
import os
import numpy

In [4]:
# Directories holding one file per article for each dataset split.
# The trailing slash is kept so that code which builds paths by plain string
# concatenation (folder + file name) keeps working.
TRAIN_FOLDER = '../Dataset/all/training/'
DEV_FOLDER = '../Dataset/all/validation/'
TEST_FOLDER = '../Dataset/all/test/'

# File names found in each split directory. os.listdir() returns entries in
# arbitrary order, so they are sorted to make the document order reproducible
# across runs and machines.
TRAIN = sorted(os.listdir(TRAIN_FOLDER))
DEV = sorted(os.listdir(DEV_FOLDER))
TEST = sorted(os.listdir(TEST_FOLDER))

In [5]:
# Number of entries per split, displayed in the order (train, test, dev).
tuple(len(split) for split in (TRAIN, TEST, DEV))

(277554, 11443, 13367)

## Model dataset file:

### Introduction

This dataset contains CNN and Daily Mail articles used for training a summarization system. The script used to create the dataset is modified from the release of Hermann et al. (2015).

### Format:

Each file contains four parts separated by a blank line (`\n\n`):
* url of the original article;
* sentences in the article and their labels (for sentence-based extractive summarization);
* extractable highlights (for word extraction-based abstractive summarization);
* named entity mapping.

### Sentence labels

There are three labels for the sentences: 1, 2 and 0. 
* **1** — sentence should be extracted;
* **2** — sentence might be extracted;
* **0** — sentence shouldn't be extracted.

### Extractable highlights

The extractable highlights are created by examining if a word (or its morphological transformation) in the highlight appears in the article or a general purpose stop-word list, which together constitute the output space (i.e., the allowed vocabulary during summary generation).

In [9]:
# Shared registry filled by file_parse(): entity id -> entity name.
entities = dict()
# Module-level document list; the loader function in this notebook builds and
# returns its own local list instead of appending here.
docs = list()

In [10]:
def file_parse(file):
    """Parse one dataset file into sentences, sentence labels and highlights.

    The content is split into parts on blank lines. Part 1 holds one sentence
    per line with its label as the last character, part 2 holds the
    highlights (one per line) and part 3 holds ``id:name`` entity mappings
    (one per line). Part 0 is not used here.

    Side effect: every entity mapping read is stored in the module-level
    ``entities`` dict (id -> name). Mappings read before a malformed entity
    line are kept even though the call then raises.

    Args:
        file: An open, readable text file object. It is always closed before
            this function returns or raises.

    Returns:
        A dict with the keys ``"lines"`` (sentences with the label removed),
        ``"output"`` (integer labels, parallel to ``"lines"``) and
        ``"summary"`` (the highlight lines).

    Raises:
        ValueError: If a sentence line does not end in a digit, or an entity
            line contains no ``:`` separator.
        IndexError: If the file has fewer than four parts, or a sentence line
            is empty.
    """
    global entities
    try:
        parts = file.read().split('\n\n')

        # Sentences: the last character is the label, the rest is the text.
        sentence_rows = parts[1].split('\n')
        output = [int(row[-1]) for row in sentence_rows]
        lines = [row[:-1].rstrip() for row in sentence_rows]

        # Entities: split on the first ':' only, so names that themselves
        # contain ':' are stored whole instead of being truncated. A line
        # without ':' still fails the unpacking with ValueError.
        for mapping in parts[3].split('\n'):
            entity_id, name = mapping.split(":", 1)
            entities[entity_id] = name

        return {"lines": lines, "output": output, "summary": parts[2].split('\n')}
    finally:
        # Release the handle on success and on failure alike.
        file.close()

In [11]:
def get_processed_files(folder, file_names):
    """Parse every listed file in ``folder`` and collect the results.

    Args:
        folder: Directory containing the files, with or without a trailing
            path separator.
        file_names: Iterable of file names relative to ``folder``.

    Returns:
        A list with the value returned by ``file_parse`` for each file, in
        the order of ``file_names``. Files for which a ValueError is raised
        are left out.
    """
    docs = []
    for file_name in file_names:
        path = os.path.join(folder, file_name)
        try:
            # The with-block closes the handle even when parsing fails;
            # closing an already closed file is harmless.
            with open(path) as handle:
                docs.append(file_parse(handle))
        except ValueError:
            # Best effort: a file that cannot be parsed is skipped rather
            # than aborting the whole load.
            continue
    return docs

In [12]:
# Replace the list of training file names with the parsed documents.
# NOTE: TRAIN changes meaning here, so re-running this cell on its own would
# pass parsed documents as file names; re-run the listing cell first.
TRAIN = get_processed_files(folder=TRAIN_FOLDER, file_names=TRAIN)

In [13]:
# Replace the test and validation file-name lists with the parsed documents.
# NOTE: both names change meaning here, so re-running this cell on its own
# would pass parsed documents as file names; re-run the listing cell first.
TEST = get_processed_files(folder=TEST_FOLDER, file_names=TEST)
DEV = get_processed_files(folder=DEV_FOLDER, file_names=DEV)