## Notebook for Loading the CNN_Dailymail Dataset from HuggingFace

#### Imports

In [2]:
import datasets 
from datasets import load_dataset
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

The data comes from the [CNN/DailyMail dataset on Hugging Face](https://huggingface.co/datasets/cnn_dailymail).

#### Load Data

In [None]:
# Size toggle used by later cells: True keeps 100 rows and saves to cnn_dailymail_small.npz;
# False keeps 1000 rows and saves to cnn_dailymail.npz.
small_dataset = True

In [8]:
# Fetch version 3.0.0 of the CNN/DailyMail corpus through the HuggingFace `datasets` library.
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Row cap applied when subsetting the train split: 100 rows for the small dataset, 1000 for the large one.
max_len = 100 if small_dataset else 1000

In [9]:
# The data is originally in a DatasetDict object with separate Datasets for train, test, and validation.
# Only a subset of the train Dataset is used: at most `max_len` rows taken from its start.
# NOTE: this rebinds `dataset`, so the cell cannot be re-run on its own; re-run the load cell first.
train_split = dataset['train']
dataset = train_split[:max_len]

In [10]:
# Final data structure: a list that will hold one dict per example
# (keys 'id', 'highlights', 'article').
data = []

In [11]:
# `dataset` is indexed by column name ('article', 'highlights', 'id'); zipping the three
# columns walks them row by row so each example becomes one dictionary.
# `data` is rebuilt from scratch (instead of appending to the existing list) so that
# re-running this cell does not duplicate every record.
# The id is bound to `example_id` to avoid shadowing the builtin `id`.
data = [
    {'id': example_id, 'highlights': highlight, 'article': article}
    for article, highlight, example_id in zip(dataset['article'], dataset['highlights'], dataset['id'])
]

#### Split Data

In [14]:
# Fixed seed so that train/test/val membership is reproducible across kernel restarts;
# without `random_state`, train_test_split shuffles differently on every run and the
# saved .npz files would not be comparable between runs.
SPLIT_SEED = 42

# 60/20/20 split: hold out 20% of all records for test, then take 25% of the
# remaining 80% (= 20% of the total) for validation.
train_val, test = train_test_split(data, test_size=.2, random_state=SPLIT_SEED)
train, val = train_test_split(train_val, test_size=.25, random_state=SPLIT_SEED)

In [16]:
# Report how many records ended up in each split.
for label, split in (("Train: ", train), ("Test: ", test), ("Val: ", val)):
    print(label + str(len(split)))

Train: 60
Test: 20
Val: 20


#### Save Preprocessed Data

In [22]:
# Pick the output archive that matches the selected dataset size, then write the
# three splits as named arrays ('train', 'test', 'val') into a single .npz file.
out_path = '../data/cnn_dailymail_small.npz' if small_dataset else '../data/cnn_dailymail.npz'
np.savez(out_path, train=train, test=test, val=val)

#### Example Code for Loading Data

In [34]:
# How to load the data
# The splits were saved from Python lists of dicts, so they come back as object arrays
# and reading them requires allow_pickle=True.
# NOTE(review): unpickling can execute arbitrary code; only load .npz files you created or trust.
# The `with` block closes the archive's file handle once the three arrays have been read.
# The archive is bound to `archive` so the in-memory `data` list is not overwritten.
with np.load('../data/cnn_dailymail_small.npz', allow_pickle=True) as archive:
    train = archive['train']
    test = archive['test']
    val = archive['val']
