## Notebook for Loading the CNN_Dailymail Dataset from HuggingFace

#### Imports

In [2]:
import datasets 
import pandas as pd
from datasets import load_dataset
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

The data comes from the [CNN/DailyMail dataset on Hugging Face](https://huggingface.co/datasets/cnn_dailymail).

#### Load Data

In [52]:
# Toggle for the subset size: True keeps 100 rows and writes the "_small" archive,
# False keeps 1000 rows and writes the regular archive.
small_dataset = False

In [53]:
# Fetch version 3.0.0 of the cnn_dailymail dataset through the HuggingFace `datasets` library.
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Row cap for the subset: 100 rows for the small dataset, 1000 rows for the large one.
max_len = 100 if small_dataset else 1000

In [54]:
# Retrieve up to the specified number of rows.
# The data is originally in a DatasetDict object with separate Datasets for train, test, and validation.
# Only the first `max_len` rows of the train Dataset are kept.
# NOTE(review): this rebinds `dataset` to the sliced result, so re-running this cell on its own
# presumably fails until the load cell has been run again — confirm.
dataset = dataset['train'][:max_len]

In [55]:
# Final data structure: a list that will hold one record (dict) per example.
data = []

In [56]:
# Pair each article with its highlights and store the pair as one
# {'article': ..., 'highlights': ...} record. Only these two columns are read from `dataset`.
# The list is built in a single expression instead of appending to an existing `data`,
# so this cell is idempotent: re-running it cannot duplicate rows, and it does not
# depend on `data` still being an empty list.
data = [
    {'article': article, 'highlights': highlight}
    for article, highlight in zip(dataset['article'], dataset['highlights'])
]

#### Convert to Pandas Dataframe

In [57]:
# Turn the list of records into a two-column DataFrame (article, highlights).
data = pd.DataFrame(data)

In [58]:
# Preview the first rows to sanity-check the article and highlights columns.
data.head()

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


#### Split Data

In [59]:
# Fixed seed so that train/val/test membership is identical on every run;
# without it, each execution would save a different partition.
SPLIT_SEED = 42

# Hold out 20% of the rows for test, then take 25% of the remaining 80% for validation,
# which yields a 60/20/20 train/val/test split overall.
train_val, test = train_test_split(data, test_size=.2, random_state=SPLIT_SEED)
train, val = train_test_split(train_val, test_size=.25, random_state=SPLIT_SEED)

# Inputs (X) are the articles and targets (y) are the highlights. Columns are selected
# by name rather than by position so the split does not depend on column order.
X_train, y_train = train['article'], train['highlights']
X_test, y_test = test['article'], test['highlights']
X_val, y_val = val['article'], val['highlights']

In [60]:
# Report how many examples landed in each split.
print(f"Train: {len(X_train)}")
print(f"Test: {len(X_test)}")
print(f"Val: {len(X_val)}")

Train: 600
Test: 200
Val: 200


#### Save Preprocessed Data

In [61]:
# Choose the archive path for the selected subset size, then write all six
# split arrays into that single .npz file under their own keys.
archive_path = '../data/cnn_dailymail_small.npz' if small_dataset else '../data/cnn_dailymail.npz'
np.savez(
    archive_path,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
)

#### Example Code for Loading Data

In [34]:
# How to load the data.
# NOTE(review): allow_pickle=True lets np.load unpickle stored objects, which can execute
# arbitrary code — only load archives you created yourself or otherwise trust.
# NOTE(review): this example reads the "_small" archive; which archive the save cell writes
# depends on `small_dataset`, so make sure the file being loaded actually exists.
# The `with` block closes the underlying file once the arrays have been read out;
# the extracted arrays remain usable afterwards, but `data` itself is closed.
with np.load('../data/cnn_dailymail_small.npz', allow_pickle=True) as data:
    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']
    X_val = data['X_val']
    y_val = data['y_val']
