<a href="https://colab.research.google.com/github/steph1793/CNN-DailyMail-Bin-To-TFRecords/blob/master/cnn_dailymail_dataset_transform_to_frecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount your Google Drive to access the stored data

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive so the following cells can reach the data stored there.
drive.mount("/content/drive")

In [2]:
ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [7]:
cd drive/My\ Drive

/content/drive/My Drive


In [8]:
ls

 [0m[01;34mcnn-dailymail[0m/          [01;34mDatasets[0m/      [01;34mPointer_Generator_Summarizer[0m/
[01;34m'Colab Notebooks'[0m/       [01;34mEvaluator[0m/     [01;34mTokenizer[0m/
[01;34m'Colab Notebooks (1)'[0m/   [01;34mpointer_gen[0m/


## Import dependencies

In [0]:
import ntpath
import os
import glob
import struct
import tensorflow as tf # tensorflow 1.14, 2.0.0-alpha, 2.0.0-beta1
from tensorflow.core.example import example_pb2
import argparse

## Code

Please refer to the GitHub repository for more information about the code (comments and documentation):
https://github.com/steph1793/CNN-DailyMail-Bin-To-TFRecords

### Example generator

In [0]:
def example_generator(file):
  """Yield the serialized examples stored back-to-back in an open binary file.

  Each record is an 8-byte length header (unpacked with the native 'q'
  format) followed by that many payload bytes, which are parsed with
  ``example_pb2.Example.FromString``.

  Args:
    file: object whose ``read(n)`` returns bytes, e.g. a file opened in 'rb' mode.

  Yields:
    One parsed ``Example`` per record, in file order.

  Raises:
    struct.error: if a header or a payload is shorter than announced.
  """
  header_size = 8

  header = file.read(header_size)
  while header:  # an empty read means the file is exhausted
    (payload_size,) = struct.unpack('q', header)
    payload = file.read(payload_size)
    # Unpacking with an exact-length format also validates the payload size.
    (serialized,) = struct.unpack('%ds' % payload_size, payload)
    yield example_pb2.Example.FromString(serialized)
    header = file.read(header_size)

In [0]:

def art_abs_example(article, abstract):
  """Build a ``tf.train.Example`` with an 'article' and an 'abstract' feature.

  Args:
    article: article text as ``str`` or ``bytes``, or an object exposing
      ``numpy()`` (such as an eager tensor) that returns one of those.
    abstract: abstract text, same accepted types as ``article``.

  Returns:
    A ``tf.train.Example`` whose two features are single-element bytes lists.
  """

  def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    # BytesList won't unpack a string from an EagerTensor. Duck-typing on
    # numpy() avoids creating a throw-away tf.constant on every call.
    if callable(getattr(value, 'numpy', None)):
      value = value.numpy()
    # Only text needs encoding; bytes (including what numpy() returns for a
    # string tensor) have no encode() and are stored as they are.
    if isinstance(value, str):
      value = value.encode()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

  feature = {
      'article': _bytes_feature(article),
      'abstract': _bytes_feature(abstract)
  }

  return tf.train.Example(features=tf.train.Features(feature=feature))

### Main method

In [0]:
def make_TFRecords(data_path, new_data_path):
  """Convert every chunked ``.bin`` file under ``data_path`` to a ``.tfrecords`` file.

  The directory layout below the deepest folder shared by all the ``.bin``
  files is reproduced inside ``new_data_path``; each output keeps the name of
  its input with the extension replaced by ``.tfrecords``.

  Args:
    data_path: directory searched recursively for ``*.bin`` files.
    new_data_path: directory receiving the output files (created if missing).

  Raises:
    AssertionError: if no ``.bin`` file is found under ``data_path``.

  Notes:
    * Input files that cannot be opened are reported and skipped.
    * Examples whose article/abstract is missing or not decodable are logged
      and skipped, and so are examples with an empty article.
  """
  print("Starting ...")
  os.makedirs(new_data_path, exist_ok=True)
  filelist = glob.glob(data_path+"/**/*.bin", recursive=True) # get the list of datafiles
  assert filelist, "No binary files"

  # tf.logging is gone in TF 2.x; the compat.v1 alias exists in both 1.14 and 2.x.
  log = tf.compat.v1.logging

  common_path = os.path.commonpath(filelist)
  if not os.path.isdir(common_path):
    # With a single input the "common path" is the file itself; use its folder
    # so the output keeps the file name instead of collapsing to ".tfrecords".
    common_path = os.path.dirname(common_path) or os.curdir

  for f in filelist:
    try:
      bin_file = open(f, 'rb')
    except OSError:
      print("Cannot open file : {}".format(f))
      continue

    # relpath never leaves a leading separator, so no "//" in the output path.
    relative_stem = os.path.splitext(os.path.relpath(f, common_path))[0]
    record_file = os.path.join(new_data_path, relative_stem + '.tfrecords')

    with bin_file:  # guarantees the input handle is released
      os.makedirs(os.path.dirname(record_file), exist_ok=True)
      with tf.io.TFRecordWriter(record_file) as writer:
        for e in example_generator(bin_file):
          try:
            article_text = e.features.feature['article'].bytes_list.value[0].decode()
            abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode()
          except (ValueError, IndexError):
            # IndexError: feature absent (empty value list);
            # ValueError: bytes that are not valid UTF-8.
            log.error('Failed to get article or abstract from example')
            continue

          if len(article_text) == 0:
            # Check before writing so the example really is skipped.
            log.warning('Found an example with empty article text. Skipping it.')
            continue

          tf_example = art_abs_example(article_text, abstract_text)
          writer.write(tf_example.SerializeToString())

    print("Chunked file {} processed and saved to {}".format(f, record_file))

## Use case

In [0]:
# Folder searched (recursively) for the chunked .bin files to convert.
data_path = "pointer_gen/cnn-dailymail/finished_files/test"
# Folder that receives the generated .tfrecords files.
new_data_path = "tfrecords_folder"

In [16]:
# Run the conversion; one line is printed per processed .bin file.
make_TFRecords(data_path, new_data_path)

Starting ...
Chunked file pointer_gen/cnn-dailymail/finished_files/test/test_001.bin processed and saved to tfrecords_folder//test_001.tfrecords
Chunked file pointer_gen/cnn-dailymail/finished_files/test/test_000.bin processed and saved to tfrecords_folder//test_000.tfrecords
Chunked file pointer_gen/cnn-dailymail/finished_files/test/test2/test_002.bin processed and saved to tfrecords_folder//test2/test_002.tfrecords
Chunked file pointer_gen/cnn-dailymail/finished_files/test/test3/test_003.bin processed and saved to tfrecords_folder//test3/test_003.tfrecords


In [18]:
ls $new_data_path

test_000.tfrecords  test_001.tfrecords  [0m[01;34mtest2[0m/  [01;34mtest3[0m/
