# Parse Data
1. Read each SGML file in the `data/raw` directory of the S3 bucket.
2. Parse each file to get the text body, title, topics, and other content.
3. Store the parsed data as JSON in the `data/clean` directory of the same S3 bucket.

## Download data

In [1]:
# Unzipped locally
# This could be done in S3 instead

# import tarfile
# import os

# TARFILE_NAME = "Reuters_classification_data.tar.gz"
# FOLDER_NAME = "raw"

# root_directory = os.path.dirname(os.getcwd()) + "/data"
# archive_path = os.path.join(root_directory, TARFILE_NAME) 
# data_path = os.path.join(root_directory, FOLDER_NAME) 

# Unzip tarfile and put individual text files into the "data" folder
# tarfile.open(archive_path, 'r:gz').extractall(data_path)

## Parse each file in the S3 bucket

Read each file in the `data/raw/` directory, parse it, and store the combined result as JSON in the `data/clean/` directory.

In [1]:
import boto3
import json

from modules.reuters_parser.reuters_parser import ReutersParser
from constants import BUCKET, DATA_RAW_PREFIX, DATA_CLEAN_PREFIX

In [2]:
# Build the boto3 S3 client used by the cells below (region: us-west-1)
s3 = boto3.client("s3", region_name="us-west-1")

In [3]:
# Get sgm files from BUCKET/data/raw...

# A single list call returns at most 1,000 keys, so walk every page of the
# listing. A page with no matching objects has no "Contents" key, which is
# why .get() with an empty default is used instead of direct indexing.
s3_data_raw_objects = []
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=BUCKET, Prefix=DATA_RAW_PREFIX):
    s3_data_raw_objects.extend(page.get("Contents", []))

# Object keys (full paths inside the bucket) of everything under the prefix
s3_objects = [s3_object["Key"] for s3_object in s3_data_raw_objects]

# Keep only keys whose extension is .sgm (a substring test would also accept
# keys such as "x.sgm.bak"), sorted so files are processed in a stable order
sgm_files = sorted(key for key in s3_objects if key.endswith(".sgm"))

In [6]:
# # Alternative (disabled): parse and structure each sgm file separately, storing each doc
# # as an element in a list and writing one JSON file per sgm file

# for sgm_file in sgm_files:
#     parser = ReutersParser()
    
#     s3_object = s3.get_object(Bucket=BUCKET, Key=sgm_file)
#     if s3_object["ResponseMetadata"]["HTTPStatusCode"] == 200:
#         response = s3_object["Body"].read()
#         parser.parse(response)
#     doc = list(map(lambda x: x.to_json(), parser.reuters_factory))

#     filename = sgm_file[len(DATA_RAW_PREFIX):sgm_file.find(".sgm")]
#     s3.put_object(
#         Body=bytes(json.dumps(doc).encode('UTF-8')),
#         Bucket=BUCKET,
#         Key=f"{DATA_CLEAN_PREFIX}{filename}.json",
#     )

In [8]:
# A single parser is reused for every file; its `reuters_factory` is read
# once, after the loop, to build the dataset.
parser = ReutersParser()

# Read each file
for sgm_file in sgm_files:
    print(f"Reading {sgm_file}...")
    s3_object = s3.get_object(Bucket=BUCKET, Key=sgm_file)
    # Skip any response that did not come back with HTTP 200
    if s3_object["ResponseMetadata"]["HTTPStatusCode"] != 200:
        continue
    response = s3_object["Body"].read()
    parser.parse(response)

# Write entire dataset to S3
doc = [reuters_doc.to_json() for reuters_doc in parser.reuters_factory]
filename = "dataset"
# Kept as the cell's final expression so the notebook displays the response
s3.put_object(
    Bucket=BUCKET,
    Key=f"{DATA_CLEAN_PREFIX}{filename}.json",
    Body=json.dumps(doc).encode('UTF-8'),
)

Reading data/raw/reut2-000.sgm...
Reading data/raw/reut2-001.sgm...
Reading data/raw/reut2-002.sgm...
Reading data/raw/reut2-003.sgm...
Reading data/raw/reut2-004.sgm...
Reading data/raw/reut2-005.sgm...
Reading data/raw/reut2-006.sgm...
Reading data/raw/reut2-007.sgm...
Reading data/raw/reut2-008.sgm...
Reading data/raw/reut2-009.sgm...
Reading data/raw/reut2-010.sgm...
Reading data/raw/reut2-011.sgm...
Reading data/raw/reut2-012.sgm...
Reading data/raw/reut2-013.sgm...
Reading data/raw/reut2-014.sgm...
Reading data/raw/reut2-015.sgm...
Reading data/raw/reut2-016.sgm...
Reading data/raw/reut2-017.sgm...
Reading data/raw/reut2-018.sgm...
Reading data/raw/reut2-019.sgm...
Reading data/raw/reut2-020.sgm...
Reading data/raw/reut2-021.sgm...


{'ResponseMetadata': {'RequestId': '3EE3CMGY81TM5AJC',
  'HostId': 'RFvZg8l8OnKpf59TMSh5wRLq757UJwwDTf9vJ53P2APoCQLHqMAw52OulnDqFnusqhsqX/MlOow=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'RFvZg8l8OnKpf59TMSh5wRLq757UJwwDTf9vJ53P2APoCQLHqMAw52OulnDqFnusqhsqX/MlOow=',
   'x-amz-request-id': '3EE3CMGY81TM5AJC',
   'date': 'Tue, 31 Aug 2021 17:15:19 GMT',
   'etag': '"5c08c0e6cab5ceea9b1be26c0f8bd160"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"5c08c0e6cab5ceea9b1be26c0f8bd160"'}