# Parse Data
1. Read each SGML file in the `data/raw` directory of the S3 bucket.
2. Parse each file to get the text body, title, topics, and other content.
3. Store the parsed data as JSON in the `data/clean` directory of the same S3 bucket.

## Download data

In [1]:
# Unzipped locally
# This could be done in S3 instead

# import tarfile
# import os

# TARFILE_NAME = "Reuters_classification_data.tar.gz"
# FOLDER_NAME = "raw"

# root_directory = os.path.dirname(os.getcwd()) + "/data"
# archive_path = os.path.join(root_directory, TARFILE_NAME) 
# data_path = os.path.join(root_directory, FOLDER_NAME) 

# Unzip tarfile and put individual text files into the "data/raw" folder
# tarfile.open(archive_path, 'r:gz').extractall(data_path)

## Import parser module

In [2]:
# Import modules for Jupyter

import sys
from pathlib import Path

# Make the "modules" directory, located one level above the current working
# directory, importable from this notebook (added to sys.path only once).
module_path = str(Path.cwd().parents[0].joinpath("modules"))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from reuters_parser.reuters_parser import ReutersParser

## Parse each file in the S3 bucket

Read each file in the `data/raw/` directory, parse it, and store the result as JSON in the `data/clean/` directory.

In [6]:
import boto3
import json

In [3]:
# S3 client bound to the us-west-1 region (no explicit credentials are passed)
s3 = boto3.client("s3", region_name="us-west-1")

# Bucket containing the data/raw and data/clean prefixes.
# NOTE(review): left as an empty string here — presumably a placeholder to be
# filled in before the cells below are run.
BUCKET = ""

In [4]:
# Get sgm files from BUCKET/data/raw...

DATA_RAW_PREFIX = "data/raw/"

# A single list call returns at most 1,000 keys, so walk every result page.
# Pages for a prefix with no objects carry no "Contents" entry, hence the
# .get() default instead of direct indexing.
s3_data_raw_objects = [
    obj["Key"]
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET,
        Prefix=DATA_RAW_PREFIX,
    )
    for obj in page.get("Contents", [])
]

# Keep only keys whose extension is .sgm (not keys that merely contain it),
# in a deterministic order.
sgm_files = sorted(key for key in s3_data_raw_objects if key.endswith(".sgm"))

In [12]:
# Parse and structure sgm files, storing the parsed docs of each file as a
# list of JSON-ready items in a dict keyed by the file's S3 key

parser = ReutersParser()
docs = {}
for sgm_file in sgm_files:
    s3_object = s3.get_object(Bucket=BUCKET, Key=sgm_file)
    status_code = s3_object["ResponseMetadata"]["HTTPStatusCode"]
    if status_code != 200:
        # Nothing was parsed for this key, so recording the parser's contents
        # here would file an earlier object's docs under the wrong key.
        print(f"Skipping {sgm_file}: unexpected HTTP status {status_code}")
        continue

    response = s3_object["Body"].read()
    parser.parse(response)
    # NOTE(review): one parser instance is shared by all files; if parse()
    # appends to reuters_factory instead of replacing it, each entry would
    # also contain the docs of earlier files — confirm against ReutersParser.
    docs[sgm_file] = [doc.to_json() for doc in parser.reuters_factory]

In [26]:
DATA_CLEAN_PREFIX = "data/clean/"

# Upload each file's parsed docs as one JSON object under data/clean/, reusing
# the raw key's base name (prefix and everything from ".sgm" on stripped).
for sgm_file, doc in docs.items():
    name_start = len(DATA_RAW_PREFIX)
    name_end = sgm_file.find(".sgm")
    filename = sgm_file[name_start:name_end]

    payload = json.dumps(doc).encode('UTF-8')
    clean_key = f"{DATA_CLEAN_PREFIX}{filename}.json"
    s3.put_object(Body=payload, Bucket=BUCKET, Key=clean_key)