# Parse Data
1. Read each SGML file in the `data/raw` directory of the S3 bucket.
2. Parse each file to get the text body, title, topics, and other content.
3. Store the parsed data as JSON in the `data/clean` directory of the same S3 bucket.

## Download data

In [1]:
# Unzipped locally
# This could be done in S3 instead

# import tarfile
# import os

# TARFILE_NAME = "Reuters_classification_data.tar.gz"
# FOLDER_NAME = "raw"

# root_directory = os.path.dirname(os.getcwd()) + "/data"
# archive_path = os.path.join(root_directory, TARFILE_NAME) 
# data_path = os.path.join(root_directory, FOLDER_NAME) 

# Unzip the tarfile and extract the individual files into the "data/raw" folder
# tarfile.open(archive_path, 'r:gz').extractall(data_path)

## Parse each file in the S3 bucket

Read each file in the `data/raw/` directory, parse it, and store the result as JSON in the `data/clean/` directory.

In [2]:
import boto3
import json

from modules.reuters_parser.reuters_parser import ReutersParser
from modules.utils.s3 import list_s3, get_from_s3, put_to_s3
from constants import BUCKET, DATA_RAW_PREFIX, DATA_CLEAN_PREFIX

In [4]:
# List everything under BUCKET/DATA_RAW_PREFIX and keep only the keys that END
# in ".sgm". A suffix test (instead of a substring test) skips keys that merely
# contain ".sgm" somewhere in their name, e.g. "reut2-000.sgm.bak".
SGM_SUFFIX = ".sgm"
s3_objects = list_s3(BUCKET, DATA_RAW_PREFIX)
sgm_files = sorted(key for key in s3_objects if key.endswith(SGM_SUFFIX))

# Parse each sgm file and upload its documents as a single JSON array
for sgm_file in sgm_files:
    # A new parser instance is created for every file
    parser = ReutersParser()

    # Fetch the file from S3 (decode=False) and feed it to the parser
    response = get_from_s3(BUCKET, sgm_file, decode=False)
    parser.parse(response)
    doc = [article.to_json() for article in parser.reuters_factory]

    # Derive the output name by dropping the raw prefix from the front and the
    # ".sgm" suffix from the END of the key, so an earlier ".sgm" inside the
    # key cannot truncate the name.
    filename = sgm_file[len(DATA_RAW_PREFIX):-len(SGM_SUFFIX)]
    put_to_s3(json.dumps(doc), BUCKET, f"{DATA_CLEAN_PREFIX}{filename}.json")

Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-000.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-001.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-002.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-003.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-004.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-005.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-006.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-007.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-008.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-009.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-010.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-011.json
Uploaded file to s3://aiforallcapstone-crimson/data/clean/reut2-012.json
Uploaded file to s3://aiforallcapstone-crimson/data