# Labeling and Feature Engineering

In [1]:
import copy
import json
import numpy as np
import pandas as pd

from modules.preprocessing.tokenizer import body_to_token
from modules.preprocessing.vectorizer import generate_bow, vectorize_data
from modules.utils.s3 import get_from_s3, put_to_s3
from constants import BUCKET, DATA_CLEAN_PREFIX, DATA_RAW_PREFIX, DATA_PROCESSED_PREFIX

## Get Y Labels

The `TOPICS` tags in the Reuters corpus contain the topic labels assigned to each document. The cell below loads the list of all possible labels.

In [2]:
# Download the file that lists every possible topic label, one label per line
raw_label_text = get_from_s3(BUCKET, f"{DATA_RAW_PREFIX}all-topics-strings.lc.txt")

# Trim whitespace around the whole text first, then around each individual line
labels = [line.strip() for line in raw_label_text.strip().split("\n")]

print(f"Retrieved {len(labels)} labels.")
", ".join(labels)

Retrieved 135 labels.


'acq, alum, austdlr, austral, barley, bfr, bop, can, carcass, castor-meal, castor-oil, castorseed, citruspulp, cocoa, coconut, coconut-oil, coffee, copper, copra-cake, corn, corn-oil, cornglutenfeed, cotton, cotton-meal, cotton-oil, cottonseed, cpi, cpu, crude, cruzado, dfl, dkr, dlr, dmk, drachma, earn, escudo, f-cattle, ffr, fishmeal, flaxseed, fuel, gas, gnp, gold, grain, groundnut, groundnut-meal, groundnut-oil, heat, hk, hog, housing, income, instal-debt, interest, inventories, ipi, iron-steel, jet, jobs, l-cattle, lead, lei, lin-meal, lin-oil, linseed, lit, livestock, lumber, lupin, meal-feed, mexpeso, money-fx, money-supply, naphtha, nat-gas, nickel, nkr, nzdlr, oat, oilseed, orange, palladium, palm-meal, palm-oil, palmkernel, peseta, pet-chem, platinum, plywood, pork-belly, potato, propane, rand, rape-meal, rape-oil, rapeseed, red-bean, reserves, retail, rice, ringgit, rubber, rupiah, rye, saudriyal, sfr, ship, silk, silver, singdlr, skr, sorghum, soy-meal, soy-oil, soybean, st

## Create Word Tokens and Split Train and Test Data

In [None]:
# Build tokenized X inputs and 0/1 Y label vectors for the train and test splits

# One container per split, each holding parallel lists of tokens, labels and ids
template = {"X": [], "Y": [], "ids": []}
data = {split: copy.deepcopy(template) for split in ("train", "test")}

# Download and parse the cleaned dataset from S3
response = get_from_s3(BUCKET, f"{DATA_CLEAN_PREFIX}dataset.json")
all_entries = json.loads(response)

for position, record in enumerate(all_entries, start=1):
    # Report progress every 1000 entries
    if not position % 1000:
        print("Reading entry #{}...".format(position))

    # Skip entries that are excluded from the train/test dataset
    if record["split"] == "not-used":
        continue
    # Skip entries that have an empty text body
    if record["body"] == "":
        continue

    # Tokenize the text body (Note: takes a while to run)
    tokens = body_to_token(record["body"])

    # Skip entries whose body yields no tokens at all
    if len(tokens) == 0:
        continue

    # One flag per known label: 1 when the label is found in the entry's topics, else 0
    flags = [1 if label.strip() in record["topics"] else 0 for label in labels]

    # File the entry under its split; a split other than "train"/"test" raises KeyError
    target = data[record["split"]]
    target["X"].append(tokens)
    target["Y"].append(flags)
    target["ids"].append(record["id"])

Reading entry #1000...


In [None]:
# Dump every split/field combination to its own local JSON file

# (destination path, payload) pairs, listed in the order they are written
local_dumps = (
    ("data/pretrain_X.json", data["train"]["X"]),
    ("data/pretest_X.json", data["test"]["X"]),
    ("data/pretrain_Y.json", data["train"]["Y"]),
    ("data/pretest_Y.json", data["test"]["Y"]),
    ("data/pretrain_ids.json", data["train"]["ids"]),
    ("data/pretest_ids.json", data["test"]["ids"]),
)

for destination, payload in local_dumps:
    with open(destination, "w") as file:
        json.dump(payload, file)

In [None]:
# Push the tokenized data, labels and ids for both splits to S3 as JSON

# (payload, S3 key) pairs, listed in upload order: train first, then test
vectorized_uploads = (
    (data["train"]["X"], f"{DATA_PROCESSED_PREFIX}vectorized/train_X.json"),
    (data["train"]["Y"], f"{DATA_PROCESSED_PREFIX}vectorized/train_Y.json"),
    (data["train"]["ids"], f"{DATA_PROCESSED_PREFIX}vectorized/train_ids.json"),
    (data["test"]["X"], f"{DATA_PROCESSED_PREFIX}vectorized/test_X.json"),
    (data["test"]["Y"], f"{DATA_PROCESSED_PREFIX}vectorized/test_Y.json"),
    (data["test"]["ids"], f"{DATA_PROCESSED_PREFIX}vectorized/test_ids.json"),
)

for payload, s3_key in vectorized_uploads:
    put_to_s3(json.dumps(payload), BUCKET, s3_key)

## Use Bag of Words to Create Features

In [None]:
# Read the tokenized inputs and labels for both splits back from S3
train_X, train_Y, test_X, test_Y = (
    json.loads(get_from_s3(BUCKET, s3_key))
    for s3_key in (
        f"{DATA_PROCESSED_PREFIX}vectorized/train_X.json",
        f"{DATA_PROCESSED_PREFIX}vectorized/train_Y.json",
        f"{DATA_PROCESSED_PREFIX}vectorized/test_X.json",
        f"{DATA_PROCESSED_PREFIX}vectorized/test_Y.json",
    )
)

In [None]:
# Build the bag of words from the training documents only
freq_counter = generate_bow(train_X)

# Length of each training document, used for a quick sanity report
lengths = [len(document) for document in train_X]
print(f"Out of {len(train_X)} documents, the shortest has {min(lengths)} words and the longest has {max(lengths)} words.")

In [None]:
# Map every word in the bag of words to an integer code, numbering from 2
# NOTE(review): codes 0 and 1 are presumably reserved by vectorize_data (e.g. padding/unknown) - confirm
word_encoding = dict(zip(freq_counter.keys(), range(2, len(freq_counter) + 2)))

In [None]:
# Encode the train and test documents with the same word encoding
# NOTE: this overwrites train_X/test_X, so the cell should not be re-run without reloading the data
train_X, test_X = (
    vectorize_data(word_encoding, documents) for documents in (train_X, test_X)
)

In [None]:
# Wrap each of the four datasets in a DataFrame
train_X, train_Y, test_X, test_Y = map(
    pd.DataFrame, (train_X, train_Y, test_X, test_Y)
)

In [None]:
# Persist each dataset as a CSV without index or header row, locally and on S3
csv_options = {"index": False, "header": False}

# (frame, local path, S3 key) for each dataset
csv_outputs = (
    (train_X, "data/train_X.csv", f"{DATA_PROCESSED_PREFIX}train_X.csv"),
    (train_Y, "data/train_Y.csv", f"{DATA_PROCESSED_PREFIX}train_Y.csv"),
    (test_X, "data/test_X.csv", f"{DATA_PROCESSED_PREFIX}test_X.csv"),
    (test_Y, "data/test_Y.csv", f"{DATA_PROCESSED_PREFIX}test_Y.csv"),
)

# Write all local copies first
for frame, local_path, _ in csv_outputs:
    frame.to_csv(local_path, **csv_options)

# Then upload the same CSV content to S3
for frame, _, s3_key in csv_outputs:
    put_to_s3(frame.to_csv(**csv_options), BUCKET, s3_key)