# Labeling and Feature Engineering

In [1]:
import json
import boto3

from constants import BUCKET, DATA_CLEAN_PREFIX, DATA_RAW_PREFIX, DATA_PROCESSED_PREFIX


# S3 client used by the download and listing cells below (region pinned to us-west-1).
s3 = boto3.client("s3", region_name="us-west-1")

## Get Y Labels

The `TOPICS` tags in the Reuters corpus hold the topic labels assigned to each document. These labels are the prediction targets (Y).

In [3]:
# Get all possible labels
#
# The topics file holds one label per line. Some lines carry trailing
# whitespace (e.g. "cotton " and "naphtha     "), so every label is stripped
# individually; an unstripped label would not compare equal to a clean topic
# string when the Y indicators are built.

s3_object = s3.get_object(Bucket=BUCKET, Key=f"{DATA_RAW_PREFIX}all-topics-strings.lc.txt")
status_code = s3_object["ResponseMetadata"]["HTTPStatusCode"]
if status_code != 200:
    # Fail here rather than leaving `labels` undefined for the cells that follow.
    raise RuntimeError(f"Could not download the topic list (HTTP {status_code}).")

response = s3_object["Body"].read()
# Drop blank lines so an empty string never becomes a label.
labels = [line.strip() for line in response.decode().splitlines() if line.strip()]
print(f"Retrieved {len(labels)} labels.")

Retrieved 135 labels.


In [4]:
# Show every label on a single line for a quick visual check.
", ".join(labels)

'acq, alum, austdlr, austral, barley, bfr, bop, can, carcass, castor-meal, castor-oil, castorseed, citruspulp, cocoa, coconut, coconut-oil, coffee, copper, copra-cake, corn, corn-oil, cornglutenfeed, cotton , cotton-meal, cotton-oil, cottonseed, cpi, cpu, crude, cruzado, dfl, dkr, dlr, dmk, drachma, earn, escudo, f-cattle, ffr, fishmeal, flaxseed, fuel, gas, gnp, gold, grain, groundnut, groundnut-meal, groundnut-oil, heat, hk, hog, housing, income, instal-debt, interest, inventories, ipi, iron-steel, jet, jobs, l-cattle, lead, lei, lin-meal, lin-oil, linseed, lit, livestock, lumber, lupin, meal-feed, mexpeso, money-fx, money-supply, naphtha     , nat-gas, nickel, nkr, nzdlr, oat, oilseed, orange, palladium, palm-meal, palm-oil, palmkernel, peseta, pet-chem, platinum, plywood, pork-belly, potato, propane, rand, rape-meal, rape-oil, rapeseed, red-bean, reserves, retail, rice, ringgit, rubber, rupiah, rye, saudriyal, sfr, ship, silk, silver, singdlr, skr, sorghum, soy-meal, soy-oil, soybe

## Create Word Tokens and Split Train and Test Data

In [2]:
import numpy as np
import pandas as pd
from modules.preprocessing.clean import body_to_token, generate_bow, vectorize_data

In [6]:
# Get sgm filenames
#
# Collect the key of every object stored under the clean-data prefix.
# A single list call returns at most 1000 keys, and the response has no
# "Contents" entry when nothing matches the prefix, so paginate and treat a
# missing "Contents" as an empty page.

paginator = s3.get_paginator("list_objects_v2")
s3_objects = [
    obj["Key"]
    for page in paginator.paginate(Bucket=BUCKET, Prefix=DATA_CLEAN_PREFIX)
    for obj in page.get("Contents", [])
]

In [7]:
# Get Y labels, X words, and train/test split
#
# X        -- per document, the value returned by body_to_token for its body
# Y        -- per document, a list of 0/1 flags aligned with `labels`
# is_train -- per document, 1 when its split is "train", otherwise 0
# is_test  -- per document, 1 when its split is "test", otherwise 0

Y = []
X = []
is_train = []
is_test = []


s3_object = s3.get_object(Bucket=BUCKET, Key=f"{DATA_CLEAN_PREFIX}dataset.json")
status_code = s3_object["ResponseMetadata"]["HTTPStatusCode"]
if status_code != 200:
    # Fail here rather than continuing with four empty lists.
    raise RuntimeError(f"Could not download dataset.json (HTTP {status_code}).")

response = s3_object["Body"].read()
all_entries = json.loads(response.decode())  # list of entries

for entry in all_entries:
    split = entry["split"]

    # Skip entries that belong to neither split. (A bare `next` here is a
    # no-op expression; `continue` is what actually skips the entry.)
    if split == "not-used":
        continue

    # Tokenize text body (Note: takes a while to run)
    X.append(body_to_token(entry["body"]))

    # Label Y: one indicator per known label
    Y.append([int(label in entry["topics"]) for label in labels])

    # Record which split this entry belongs to
    is_train.append(1 if split == "train" else 0)
    is_test.append(1 if split == "test" else 0)


In [233]:
# Split to train and test data
#
# Rows are selected with boolean masks built from the is_train / is_test
# flags. This avoids stacking the flags on top of the data (and copying the
# whole dataset four times) only to slice them off again.

np_X = np.array(X, dtype=object)
np_Y = np.array(Y, dtype=object)

train_mask = np.asarray(is_train) == 1
test_mask = np.asarray(is_test) == 1

train_X = np_X[train_mask]  # one entry per training document
test_X = np_X[test_mask]  # one entry per test document

train_Y = np_Y[train_mask]  # shape: n_train, len(labels)
test_Y = np_Y[test_mask]  # shape: n_test, len(labels)

In [281]:
# Write to file
#
# Dump each split as JSON so the tokenization step does not have to be
# repeated; the "Open file" cell reads these same four paths back.

for path, split_array in (
    ("data/pretrain_X.txt", train_X),
    ("data/pretest_X.txt", test_X),
    ("data/pretrain_Y.txt", train_Y),
    ("data/pretest_Y.txt", test_Y),
):
    with open(path, "w") as file:
        json.dump(split_array.tolist(), file)

In [3]:
# Open file
#
# Reload the four JSON snapshots, in the order train_X, test_X, train_Y, test_Y.

reloaded = []
for path in (
    "data/pretrain_X.txt",
    "data/pretest_X.txt",
    "data/pretrain_Y.txt",
    "data/pretest_Y.txt",
):
    with open(path, "r") as file:
        reloaded.append(json.load(file))

train_X, test_X, train_Y, test_Y = reloaded

## Explore Word Similarity

Try: Use `nn.Embedding` in PyTorch instead of Word2Vec.

In [9]:
# import gensim

# # Create continuous bag of words based on training data
# model = gensim.models.Word2Vec(train_X, min_count=1)

# print(f"Similarity of 'corn' and 'crop' is {model.wv.similarity('corn', 'crop')}.")

# print("Most similar words...")
# model.wv.most_similar("share")[:5]

## Use Bag of Words to Create Features

In [4]:
# Build the word-frequency table from train_X, then report how long the
# shortest and longest training documents are.
freq_counter = generate_bow(train_X)
lengths = [len(document) for document in train_X]
print(f"Out of {len(train_X)} documents, the shortest has {min(lengths)} words and the longest has {max(lengths)} words.")


        The most common words from 23196 documents are 
        said (35764), mln (17106), dlr (15886), reuter (13466), pct (12022).
    
Truncated word dictionary to 10000 words.
Out of 14668 documents, the shortest has 0 words and the longest has 820 words.


In [5]:
# Create encoding
# Map each word in freq_counter to an integer id, numbering from 2.
# NOTE(review): ids 0 and 1 are never assigned here -- presumably reserved by
# vectorize_data (e.g. padding / unknown word); confirm against that helper.
word_encoding = {
    token: token_id
    for token_id, token in enumerate(freq_counter.keys(), start=2)
}

In [7]:
# Apply on both train and test dataset
# Both splits are encoded with the same word_encoding, which was built from
# the training data only.
# NOTE: train_X / test_X are rebound to the encoded result, so re-running this
# cell would feed already-encoded data back into vectorize_data. Re-run the
# "Open file" cell first to restore the token lists.
train_X = vectorize_data(word_encoding, train_X)
test_X = vectorize_data(word_encoding, test_X)

In [9]:
# Convert all to DataFrame
# Wrap each of the four splits in a pandas DataFrame, keeping the same names.
train_X, train_Y, test_X, test_Y = (
    pd.DataFrame(split) for split in (train_X, train_Y, test_X, test_Y)
)

In [10]:
# Filter out training data where there are no words
# A row is kept when its encoded values sum to more than zero; the same mask
# is applied to train_Y so features and labels stay aligned. The test split
# is left untouched.
no_0_words = (train_X.sum(axis=1) > 0).to_numpy()
train_X = train_X.loc[no_0_words]
train_Y = train_Y.loc[no_0_words]

In [13]:
# Save as csv
# Write each frame without index or header so the files contain values only.
for frame, csv_path in (
    (train_X, "data/train_X.csv"),
    (train_Y, "data/train_Y.csv"),
    (test_X, "data/test_X.csv"),
    (test_Y, "data/test_Y.csv"),
):
    frame.to_csv(csv_path, index=False, header=False)

In [14]:
# Upload to S3
#
# Use a separately named client for the uploads. Rebinding `s3` to a
# boto3 resource would shadow the client created at the top of the notebook,
# so re-running the earlier download/listing cells would then fail.
s3_uploader = boto3.client('s3')
for csv_name in ("train_X.csv", "train_Y.csv", "test_X.csv", "test_Y.csv"):
    # Local ./data/<name> is stored under the processed-data prefix with the same name.
    s3_uploader.upload_file(f"./data/{csv_name}", BUCKET, f"{DATA_PROCESSED_PREFIX}{csv_name}")