# CommLit: Raw Data Setup

In [1]:
# import AWS packages
# boto3 is imported only when the `aws` flag below is True; set the flag to
# False to skip the import.
aws = True
if aws:
    import boto3

# import regular packages
import os
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
import pandas as pd
import spacy

# set environment
# NOTE(review): cl.env_config presumably populates DATA_DIR from config.json
# (the cell output reports the value being set) -- confirm in commlit.
import commlit as cl
cl.env_config("config.json")

# Root directory for every input/output file used in this notebook.
# Read the variable directly: os.path.join() with a single argument is a no-op,
# and passing it None (variable unset) only produced an opaque TypeError.
comp_dir = os.environ.get("DATA_DIR")
if comp_dir is None:
    raise RuntimeError(
        "DATA_DIR is not set; check config.json and the cl.env_config call above"
    )

Value for DATA_DIR has been set!


## Setup

In [2]:
# NLP model: load the small English spaCy pipeline and build the tag lookup
# table from the labels its tagger component can emit
nlp = spacy.load("en_core_web_sm")
tag_df = cl.gen_tag_df(nlp.pipe_labels['tagger'])

# Word frequencies: headerless, tab-separated file read as (word, count) rows.
# keep_default_na=False stops pandas from converting literal words such as
# "NULL", "NA" or "NaN" into missing values, which would silently drop those
# words from the frequency table.
freq_df = pd.read_csv(
    os.path.join(comp_dir, "google-books-common-words.txt"),
    delimiter="\t",
    header=None,
    names=["word", "count"],
    keep_default_na=False,
)
# normalise case so lookups against this table can use lowercase words
freq_df.loc[:, "word"] = freq_df["word"].str.lower()

# Training data: keep only the columns used downstream
train_df = pd.read_csv(os.path.join(comp_dir, "train.csv"))
train_df = train_df[["id", "excerpt", "target", "standard_error"]]
train_df.sample(3)

Unnamed: 0,id,excerpt,target,standard_error
826,90d8170d5,Kwesi's parents were Papa and Maame. Maame alw...,-1.061225,0.473119
555,916faaa6b,The respiratory system (called also respirator...,-1.474937,0.492069
2056,823f90bf4,"There were no mice for kitty, and what could s...",0.666116,0.532948


## Word Feature Extraction

In [3]:
# pair every excerpt with its id so the id travels through the spaCy pipeline
# alongside the parsed document
doc_tups = list(train_df[["excerpt","id"]].itertuples(index=False, name=None))
df_list = []

# nlp.pipe yields results lazily and has no length, so tqdm cannot infer the
# number of iterations; pass it explicitly to get a real progress bar and ETA
# for this multi-minute loop
for doc, i in tqdm(nlp.pipe(doc_tups, as_tuples=True), total=len(doc_tups)):

    # build the per-token feature table for this excerpt and tag it with its id
    token_df = cl.gen_raw_word_features(doc, tag_df, freq_df)
    token_df.loc[:, "id"] = i
    df_list.append(token_df)

# concatenate once at the end (not inside the loop) into a single frame
df = pd.concat(df_list, ignore_index=True)
df.shape

2834it [06:39,  7.10it/s]


(573290, 151)

In [4]:
# persist the compiled word-feature table under the data directory
raw_feats_path = os.path.join(comp_dir, "raw_feats.parquet")
df.to_parquet(raw_feats_path)