In [121]:
# Path handling and typing
import os
from typing import Sequence

# DataFrame manipulation
import pandas as pd
# Writing to parquet
import pyarrow as pa
import pyarrow.parquet as pq

def save_to_parquet(
    data: pd.DataFrame, cols: Sequence[str], loc: str, filename: str
) -> None:
    """Save a processed dataframe into a snappy-compressed parquet file.

    Every name in ``cols`` is declared as a string field in the Arrow schema
    used for the conversion, so the corresponding columns of ``data`` are
    expected to hold string values (e.g. after ``DataFrame.astype("str")``).

    The file is written to ``<loc>/<filename>.parquet``. The folder and file
    name are combined with ``os.path.join``, so ``loc`` may be given with or
    without a trailing path separator.

    Args:
        data (pd.DataFrame): Input Pandas DataFrame.
        cols (Sequence[str]): Column names; each becomes a string field of
            the Arrow schema.
        loc (str): Folder location. An empty string writes relative to the
            current working directory.
        filename (str): Filename, without the ``.parquet`` extension.
    """
    schema = pa.schema({name: pa.string() for name in cols})
    table = pa.Table.from_pandas(data, schema=schema)
    # Join instead of concatenating: "../data" and "../data/" must both end up
    # inside the folder rather than producing "../dataprocessed.parquet".
    target = os.path.join(loc, f"{filename}.parquet")
    pq.write_table(table, where=target, compression="snappy")

# Load Data And Pre-Process Columns

In [2]:
# Prepping dataframe

# Raw CSV headers -> short snake_case names used in the rest of the notebook
COLUMN_RENAMES = {
    "Product": "label",
    "Consumer complaint narrative": "narrative",
    "Issue": "issue",
    "Sub-issue": "sub_issue",
    "Company": "company",
    "Company public response": "company_response",
    "Tags": "tags",
    "State": "state",
    "Complaint ID": "id",
}

# Columns kept in the processed frame, in output order
OUTPUT_COLUMNS = [
    "id",
    "node_id",
    "label",
    "narrative",
    "issue",
    "sub_issue",
    "company",
    "company_response",
    "tags",
    "state",
    "narrative_na",
]

# Product names collapsed into coarser label groups; product names that are
# not listed here are left unchanged by `Series.replace`.
LABEL_GROUPS = {
    "Credit reporting, credit repair services, or other personal consumer reports": "credit_reporting",
    "Debt collection": "debt_collection",
    "Credit reporting": "credit_reporting",
    "Credit card or prepaid card": "credit_card",
    "Prepaid card": "credit_card",
    "Credit card": "credit_card",
    "Mortgage": "mortgages_and_loans",
    "Checking or savings account": "retail_banking",
    "Money transfer, virtual currency, or money service": "retail_banking",
    "Money transfers": "retail_banking",
    "Virtual currency": "retail_banking",
    "Bank account or service": "retail_banking",
    "Vehicle loan or lease": "mortgages_and_loans",
    "Payday loan, title loan, or personal loan": "mortgages_and_loans",
    "Consumer Loan": "mortgages_and_loans",
    "Payday loan": "mortgages_and_loans",
    "Student loan": "mortgages_and_loans",
}

df = pd.read_csv("../data/complaints.csv")

# Use same year as comparison
df = df[df["Date received"].str.contains("2020")]

# Build the processed frame as one non-mutating pipeline. Each step returns a
# new frame, so nothing is written into a slice of the filtered data, and the
# label mapping is assigned back instead of relying on
# `df["label"].replace(..., inplace=True)`, which under pandas Copy-on-Write
# acts on a temporary and leaves `df` untouched.
df = (
    df.rename(columns=COLUMN_RENAMES)
    # Fresh 0..n-1 index first, so node_id below is the row position
    .reset_index(drop=True)
    .assign(
        state=lambda frame: frame["state"].fillna("ZZ"),
        narrative_na=lambda frame: frame["narrative"].isna(),
        node_id=lambda frame: frame.index,
        label=lambda frame: frame["label"].replace(LABEL_GROUPS),
    )
    .loc[:, OUTPUT_COLUMNS]
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
# Build a single text column out of the narrative and its supporting fields
sequence_cols = [
    "narrative",
    "issue",
    "sub_issue",
    "company",
    "company_response",
    "tags",
]

# Missing values become empty strings so the concatenation never yields NaN
df.loc[:, sequence_cols] = df[sequence_cols].fillna("")

# Append the columns left to right, separated by a single space
combined = df[sequence_cols[0]]
for col_name in sequence_cols[1:]:
    combined = combined + " " + df[col_name]
df.loc[:, "sequence"] = combined

# Select the output columns and cast every one of them to str
output_cols = ["id", "node_id", "label", "sequence", "state", "narrative_na"]
data = df[output_cols].astype("str")

### Save to Parquet

In [None]:
# Write the processed frame `data` (all columns cast to str in the previous
# cell). The original referenced `processed_df`, a name never defined in this
# notebook, which raises NameError on a fresh kernel.
save_to_parquet(
    data=data,
    cols=data.columns.tolist(),
    loc="../data/",
    filename="processed",
)