# Preprocessing

In [None]:
from preprocessing import *

### Define vars

In [None]:
# Passed to get_all_data and add_inferences, where it is also embedded in the cache file names.
# Presumably the number of rows sampled per category — TODO confirm against get_category_data.
sample_size = 100
# NOTE(review): seed is not referenced by any cell visible in this section — confirm where it is consumed.
seed = 42

# Load data from either cached local files or remote files

In [None]:
def get_all_data(sample_size):
    """Return the data of every category as one DataFrame, backed by a CSV cache.

    If ``dataset_path / f"cache_all_{sample_size}.csv"`` exists it is read and
    returned directly. Otherwise every category returned by
    ``get_all_categories()`` is loaded through
    ``get_category_data(category, sample_size)``, the pieces are concatenated
    with a fresh 0..n-1 index, and the result is written to the cache file
    (without the index column) before being returned.

    Args:
        sample_size: forwarded to ``get_category_data`` and embedded in the
            cache file name, so each value gets its own cache.

    Returns:
        pandas.DataFrame with the rows of all categories (an empty DataFrame
        when there are no categories).
    """
    cachepath = dataset_path / f"cache_all_{sample_size}.csv"

    if cachepath.exists():
        # cache hit
        data = pd.read_csv(cachepath)
    else:
        # cache miss: collect the per-category frames and concatenate ONCE.
        # Growing a DataFrame with pd.concat inside the loop re-copies all
        # previously loaded rows on every iteration (quadratic work).
        frames = []
        loaded_bytes = 0
        categories = get_all_categories()
        for category in tqdm(categories, desc="loading all data", ncols=100):
            category_data = get_category_data(category, sample_size)
            frames.append(category_data)

            category_bytes = category_data.memory_usage(deep=True).sum()
            # Running sum of the per-category sizes; close to, but not exactly,
            # the size of the final concatenated frame. The exact figure is
            # printed once at the end.
            loaded_bytes += category_bytes
            tqdm.write(
                f"loaded {category} - category size: {category_bytes / 1e9:.2f} gb, total size: {loaded_bytes / 1e9:.2f} gb")

        # pd.concat raises on an empty list, so keep the empty-frame result
        # for the "no categories" case.
        data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        data.to_csv(cachepath, index=False)

    print(f"total data size: {data.memory_usage(deep=True).sum() / 1e9:.2f} gb")
    return data

In [None]:
# Load the combined data: read from the CSV cache when it exists, otherwise built and cached.
df = get_all_data(sample_size=sample_size)

Preprocess the data:

- Remove unnecessary columns.
- Clean up string columns (strip HTML tags and surrounding whitespace).
- Remove rows with NaN values or an empty title/text.


In [None]:
def preprocess(df):
    """Return a cleaned copy of the review frame; the input is left untouched.

    Steps, in order:
      1. Drop the ``images``, ``asin``, ``parent_asin`` and ``user_id``
         columns (those that are absent are ignored).
      2. Convert ``timestamp`` from epoch milliseconds to datetime.
      3. Drop rows with a missing ``text``, ``title`` or ``rating``.
      4. Strip ``<...>`` tags and surrounding whitespace from ``text`` and
         ``title``.
      5. Drop rows whose ``text`` or ``title`` is empty after cleaning.

    The surviving rows keep their original index labels.
    """
    tag_pattern = r"<.*?>"  # drop html tags

    def _clean(column):
        # Remove tags first, then trim whitespace left at either end.
        return column.str.replace(tag_pattern, "", regex=True).str.strip()

    cleaned = (
        df.drop(columns=["images", "asin", "parent_asin", "user_id"], errors="ignore")
        .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], unit="ms"))
        .dropna(subset=["text", "title", "rating"])
        .assign(
            text=lambda frame: _clean(frame["text"]),
            title=lambda frame: _clean(frame["title"]),
        )
    )

    has_text = cleaned["text"].str.len() > 0
    has_title = cleaned["title"].str.len() > 0
    return cleaned[has_text & has_title]


In [None]:
# Rebinds df to the cleaned copy; after this cell the raw frame is no longer reachable under this name.
df = preprocess(df)

In [None]:
def add_inferences(df, sample_size):
    """Return ``df`` with model-inferred columns appended, caching the inferences as CSV.

    For every row the review string ``"<title>: <text>"`` is passed to the
    inference helpers. The outcome is appended as the columns ``language``,
    ``sentiment``, ``sentiment_score``, ``subjectivity_score``, ``aspects``
    and ``predicted_rating`` (the helper's ``rating`` renamed so it does not
    collide with an existing ``rating`` column).

    The per-row results are cached in
    ``dataset_path / f"results_n{sample_size}.csv"`` as a single column named
    ``"0"`` holding the text form of each result dict. When that file exists
    the inferences are read from it instead of being recomputed.

    Args:
        df: DataFrame with at least ``title`` and ``text`` columns.
        sample_size: only used to build the cache file name.

    Returns:
        A new DataFrame: the columns of ``df`` followed by the inferred
        columns, with the same index as ``df``. ``df`` itself is not modified.

    Raises:
        ValueError: if the cache file holds a different number of rows than
            ``df`` (the cache was produced from different data).
    """
    results_path = dataset_path / f"results_n{sample_size}.csv"
    print(f"results_path: {results_path}")

    if results_path.exists():
        cached = pd.read_csv(results_path)
        # NOTE(security): eval() executes whatever the cache file contains.
        # It is kept so cache files written by earlier runs stay readable;
        # only use cache files you produced yourself. If every stored value is
        # a plain Python literal, ast.literal_eval is a safe replacement.
        records = [eval(cell) for cell in cached["0"]]
    else:
        tqdm.pandas()  # registers DataFrame.progress_apply

        def process_row(row):
            review = f"{row['title']}: {row['text']}"
            sentiment, score = get_sentiment(review)
            return {
                "language": get_language(review),
                "sentiment": sentiment,
                "sentiment_score": score,
                "subjectivity_score": get_subjectivity(review),
                "aspects": get_aspects(review),
                "rating": get_rating(review),
            }

        # Series holding one result dict per row. Written as an unnamed Series
        # so the file keeps its single "0" column, which the cache-hit branch
        # reads.
        inferred = df.progress_apply(process_row, axis=1)
        inferred.to_csv(results_path, index=False)
        # Use the in-memory dicts directly. The previous code indexed this
        # Series with "0", which only exists after a CSV round trip, so the
        # first (cache-miss) run raised KeyError.
        records = inferred.tolist()

    if len(records) != len(df):
        raise ValueError(
            f"{results_path} holds {len(records)} rows but df has {len(df)}; "
            "delete the stale cache file to recompute the inferences"
        )

    # Cached results come back with a fresh 0..n-1 index, while df usually has
    # gaps after row filtering. Attach the results by position onto df's index
    # so the column-wise concat cannot misalign rows or introduce NaN rows.
    results = pd.DataFrame(records, index=df.index)
    results = results.rename(columns={"rating": "predicted_rating"})
    return pd.concat([df, results], axis=1)

In [None]:
# Append the inferred columns (cached on disk per sample_size) and preview the first rows.
df = add_inferences(df, sample_size=sample_size)
df.head()

In [None]:

# Save the frame (reviews plus inferred columns) as CSV, without the index column.
# NOTE(review): data_path is not defined in the visible cells and differs from the
# dataset_path used for the caches — presumably provided by the star import; confirm.
df.to_csv(data_path / "data.csv", index=False)
print("saved data")