In [1]:
from tqdm import tqdm

import json 
# import pandas as pd
# import polars as pl
import unicodedata

import polars as pl
import torch
# import torch.nn as nn
# import torch.optim as optim
from torch.utils.data import Dataset # , DataLoader
import numpy as np
# from sentence_transformers import SentenceTransformer

In [2]:
# Relative location of the raw Yelp dataset files. The trailing slash is required
# because file names are appended to this value by plain string concatenation.
path = "../data/0_raw/yelp-dataset/versions/4/"

In [3]:
# Dataset files referenced by this notebook, built from the dataset root `path`.
path_reviews = f"{path}yelp_academic_dataset_review.json"
path_users = f"{path}yelp_academic_dataset_user.json"

# Other files of the same dataset, left disabled:
# path_businesses = path+"yelp_academic_dataset_business.json"
# path_tips = path+"yelp_academic_dataset_tip.json"
# path_checkins = path+"yelp_academic_dataset_checkin.json"

In [4]:
def load_json_to_dataframe(json_path, max_to_import=999_999_999) -> "pl.DataFrame":
    """
    Load a line-delimited JSON file into a Polars DataFrame.

    Each line is Unicode-normalized (NFKC) and stripped of non-breaking-space
    variants, then parsed as one JSON value. Lines that cannot be parsed are
    skipped and counted; they never contribute a row to the result.

    Parameters
    ----------
    json_path : str or path-like
        File containing one JSON object per line, read as UTF-8.
    max_to_import : int, optional
        Reading stops as soon as this many records have been parsed
        successfully.

    Returns
    -------
    pl.DataFrame
        One row per parsed record. In every string column, all occurrences of
        the NBSP character, the HTML entity "&nbsp;" and the literal text
        "NBSP" are replaced by a regular space.
    """
    # "\u00a0" and "\xa0" denote the same character in Python, so a single
    # entry covers both spellings used previously.
    nbsp_variants = ("\u00a0", "&nbsp;", "NBSP")

    def clean_line(line):
        # Normalize unicode to ensure consistency in whitespace representations.
        # NOTE: this runs on the raw JSON text. NFKC also folds compatibility
        # characters (e.g. fullwidth quotation marks) into ASCII, which may make
        # a line unparseable; such lines are counted as errors below.
        line = unicodedata.normalize("NFKC", line)
        for variant in nbsp_variants:
            line = line.replace(variant, " ")
        return line

    # Read and clean each line before parsing
    data = []
    errors = 0
    with open(json_path, 'r', encoding='utf-8') as data_file:
        for line in tqdm(data_file, desc="Processing lines"):
            try:
                record = json.loads(clean_line(line))
            except json.JSONDecodeError:
                # Skip the bad line entirely. Without the `continue`, the
                # previously parsed record would be appended a second time
                # (or a NameError raised if the very first line is invalid).
                errors += 1
                continue
            data.append(record)
            if len(data) >= max_to_import:
                break

    df = pl.DataFrame(data)
    print(f"Loaded: {df.shape[0]:,} rows, {df.shape[1]:,} columns. Excluded {errors} many errors")

    # Additional safety pass at DataFrame level: NBSPs written as JSON escape
    # sequences only become real characters after parsing, so the line-level
    # cleanup above cannot see them.
    # Only string columns are touched; the `.str` namespace is not available on
    # other dtypes.
    string_cols = [c for c, dt in zip(df.columns, df.dtypes) if dt == pl.Utf8]
    if string_cols:
        cleaned = pl.col(string_cols)
        for variant in nbsp_variants:
            # replace_all + literal=True: replace every occurrence (str.replace
            # stops after the first match) and never interpret the pattern as
            # a regular expression.
            cleaned = cleaned.str.replace_all(variant, " ", literal=True)
        df = df.with_columns(cleaned)

    # Double check for NBSP characters in the text field (if it exists).
    # Only the first few rows are inspected, so this is a spot check.
    if "text" in df.columns:
        sample_text = df["text"].head().to_list()

        nbsp_found = any("\u00a0" in t or "&nbsp;" in t for t in sample_text if isinstance(t, str))

        if nbsp_found:
            print("Warning: NBSP characters found in sample after cleanup!")
        else:
            print("No NBSP found in sample text after cleanup.")

    return df

In [5]:
# Load the review file, capping the import at 1,000,000 records.
reviews = load_json_to_dataframe(path_reviews, max_to_import=1_000_000)

Processing lines: 999999it [00:03, 293769.47it/s]


Loaded: 1,000,000 rows, 9 columns. Excluded 0 many errors
No NBSP found in sample text after cleanup.


In [6]:
# Keep only the review text and its star rating, storing the rating as Int8.
df = (
    reviews
    .select(["text", "stars"])
    .with_columns(pl.col("stars").cast(pl.Int8))
)

In [7]:
# Randomly assign every review to the train, test or validation set and write
# each set to its own CSV file.
train_set_proportion = 0.85
test_set_proportion = 0.149
validation_set_proportion = 0.001
n = df.shape[0]

# Fixed seed so that restarting the kernel and re-running the notebook produces
# the same split (and therefore the same CSV files) every time.
split_seed = 42
split_rng = np.random.default_rng(split_seed)

# Probability of a row being assigned to each split, in label order.
split_probabilities = [train_set_proportion, test_set_proportion, validation_set_proportion]

# Draw one label per row. Rows are assigned independently, so the resulting
# set sizes only approximate the requested proportions.
train_test_validation_split = split_rng.choice(
    [1, 2, 3],  # 1: train, 2: test, 3: validation
    size=n,
    p=split_probabilities,
)

# Add split column to dataframe
df = df.with_columns(pl.Series(train_test_validation_split).alias("split"))

# Create separate datasets (the helper "split" column is dropped again)
df_train = df.filter(pl.col("split") == 1).select(pl.col("text", "stars"))
df_test = df.filter(pl.col("split") == 2).select(pl.col("text", "stars"))
df_validation = df.filter(pl.col("split") == 3).select(pl.col("text", "stars"))

# Save datasets to CSV
df_train.write_csv("../data/1_train_test_split/df_train.csv")
df_test.write_csv("../data/1_train_test_split/df_test.csv")
df_validation.write_csv("../data/1_train_test_split/df_validation.csv")
