In [1]:
import json
import os
import unicodedata

import polars as pl

In [3]:
# Base directory of the Yelp dataset files. Defaults to the original local
# kagglehub cache location; set the YELP_DATASET_DIR environment variable to
# point somewhere else without editing the notebook.
# os.path.join(..., "") guarantees a trailing separator, because the file
# names are appended to this value by plain string concatenation.
path = os.path.join(
    os.environ.get(
        "YELP_DATASET_DIR",
        "/home/richardarcher/.cache/kagglehub/datasets/yelp-dataset/yelp-dataset/versions/4/",
    ),
    "",
)

In [4]:
# Dataset files used in this notebook, resolved against the base directory `path`.
path_reviews = f"{path}yelp_academic_dataset_review.json"
path_users = f"{path}yelp_academic_dataset_user.json"

# Other dataset files, currently disabled:
# path_businesses = f"{path}yelp_academic_dataset_business.json"
# path_tips = f"{path}yelp_academic_dataset_tip.json"
# path_checkins = f"{path}yelp_academic_dataset_checkin.json"

In [5]:
def load_json_to_dataframe(json_path, max_to_import=999_999_999):
    """
    Load a line-delimited JSON file into a Polars DataFrame.

    Every line is parsed as one JSON object. After parsing, each string in the
    record (keys and values, including strings nested in dicts and lists) is
    NFKC-normalized, and non-breaking spaces, the HTML entity ``&nbsp;`` and
    the literal text ``NBSP`` are replaced by a regular space.

    Cleaning is done on the parsed strings rather than on the raw line:
    NFKC folds characters such as the fullwidth quotation mark into their
    ASCII counterparts, which would corrupt the JSON syntax if applied before
    parsing.

    Args:
        json_path: Path to a UTF-8 encoded file with one JSON object per line.
        max_to_import: Reading stops once this many rows have been collected.

    Returns:
        A ``pl.DataFrame`` with one row per successfully parsed line. Lines
        that are not valid JSON are skipped and counted in the printed summary.
    """
    # "\xa0" and "\u00a0" are the same character in Python, so one entry covers both.
    nbsp_tokens = ("\u00a0", "&nbsp;", "NBSP")

    def clean_text(text):
        # Normalize unicode to ensure consistency in whitespace representations,
        # then replace the remaining NBSP spellings with a normal space.
        text = unicodedata.normalize("NFKC", text)
        for token in nbsp_tokens:
            text = text.replace(token, " ")
        return text

    def clean_value(value):
        # Walk the parsed JSON structure and clean every string found in it.
        if isinstance(value, str):
            return clean_text(value)
        if isinstance(value, dict):
            return {clean_value(k): clean_value(v) for k, v in value.items()}
        if isinstance(value, list):
            return [clean_value(item) for item in value]
        return value

    # Parse each line, then clean the resulting record
    data = []
    errors = 0
    with open(json_path, 'r', encoding='utf-8') as data_file:
        for line in data_file:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Skip the malformed line; without `continue` the previous
                # record would be appended a second time.
                errors += 1
                continue
            data.append(clean_value(record))
            if len(data) >= max_to_import:
                break

    df = pl.DataFrame(data)
    print(f"Loaded: {df.shape[0]:,} rows, {df.shape[1]:,} columns. Excluded {errors} many errors")

    # Additional safety check at DataFrame level, in case something slipped through.
    # Only string columns are touched: `.str` expressions are not valid on Object columns.
    string_cols = [c for c, dt in zip(df.columns, df.dtypes) if dt == pl.Utf8]
    if string_cols:
        # replace_all + literal=True: `str.replace` would treat the pattern as a
        # regex and only substitute the first occurrence in each value.
        df = df.with_columns(
            pl.col(string_cols)
            .str.replace_all("\u00a0", " ", literal=True)
            .str.replace_all("&nbsp;", " ", literal=True)
            .str.replace_all("NBSP", " ", literal=True)
        )

    # Double check for NBSP characters in the text field (if it exists).
    # Only the first few rows are inspected, so this is a spot check.
    if "text" in df.columns:
        sample_text = df["text"].head().to_list()

        nbsp_found = any("\u00a0" in t or "&nbsp;" in t for t in sample_text if isinstance(t, str))

        if nbsp_found:
            print("Warning: NBSP characters found in sample after cleanup!")
        else:
            print("No NBSP found in sample text after cleanup.")

    return df

In [6]:
# Load up to 1,000,000 review rows; the other dataset files are left disabled.
reviews = load_json_to_dataframe(path_reviews, 1_000_000)
# businesses = load_json_to_dataframe(path_businesses, 9_999)
# tips = load_json_to_dataframe(path_tips, 9_999)
# checkins = load_json_to_dataframe(path_checkins, 9_999)

Loaded: 1,000,000 rows, 9 columns. Excluded 0 many errors
No NBSP found in sample text after cleanup.


In [7]:
# Preview the first rows of the loaded reviews frame.
reviews.head()

review_id,user_id,business_id,stars,useful,funny,cool,text,date
str,str,str,f64,i64,i64,i64,str,str
"""KU_O5udG6zpxOg-VcAEodg""","""mh_-eMZ6K5RLWhZyISBhwA""","""XQfwVwDr-v0ZS3_CbbE5Xw""",3.0,0,0,0,"""If you decide to eat here, jus…","""2018-07-07 22:09:11"""
"""BiTunyQ73aT9WBnpR9DZGw""","""OyoGAe7OKpv6SyGZT5g77Q""","""7ATYjTIgM3jUlt4UM3IypQ""",5.0,1,0,1,"""I've taken a lot of spin class…","""2012-01-03 15:28:18"""
"""saUsX_uimxRlCVr67Z4Jig""","""8g_iMtfSiwikVnbP2etR0A""","""YjUWPpI6HXG530lwP-fb2A""",3.0,0,0,0,"""Family diner. Had the buffet. …","""2014-02-05 20:30:30"""
"""AqPFMleE6RsU23_auESxiA""","""_7bHUi9Uuf5__HHc_Q8guQ""","""kxX2SOes4o-D3ZQBkiMRfA""",5.0,1,0,1,"""Wow! Yummy, different, delic…","""2015-01-04 00:01:03"""
"""Sx8TMOWLNuJBWer-0pcmoA""","""bcjbaE6dDog4jkNY91ncLQ""","""e4Vwtrqf-wpJfwesgvdgxQ""",4.0,1,0,1,"""Cute interior and owner (?) ga…","""2017-01-14 20:54:15"""


In [None]:
# Load up to 9,999 user rows from the users file.
users = load_json_to_dataframe(path_users, 9_999)

In [None]:
# businesses.head(2)

In [None]:
# tips.head(2)

In [None]:
# checkins.head(2)

In [None]:
# Preview the first two rows of the users frame.
users.head(2)

## Prediction with embeddings


In [8]:
# Keep only the review text and its rating, storing the rating as Int64.
df = reviews.select(
    pl.col("text"),
    pl.col("stars").cast(pl.Int64),
)

In [9]:
# Preview the two-column (text, stars) modelling frame.
df.head()

text,stars
str,i64
"""If you decide to eat here, jus…",3
"""I've taken a lot of spin class…",5
"""Family diner. Had the buffet. …",3
"""Wow! Yummy, different, delic…",5
"""Cute interior and owner (?) ga…",4


In [None]:
# unicode sanity checks

# df["text"][3]

# for idx, val in enumerate(df["text"].head().to_list()):
#     if "\u00a0" in val or "&nbsp;" in val:
#         print(f"NBSP still found in row {idx}: {repr(val)}")
#     else:
#         print(f"No NBSP in row {idx}.")

# text_value = df["text"][3]
# for idx, char in enumerate(text_value):
#     print(idx, char, ord(char), f"\\u{ord(char):04x}")

In [10]:
# Number of reviews per star rating (class balance check).
df.group_by("stars").len()

stars,len
i64,u32
2,77912
4,221897
3,102954
1,138625
5,458612


Batches: 100%|██████████| 31250/31250 [03:33<00:00, 146.18it/s]


In [None]:
# Wrap the datasets in mini-batch loaders of 32 samples each.
# The training loader shuffles its samples; the test loader does not.
# NOTE(review): train_dataset / test_dataset are built in a cell not shown here.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# model def 
class SimpleRegressor(nn.Module):
    """
    Two-layer feed-forward network mapping a feature vector to class logits.

    Despite the name, the network emits one unnormalized score (logit) per
    class rather than a single regression value; softmax is applied by the
    caller.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output logits. Defaults to 5, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim=128, num_classes=5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)  # one logit per class

    def forward(self, x):
        """Return the raw logits for ``x`` (no softmax applied)."""
        return self.fc2(self.relu(self.fc1(x)))

# Size the input layer from the second axis of `embeddings`.
input_dim = embeddings.shape[1]

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleRegressor(input_dim).to(device)

In [None]:
# loss
def emd_loss(y_pred, y_true):
    """
    Earth mover's distance between predicted and target class distributions.

    Args:
        y_pred: (batch, n_classes) logits; softmax is applied here.
        y_true: (batch, n_classes) target distribution, e.g. one-hot rows.

    Returns:
        Scalar tensor: the L1 distance between the two cumulative
        distributions, summed over classes and averaged over the batch.
    """
    predicted_cdf = y_pred.softmax(dim=1).cumsum(dim=1)
    target_cdf = y_true.cumsum(dim=1)
    per_sample = (predicted_cdf - target_cdf).abs().sum(dim=1)
    return per_sample.mean()

In [None]:
# # train
# optimizer = optim.Adam(model.parameters(), lr=1e-3)
# num_epochs = 5  # for demonstration, adjust as needed
# 
# for epoch in range(num_epochs):
#     model.train()
#     train_loss = 0.0
#     for batch_x, batch_y in train_loader:
#         optimizer.zero_grad()
#         pred = model(batch_x)
#         loss = emd_loss(pred, batch_y)
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item() * len(batch_x)
#     train_loss /= len(train_loader.dataset)
#     
#     # Evaluate on test set
#     model.eval()
#     test_loss = 0.0
#     with torch.no_grad():
#         for batch_x, batch_y in test_loader:
#             pred = model(batch_x)
#             loss = emd_loss(pred, batch_y)
#             test_loss += loss.item() * len(batch_x)
#     test_loss /= len(test_loader.dataset)
#     
#     print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f}")

In [None]:
# train
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 5  # for demonstration, adjust as needed

# Run every batch on whichever device the model's weights live on, so the
# training and evaluation loops can never disagree with the model.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        pred = model(batch_x)
        loss = emd_loss(pred, batch_y)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean
        train_loss += loss.item() * len(batch_x)
    train_loss /= len(train_loader.dataset)

    # ---- evaluation pass ----
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            # Test batches must be moved as well; the original loop fed them
            # to the model without the device transfer done during training.
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            pred = model(batch_x)
            loss = emd_loss(pred, batch_y)
            test_loss += loss.item() * len(batch_x)

            # Hit rate: softmax is monotonic, so the argmax of the logits is
            # already the most probable class.
            y_pred_classes = torch.argmax(pred, dim=1)
            y_true_classes = torch.argmax(batch_y, dim=1)

            correct += (y_pred_classes == y_true_classes).sum().item()
            total += len(batch_x)

    test_loss /= len(test_loader.dataset)
    hit_rate = correct / total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f} | Hit Rate: {hit_rate:.4f}")

In [None]:
# eval
new_texts = ["This product was great!", "Not what I expected..."]
new_embeddings = embedder.encode(new_texts, convert_to_numpy=True)

# Build the input tensor on the model's own device; a CPU tensor fed to a
# model that was moved to the GPU raises a device-mismatch error.
device = next(model.parameters()).device
new_embeddings_t = torch.tensor(new_embeddings, dtype=torch.float32, device=device)

model.eval()  # inference mode, independent of what ran before this cell
with torch.no_grad():
    logits = model(new_embeddings_t)
    probs = torch.softmax(logits, dim=1)
    # argmax yields a 0-based class index; +1 turns it into a star rating.
    # NOTE(review): presumes class k encodes k+1 stars - confirm against the label encoding.
    predicted_stars = torch.argmax(probs, dim=1) + 1
    print("Predicted stars:", predicted_stars.tolist())