## Setup
From a large dataset covering the complete review history of all apps that have been listed at least once in the top-100 overall or category-specific rankings, we randomly sampled 4,000 reviews written in English.
To establish ground truth for the sample pool, a member of the research team and a research assistant independently coded the reviews using a coding scheme developed through inductive coding by our team. This scheme comprises nine content dimensions (detailed in Section 4.2.2). A subset of n=1,000 review descriptions was independently coded by both raters to assess inter-rater reliability (observed agreement = 0.89, macro-average κ = 0.77), and any discrepancies were discussed.
We generated two distinct random subsamples from the labeled pool of n=4,000 reviews: a validation set of n=1,000 reviews serving as a holdout, and a training set of n=2,000 reviews. Both subsamples were constructed to reflect a representative class distribution of the overall pool. The remaining n=1,000 reviews of the labeled pool support additional analyses that explore variations in sample sizes and class distributions.

#### Imports
 See `requirements.txt` for full dependency versions

In [None]:
import os
import json
import csv
import glob
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from collections import defaultdict

#### Global Paths, Directories and Variables

In [None]:
# Root of the demo study: the parent of the current working directory
DEMO_PATH = os.path.abspath(os.pardir)

# Folders used throughout the notebook
DATA_DIR = os.path.join(DEMO_PATH, 'training_validation_data')  # labeled pool and generated splits
LLM_API_FOLDER = os.path.join(DEMO_PATH, 'LLM_API')             # JSONL files for the LLM APIs

# Fixed seed so that every split can be reproduced
RANDOM_STATE = 94_032

In [None]:
# Helper function for class distribution
def print_dist(obj, name, classes=None):
    """
    Print absolute counts and percentage shares of the labels contained in `obj`.

    Supported inputs:
        - pd.Series: flat vector of labels (e.g., [0, 2, 2, 5, ...]); every distinct value is tallied
        - np.ndarray: binary 0/1 indicator matrix whose columns are summed per class;
          `classes` supplies the row names of the printed table and is mandatory here

    `name` is used as the prefix of the printed header line.
    Raises ValueError for any other input type, or when `classes` is missing for an ndarray.
    """
    is_matrix = isinstance(obj, np.ndarray)
    if not is_matrix and not isinstance(obj, pd.Series):
        raise ValueError(f"print_dist: unsupported type {type(obj)}")

    if is_matrix:
        # Multi-label case: one column per class, column sums are the class counts
        if classes is None:
            raise ValueError("print_dist: must pass classes for ndarray input")
        n_rows = obj.shape[0]
        per_class = obj.sum(axis=0)
        share = (per_class / n_rows * 100).round(2)
        table = pd.DataFrame({'count': per_class, 'pct': share}, index=classes)
        header = f"{name} distribution (n={n_rows} rows):"
    else:
        # Flat case: tally every distinct value, ordered by label
        n_labels = len(obj)
        tally = obj.value_counts().sort_index()
        share = (tally / n_labels * 100).round(2)
        table = pd.DataFrame({'count': tally, 'pct': share})
        header = f"{name} distribution (n={n_labels} labels):"

    print(header)
    print(table, "\n")

## Data Pool

We load the manually labeled dataset from the CSV file located in `DATA_DIR`. For each review, the dataset includes its title (`title`) and body (`body`), the associated IDs, and the assigned labels (`label`).

In [None]:
# Read the manually labeled pool that feeds the training/validation splits for fine-tuning and evaluation.
# Identifier and text columns are read as str so that pandas does not infer other dtypes for them.
df = pd.read_csv(
    os.path.join(DATA_DIR, "02_demo_reviews_all_labeled.csv"),
    sep=",",
    dtype=dict.fromkeys(['review_id', 'user_id', 'title', 'body', 'label', 'id', 'app_id'], str),
    low_memory=False,
)

# Whitespace-trimmed title and body, with missing values replaced by empty strings
title_clean = df["title"].fillna("").str.strip()
body_clean = df["body"].fillna("").str.strip()

# Plain merge for ML pipelines
df["review_text_plain"] = title_clean + " " + body_clean

# Tagged merge for LLMs
df["review_text_tagged"] = "Title: " + title_clean + "\nBody: " + body_clean


def _parse_labels(raw):
    """Split a ';'-separated label string into a sorted list of unique digit codes within 0-8."""
    tokens = (tok.strip() for tok in raw.split(";"))
    return sorted({tok for tok in tokens if tok.isdigit() and 0 <= int(tok) <= 8})


# Clean and prepare labels: list form for binarization, canonical ';'-joined form for combo counts
df["label"] = df["label"].str.strip()
df["split_labels"] = df["label"].apply(_parse_labels)
df["sorted_labels"] = df["split_labels"].apply(";".join)

# Binarize all labels (fixed class order "0".."8", one column per class)
mlb = MultiLabelBinarizer(classes=list(map(str, range(9))))
Y_all = mlb.fit_transform(df["split_labels"])
CLASSES = mlb.classes_

# Class distribution overview
# 1) every single label occurrence, pooled over all reviews
all_lbls = [code for codes in df["split_labels"] for code in codes]
print_dist(pd.Series(all_lbls), "Single-Label")

# 2) every distinct label combination
print_dist(df["sorted_labels"], "Multi-Label Combo")

## Validation and Training Data Splits
We generate random subsamples from the labeled sample pool: a validation set of n = 1,000 reviews serving as a holdout and training sets with varying sample sizes of n = 2,000; 1,000; 500; 250; 100 reviews. Subsamples were constructed to reflect both a class distribution similar to that of the overall pool (representative) and, when possible, an equally balanced class distribution.

In [None]:
# Sampling configuration (values chosen with the class distribution of the data pool in mind)
VAL_SIZE = 1_000                                # number of reviews held out for validation
MIN_PER_CLASS = 3                               # minimum labeled texts per class in a training sample
TRAINING_SIZES = [2_000, 1_000, 500, 250, 100]  # training set sizes to generate

### Validation Set Creation

We create a validation set of size `VAL_SIZE`. The remaining data forms the training pool.

In [None]:
# Row positions of the pool as a single placeholder feature column for the splitter
indices = np.arange(len(df)).reshape(-1, 1)

# One multi-label stratified split that holds out VAL_SIZE rows
msss_val = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=VAL_SIZE, random_state=RANDOM_STATE)
train_idx, val_idx = next(msss_val.split(indices, Y_all))

# Materialize both partitions: frames get a fresh index, label matrices are sliced with the same positions
df_val, df_train_pool = (df.iloc[pos].reset_index(drop=True) for pos in (val_idx, train_idx))
Y_val, Y_train_pool = Y_all[val_idx], Y_all[train_idx]

# Distribution of single labels in the validation set
all_lbls_val = [code for codes in df_val["split_labels"] for code in codes]
print_dist(pd.Series(all_lbls_val), "Validation Single‐Label")

# Distribution of label combinations in the validation set
print_dist(df_val["sorted_labels"], "Validation Multi‐Label Combo")

print(f"Training pool size (n={len(df_train_pool)})\n")

### Training Set Creation

For each size in `TRAINING_SIZES`, we draw two subsets from the training pool: one that mirrors the real-world (representative) class distribution and one with an approximately equal class distribution.

#### Sampling Functions

We define helper functions to sample training subsets either reflecting representative (real-world) distributions or balanced (equal) class distribution, with at least `MIN_PER_CLASS` samples per class.

In [None]:
def _check_sample_size(df_pool, size):
    """Raise ValueError when more rows are requested than the pool contains."""
    if size > len(df_pool):
        raise ValueError(
            f"cannot sample {size} rows from a pool of only {len(df_pool)} rows"
        )


def _count_labels(df_pool, rows, label_col, classes):
    """Return {label: occurrences} over the rows of `df_pool` named by the index labels in `rows`."""
    counts = {lbl: 0 for lbl in classes}
    for i in rows:
        for lbl in df_pool.at[i, label_col]:
            counts[lbl] += 1
    return counts


def _trim_to_size(df_pool, sel, counts, size, label_col, min_per_class, rng):
    """Remove rows from the set `sel` (in place) until it holds at most `size` rows."""
    need_rm = len(sel) - size
    if need_rm <= 0:
        return

    # 'Safe' rows: dropping the row alone keeps all of its labels at or above the minimum.
    # NOTE(review): safety is judged per row against the counts *before* any removal, so
    # dropping several rows that share a label can still push that label below
    # `min_per_class`. Left unchanged to keep the random draw sequence stable.
    removable = [
        i for i in sel
        if all(counts[lbl] - 1 >= min_per_class for lbl in df_pool.at[i, label_col])
    ]

    if len(removable) >= need_rm:
        to_remove = list(rng.choice(removable, need_rm, replace=False))
    else:
        # Not enough safe rows: drop all of them and take the rest from the other rows
        to_remove = list(removable)
        rest = list(sel - set(removable))
        to_remove.extend(rng.choice(rest, need_rm - len(removable), replace=False))

    for i in to_remove:
        sel.remove(i)


def _fill_to_size(df_pool, sel, size, rng):
    """Add randomly chosen, not yet selected rows to the set `sel` (in place) until it holds `size` rows."""
    need = size - len(sel)
    if need <= 0:
        return
    remaining = list(set(df_pool.index) - sel)
    for i in rng.choice(remaining, need, replace=False):
        sel.add(i)


def _shuffled_subset(df_pool, sel, random_state):
    """Return the selected rows of `df_pool` in shuffled order with a fresh 0..n-1 index."""
    return (
        df_pool.loc[list(sel)]
               .sample(frac=1, random_state=random_state)
               .reset_index(drop=True)
    )


def sample_real_world_clamp(df_pool,
                           size,
                           label_col='split_labels',
                           classes=tuple(str(i) for i in range(9)),
                           min_per_class=MIN_PER_CLASS,
                           random_state=RANDOM_STATE):
    """
    Sample `size` rows from a multi-label pool, roughly preserving label frequencies,
    while trying to make each label appear at least `min_per_class` times.

    Steps:
        1. Draw `size` rows with MultilabelStratifiedShuffleSplit.
        2. For every label that occurs fewer than `min_per_class` times, add randomly
           chosen unselected rows carrying that label (as many as the pool offers).
        3. If step 2 made the selection larger than `size`, trim it ('safe' rows first).
        4. If the selection is smaller than `size`, fill it up with random rows.

    Parameters:
        df_pool:       DataFrame whose `label_col` holds a list of labels per row
        size:          number of rows to return
        label_col:     name of the column with the label lists
        classes:       labels to stratify on and to clamp; every label found in the
                       selected rows must be contained in `classes` (KeyError otherwise)
        min_per_class: targeted minimum number of rows per label
        random_state:  seed for the splitter, the clamping draws and the final shuffle

    Returns a shuffled DataFrame with a fresh index. Raises ValueError if `size`
    exceeds the number of rows in `df_pool`.
    """
    _check_sample_size(df_pool, size)
    rng = np.random.RandomState(random_state)
    classes = list(classes)

    # Binary indicator matrix built from `classes` itself, so the function does not
    # depend on a binarizer fitted elsewhere
    Y = np.array(
        [[int(c in labels) for c in classes] for labels in df_pool[label_col]],
        dtype=int,
    )
    msss = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=size, random_state=random_state
    )
    _, idx = next(msss.split(np.zeros((len(df_pool), 1)), Y))

    # The splitter yields row positions; translate them to index labels because all
    # later lookups (.at / .loc) are label-based
    sel = set(df_pool.index[idx])

    # Track current label counts in selection
    counts = _count_labels(df_pool, sel, label_col, classes)

    # Top up labels that fall short of min_per_class
    for lbl in classes:
        short = min_per_class - counts[lbl]
        if short <= 0:
            continue
        has_lbl = df_pool[label_col].apply(lambda labels: lbl in labels)
        pool = [i for i in df_pool.index[has_lbl] if i not in sel]
        take = min(short, len(pool))
        if not take:
            # No unselected row carries this label; nothing can be added
            continue
        for i in rng.choice(pool, take, replace=False):
            sel.add(i)
            # An added row may carry several labels: count all of them so that later
            # shortage and removal checks see the real state of the selection
            for row_lbl in df_pool.at[i, label_col]:
                counts[row_lbl] += 1

    _trim_to_size(df_pool, sel, counts, size, label_col, min_per_class, rng)
    _fill_to_size(df_pool, sel, size, rng)
    return _shuffled_subset(df_pool, sel, random_state)


def sample_equal_clamp(df_pool,
                       size,
                       label_col='split_labels',
                       classes=CLASSES,
                       min_per_class=MIN_PER_CLASS,
                       random_state=RANDOM_STATE):
    """
    Sample `size` rows aiming at an equal label representation across `classes`.

    For every label, up to max(`min_per_class`, `size` // number of labels) rows carrying
    that label are drawn independently (fewer if the pool does not offer that many).
    The union of these draws is then trimmed ('safe' rows first) or filled up with
    random rows so that exactly `size` rows are returned.

    Parameters:
        df_pool:       DataFrame whose `label_col` holds a list of labels per row
        size:          number of rows to return
        label_col:     name of the column with the label lists
        classes:       labels to balance; every label found in the selected rows must
                       be contained in `classes` (KeyError otherwise)
        min_per_class: lower bound for the per-label draw size
        random_state:  seed for the per-label draws, trimming, filling and final shuffle

    Returns a shuffled DataFrame with a fresh index. Raises ValueError if `size`
    exceeds the number of rows in `df_pool`.
    """
    _check_sample_size(df_pool, size)
    rng = np.random.RandomState(random_state)
    sel = set()

    # Target per label (equal allocation)
    per_lbl = max(min_per_class, size // len(classes))

    # Sample independently for each label; rows drawn for several labels are kept once
    for lbl in classes:
        idxs = df_pool.index[df_pool[label_col].apply(lambda labels: lbl in labels)]
        k = min(per_lbl, len(idxs))
        if k:
            chosen = rng.choice(idxs, k, replace=False)
            sel |= set(chosen)

    # Track current label counts
    counts = _count_labels(df_pool, sel, label_col, classes)

    _trim_to_size(df_pool, sel, counts, size, label_col, min_per_class, rng)
    _fill_to_size(df_pool, sel, size, rng)
    return _shuffled_subset(df_pool, sel, random_state)

#### Generate Multi-Label Stratified Training Sets

In [None]:
# Containers keyed by training size; each value is a (DataFrame, multi-hot matrix) pair
d_real, d_equal = {}, {}

# Per training size, draw one representative and one balanced sample (both clamped to the per-class minimum)
for size in TRAINING_SIZES:
    real_df = sample_real_world_clamp(df_train_pool, size)
    eq_df = sample_equal_clamp(df_train_pool, size)

    y_real = mlb.transform(real_df["split_labels"])
    y_eq = mlb.transform(eq_df["split_labels"])

    d_real[size] = (real_df, y_real)
    d_equal[size] = (eq_df, y_eq)

    # Per-class counts of both samples
    print_dist(y_real, name="Real-world single-label", classes=CLASSES)
    print_dist(y_eq, name="Approx-equal single-label", classes=CLASSES)

#### Save Splits

We save the validation and training splits to `DATA_DIR`.

In [None]:
# Validation set: attach each row's multi-hot vector as a JSON string, then write to disk
df_val["multi_hot"] = [json.dumps(row_vec.tolist()) for row_vec in Y_val]
df_val.to_csv(
    os.path.join(DATA_DIR, "demo_product_reviews_validation_real_1000.csv"),
    index=False,
    quoting=csv.QUOTE_MINIMAL,
)

# Training sets: the representative ("real") ones first, then the balanced ("equal") ones.
# Each frame gets the same JSON-encoded multi-hot column before it is written.
for variant, splits in (("real", d_real), ("equal", d_equal)):
    for size, (df_tr, y_tr) in splits.items():
        df_tr["multi_hot"] = [json.dumps(vec.tolist()) for vec in y_tr]
        target = os.path.join(DATA_DIR, f"demo_product_reviews_train_{variant}_{size}.csv")
        df_tr.to_csv(target, index=False, quoting=csv.QUOTE_MINIMAL)

## Data Preparation Fine-Tuning
We prepare JSONL files for LLM APIs (OpenAI, Mistral) using the prompt from section 4.2.2 of the paper. The APIs of OpenAI and Mistral AI require inputs in .jsonl format. Files can be used for both APIs. Afterward, we validate each generated JSONL file for correct API use.

In [None]:
# Classification prompt placed in front of every review.
# The wording is kept exactly as-is because it is part of the model input;
# the tab after each category number is written as an explicit \t escape.
PROMPT_TEMPLATE = """ONLY provide a number (0-8) in response. Categorize the following app review text by assigning the most fitting category/categories out of the following nine categories.
If the text contains elements from multiple categories, provide the categories separated by ;

(0)\tUser Opinion without specific reports, issues, suggestions - i.e., review only about good/bad perception of the app but nothing else
(1)\tReports of bugs, errors, or bad quality issues - i.e., something does not work in the app or is of bad quality
(2)\tIssues of the app's monetization model - i.e., complaints or issues of how the app monetizes content
(3)\tSuggestions for new features or content or revival of removed features
(4)\tCustomer support issues - i.e., problems or complaints regarding customer support
(5)\tPerformance issues - i.e., the app needs to much space, is to slow or similar
(6)\tSecurity concerns - i.e., user is concerned about their data or privacy
(7)\tEthical concerns - i.e., user is concerned about practices in the app, fairness, discrimination
(8)\tCommunity related issues - user asks openly for help (no feedback at customer support)
"""

In [None]:
# Helper function to create jsonl files for datasets in DATA_DIR
def csv_to_jsonl(csv_path: str, jsonl_path: str, PROMPT_TEMPLATE: str) -> None:
    """
    Read a CSV, build prompts, and write out a JSONL file
    where each line is:
      {
        "messages": [
          {"role":"user",      "content": <prompt>},
          {"role":"assistant", "content": <label>}
        ]
      }

    The prompt is `PROMPT_TEMPLATE`, followed by "\\n\\nApp review text: " and the
    row's `review_text_tagged` value (empty string if missing). The assistant
    content is the row's `label` value converted with str().

    The parent folder of `jsonl_path` is created if it does not exist yet;
    an existing file at `jsonl_path` is overwritten.
    """
    # Read CSV (identifier and text columns as str)
    df_in = pd.read_csv(csv_path,
        sep=",",
        header=0,
        dtype={'review_id': str, 'user_id': str, 'title': str, 'body':str, 'review_text_tagged':str, 'label': str, 'id': str, 'app_id':str},
        low_memory=False
    )

    # Take the already tagged review text; missing texts become empty strings
    reviews = df_in['review_text_tagged'].fillna('')

    # Build one prompt per review
    prompts = PROMPT_TEMPLATE + "\n\nApp review text: " + reviews

    # Make sure the output folder exists before opening the file for writing
    out_dir = os.path.dirname(jsonl_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write one JSON record per line
    with open(jsonl_path, 'w', encoding='utf-8') as fout:
        for prompt, label in zip(prompts, df_in['label']):
            record = {
                "messages": [
                    {"role": "user",      "content": prompt},
                    {"role": "assistant", "content": str(label)}
                ]
            }
            fout.write(json.dumps(record) + '\n')

# Convert every training CSV found in DATA_DIR into a JSONL file with the same base name in LLM_API_FOLDER
pattern = os.path.join(DATA_DIR, "demo_product_reviews_train_*.csv")
for csv_path in glob.glob(pattern):
    csv_name = os.path.basename(csv_path)
    base = os.path.splitext(csv_name)[0]
    jsonl_name = base + ".jsonl"
    jsonl_path = os.path.join(LLM_API_FOLDER, jsonl_name)

    print(f"Converting {csv_name} → {jsonl_name}...")
    csv_to_jsonl(csv_path, jsonl_path, PROMPT_TEMPLATE)

print("All files processed.")

In [None]:
# Helper function to validate jsonl files for API usage
def check_jsonl_file(jsonl_path: str) -> None:
    """
    Load a JSONL file, print one example, and report any format errors.

    Blank lines in the file are skipped; a line that is not valid JSON still raises
    json.JSONDecodeError. Every example must be a dict with a non-empty "messages"
    list. Every message must be a dict that has "role" and "content", uses no keys
    other than "role", "content" and "name", has a role of "system", "user" or
    "assistant", and a non-empty string as content. Every example needs at least
    one assistant message. Violations are counted per error type and printed.
    """
    # Load all non-blank lines into Python objects
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    # Print header and one example
    print(f"Checking {os.path.basename(jsonl_path)}")
    print("Num examples:", len(dataset))
    if dataset:
        print("Example[0] messages:")
        first = dataset[0]
        # Only preview well-formed examples; malformed ones are reported below
        preview = first.get("messages", []) if isinstance(first, dict) else []
        if isinstance(preview, list):
            for msg in preview:
                print(f"  {msg}")

    # Initialize error counters
    format_errors = defaultdict(int)

    # Validate each example
    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages")
        if not isinstance(messages, list) or not messages:
            format_errors["missing_or_empty_messages_list"] += 1
            continue

        # Validate each message in the list
        for message in messages:
            # A message that is not a dict cannot be inspected any further
            if not isinstance(message, dict):
                format_errors["message_data_type"] += 1
                continue

            # Required keys: role, content
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            # No unexpected keys
            if any(k not in ("role", "content", "name") for k in message):
                format_errors["message_unrecognized_key"] += 1

            # Role must be one of the allowed set
            role = message.get("role")
            if role not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1

            # Content must be a nonempty string
            content = message.get("content")
            if not isinstance(content, str) or content.strip() == "":
                format_errors["missing_or_invalid_content"] += 1

        # Ensure there is at least one assistant response
        if not any(isinstance(m, dict) and m.get("role") == "assistant" for m in messages):
            format_errors["missing_assistant_message"] += 1

    # Print summary of any errors found
    if format_errors:
        print("Found format errors:")
        for err, count in format_errors.items():
            print(f"  {err}: {count}")
    else:
        print("No errors found.")

# Validate every file in LLM_API_FOLDER whose name ends with .jsonl (case-insensitive)
for filename in os.listdir(LLM_API_FOLDER):
    if not filename.lower().endswith('.jsonl'):
        continue
    check_jsonl_file(os.path.join(LLM_API_FOLDER, filename))