# Data Preprocessing & Sampling

In [1]:
import pandas as pd

# Assuming raw data `SPOTIFIY_REVIEWS.csv` is located under `data` folder
df = pd.read_csv("../data/SPOTIFY_REVIEWS.csv")

selected_columns = [
    "review_text", 
    "review_rating", 
    "review_likes", 
    "author_app_version", 
    "review_timestamp"
]

df = df[selected_columns]

In [2]:
import pandas as pd
import json

def get_top_rows_by_rating(df, n=100, sort_column='review_likes'):
    """
    Get top rows for each review rating, sorted by a specified column.

    Parameters:
    - df (pd.DataFrame): Input DataFrame.
    - n (int): Number of top rows to retrieve for each rating (default is 100).
    - sort_column (str): Column to sort the rows by (default is 'review_likes').

    Returns:
    - dict: Dictionary where keys are review ratings, and values are DataFrames.
    """
    # Ensure 'n' is a positive integer
    if not isinstance(n, int) or n <= 0:
        raise ValueError("'n' should be a positive integer greater than 0.")

    # Ensure 'sort_column' is a valid column in the DataFrame
    if sort_column not in df.columns:
        raise ValueError(f"'{sort_column}' is not a valid column in the DataFrame.")

    # Create a dictionary to store DataFrames for each rating
    result_dict = {}
    df_combined = pd.DataFrame()

    # Iterate over the review ratings (5 to 1)
    for rating in range(5, 0, -1):
        # Filter rows where "review_rating" equals the current rating
        rating_rows = df[df['review_rating'] == rating]

        # Sort the filtered rows by specified column and take the top 'n' rows
        top_rows = rating_rows.sort_values(by=sort_column, ascending=False).head(n)

        df_combined = df_combined.append(top_rows)

    return df_combined

In [4]:
# Get DataFrames by rating with top 20 rows and sorting by 'review_likes'
most_liked_reviews = get_top_rows_by_rating(df=df, n=20, sort_column='review_likes')
json_data = most_liked_reviews.to_dict(orient='records')

# Convert DataFrames to JSON
with open('../data/preprocessed/sampled_reviews.jsonl', 'w') as outfile:
    for entry in json_data:
        json.dump(entry, outfile)
        outfile.write('\n')

  df_combined = df_combined.append(top_rows)
  df_combined = df_combined.append(top_rows)
  df_combined = df_combined.append(top_rows)
  df_combined = df_combined.append(top_rows)
  df_combined = df_combined.append(top_rows)
