In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from readability import Readability
import pickle

In [None]:
def load_df_with_embeddings(csv_path, emb_paths, emb_cols):
    """
    Read a CSV file and attach pickled embeddings to it as extra columns.

    Parameters:
    - csv_path (str): Path of the CSV file to read.
    - emb_paths (iterable of str): Paths of the pickle files, one per embedding column.
      Each unpickled object must expose ``.tolist()`` (e.g. a NumPy array); its list
      form is expected to hold one entry per CSV row.
    - emb_cols (iterable of str): Column names for the embeddings, paired
      positionally with emb_paths.

    Returns:
    - df (pd.DataFrame): The CSV contents plus one column per embedding file.

    Raises:
    - ValueError: If emb_paths and emb_cols do not have the same length.
    """
    # Materialize both so generators are accepted and their lengths can be compared.
    emb_paths = list(emb_paths)
    emb_cols = list(emb_cols)

    # zip() would silently stop at the shorter input and drop embedding columns.
    if len(emb_paths) != len(emb_cols):
        raise ValueError(
            "emb_paths and emb_cols should have the same length "
            "(got {} and {}).".format(len(emb_paths), len(emb_cols))
        )

    df = pd.read_csv(csv_path)

    for emb_path, emb_col in zip(emb_paths, emb_cols):
        # SECURITY: pickle.load can execute arbitrary code; only load files you trust.
        with open(emb_path, 'rb') as f:
            embeddings = pickle.load(f)
        # Store each embedding as a plain Python list in a single column.
        df[emb_col] = embeddings.tolist()

    return df

In [None]:
def compute_readability(text):
    """
    Compute the Flesch-Kincaid score of a text, best-effort.

    Parameters:
    - text: The text handed to ``Readability``.

    Returns:
    - fk_score: The ``score`` attribute of ``Readability(text).flesch_kincaid()``,
      or None when the scoring call raises an exception.
    """
    r = Readability(text)
    try:
        fk_score = r.flesch_kincaid().score
    except Exception:
        # Scoring is best-effort (presumably the library rejects texts that are too
        # short -- TODO confirm). Catch Exception rather than using a bare except so
        # KeyboardInterrupt / SystemExit still stop a long-running .apply().
        fk_score = None

    return fk_score

In [None]:
def equal_frequency_binning(data, num_bins):
    """
    Derive quantile-based intervals so that each one covers roughly the same
    number of data points.

    Parameters:
    - data (list or np.array): Values from which the interval edges are derived.
    - num_bins (int): How many intervals to produce.

    Returns:
    - bins (list of tuples): Consecutive (start, end) pairs; the end of one
      interval is the start of the next.
    """
    values = data if isinstance(data, np.ndarray) else np.array(data)
    ordered = np.sort(values)

    # num_bins intervals need num_bins + 1 edges, evenly spaced in percentile space.
    edges = []
    for pct in np.linspace(0, 100, num_bins + 1):
        edges.append(np.percentile(ordered, pct))

    # Pair every edge with its successor to form the (start, end) intervals.
    return list(zip(edges[:-1], edges[1:]))

In [None]:
def bin_data(data, bins, bin_names):
    """
    Map each continuous value to the name of the interval it falls in.

    Parameters:
    - data (list or np.array): Values to label.
    - bins (list of tuples): (start, end) intervals; a value belongs to an interval
      when start <= value < end. The end of the last interval is inclusive.
    - bin_names (list of str): Label for each interval, in the same order as bins.

    Returns:
    - categorical_data (list of str): One label per value; 'Unknown' for values
      that fall in no interval.

    Raises:
    - ValueError: If bins and bin_names differ in length.
    """
    values = data if isinstance(data, np.ndarray) else np.array(data)

    if len(bin_names) != len(bins):
        raise ValueError("Number of bins and bin names should be the same.")

    final_idx = len(bins) - 1

    def _label_for(value):
        # The first matching interval wins; intervals are half-open except the last.
        for idx, (low, high) in enumerate(bins):
            if low <= value < high:
                return bin_names[idx]
            if idx == final_idx and value == high:
                return bin_names[idx]
        return 'Unknown'

    return [_label_for(value) for value in values]


In [None]:
# Load the Dolly CSV and attach both pickled embedding sets as list-valued columns.
dolly_cqa_prepared_df = load_df_with_embeddings(
    csv_path='./data/dolly_cqa.csv',
    emb_paths=[
        './data/embeddings_dolly_mini_lm.pickle',
        './data/embeddings_dolly_bge_large.pickle',
    ],
    emb_cols=['embeddings_mini_lm', 'embeddings_bge_large'],
)

# Score every text. Texts that cannot be scored get None and are removed by the
# dropna() below, which also removes rows with a missing value in any other column.
dolly_cqa_prepared_df['fk_score'] = dolly_cqa_prepared_df['text'].apply(compute_readability)
dolly_cqa_prepared_df = dolly_cqa_prepared_df.dropna()

# Three equal-frequency bins over the scores, labelled from lowest to highest score.
bins = equal_frequency_binning(dolly_cqa_prepared_df['fk_score'], num_bins=3)
dolly_cqa_prepared_df['fk_score_categ'] = bin_data(
    dolly_cqa_prepared_df['fk_score'], bins, ['easy', 'medium', 'hard']
)

dolly_cqa_prepared_df.to_parquet('./data/dolly_cqa_prepared.parquet', index=False)

In [None]:
# Bare last expression: the notebook renders the prepared Dolly frame as this cell's output.
dolly_cqa_prepared_df

In [None]:
def prepare_race_dataset(csv_path, emb_paths, emb_cols):
    """
    Load one dataset file with its embeddings and add readability columns.

    Adds ``fk_score`` (the result of compute_readability on the ``text`` column)
    and ``fk_score_categ`` ('easy' / 'medium' / 'hard', from three equal-frequency
    bins over fk_score). Bin edges are computed from the rows of this file only,
    so the thresholds can differ between files.

    Parameters:
    - csv_path (str): CSV file to load; it must contain a ``text`` column.
    - emb_paths (list of str): Pickled embedding files, forwarded to load_df_with_embeddings.
    - emb_cols (list of str): Column names for the embeddings, forwarded likewise.

    Returns:
    - race_prepared_df (pd.DataFrame): The loaded rows that have no missing value
      in any column (rows whose score is None are dropped with them), plus the
      two readability columns.
    """
    difficulty_labels = ['easy', 'medium', 'hard']

    loaded = load_df_with_embeddings(csv_path, emb_paths, emb_cols)
    loaded['fk_score'] = loaded['text'].apply(compute_readability)

    # dropna() considers every column, not just fk_score.
    scored = loaded.dropna()

    terciles = equal_frequency_binning(scored['fk_score'], num_bins=3)
    scored['fk_score_categ'] = bin_data(scored['fk_score'], terciles, difficulty_labels)
    return scored

def _prepare_and_save_race_split(split):
    """
    Prepare one RACE split and write it next to its inputs as parquet.

    Reads ``./data/race_<split>.csv`` together with the two
    ``./data/embeddings_<split>_*.pickle`` files, runs prepare_race_dataset on
    them, and writes the result to ``./data/race_<split>_prepared.parquet``.

    Parameters:
    - split (str): Split name used in the file names ('train', 'validation', 'test').

    Returns:
    - prepared (pd.DataFrame): The frame that was written to parquet.
    """
    prepared = prepare_race_dataset(
        csv_path='./data/race_{}.csv'.format(split),
        emb_paths=[
            './data/embeddings_{}_mini_lm.pickle'.format(split),
            './data/embeddings_{}_bge_large.pickle'.format(split),
        ],
        emb_cols=['embeddings_mini_lm', 'embeddings_bge_large'],
    )
    prepared.to_parquet('./data/race_{}_prepared.parquet'.format(split), index=False)
    return prepared


# Each split is prepared and saved before the next one starts; the three frames
# stay bound to module-level names for later cells.
race_train_df = _prepare_and_save_race_split('train')
race_validation_df = _prepare_and_save_race_split('validation')
race_test_df = _prepare_and_save_race_split('test')

In [None]:
# Bare last expression: the notebook renders the prepared RACE training frame as this cell's output.
race_train_df