In [6]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from readability import Readability
import pickle

In [7]:
def load_df_with_embeddings(csv_path, emb_paths, emb_cols):
    """
    Read a CSV file and attach one list-valued embedding column per pickle file.

    Parameters:
    - csv_path (str): Path of the CSV file to read.
    - emb_paths (list of str): Pickle files holding the embeddings, one file per column.
    - emb_cols (list of str): Column name to use for the matching entry of emb_paths.

    Returns:
    - df (pd.DataFrame): The CSV contents plus one column per embedding file.

    Raises:
    - ValueError: If emb_paths and emb_cols do not have the same length.
    """
    emb_paths = list(emb_paths)
    emb_cols = list(emb_cols)

    # zip() would silently ignore the surplus entries, leaving columns missing
    # without any error, so a mismatch is rejected up front.
    if len(emb_paths) != len(emb_cols):
        raise ValueError(
            f"emb_paths has {len(emb_paths)} entries but emb_cols has {len(emb_cols)}; "
            "they must pair up one-to-one."
        )

    df = pd.read_csv(csv_path)

    for emb_path, emb_col in zip(emb_paths, emb_cols):
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only point this at embedding files you produced yourself.
        with open(emb_path, 'rb') as f:
            embeddings = pickle.load(f)

        # Array-like objects are converted with tolist(); embeddings that were
        # pickled as plain Python sequences are accepted as they are.
        if hasattr(embeddings, 'tolist'):
            rows = embeddings.tolist()
        else:
            rows = list(embeddings)

        df[emb_col] = rows

    return df

In [8]:
def compute_readability(text):
    """
    Return the Flesch-Kincaid score of a text, or None when scoring fails.

    Parameters:
    - text (str): The text to score.

    Returns:
    - fk_score (float or None): The `score` attribute of the result of
      `Readability(text).flesch_kincaid()`, or None if that call raises.
      (Presumably the library rejects texts that are too short - confirm
      against the readability package documentation.)
    """
    # Construction stays outside the try block on purpose: only failures of the
    # scoring call itself are treated as "no score available".
    r = Readability(text)
    try:
        return r.flesch_kincaid().score
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long DataFrame.apply run.
        return None

In [9]:
def equal_frequency_binning(data, num_bins):
    """
    Bucketize the data into bins with approximately equal number of data points.

    The bin edges are the percentiles of the data taken at num_bins + 1 evenly
    spaced positions between 0 and 100, so the first bin starts at the minimum
    and the last bin ends at the maximum.

    Parameters:
    - data (list or np.array): The data to be bucketized.
    - num_bins (int): Number of bins desired; must be at least 1.

    Returns:
    - bins (list of tuples): List of (start, end) intervals representing the bins.

    Raises:
    - ValueError: If num_bins is smaller than 1 or data is empty.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer.")

    values = np.asarray(data)
    if values.size == 0:
        raise ValueError("data must contain at least one value.")

    # A single vectorised call computes every edge; np.percentile does not need
    # pre-sorted input, so no explicit sort is performed.
    bin_edges = np.percentile(values, np.linspace(0, 100, num_bins + 1))

    # Pair each edge with its successor: (e0, e1), (e1, e2), ...
    return list(zip(bin_edges[:-1], bin_edges[1:]))

In [5]:
def bin_data(data, bins, bin_names):
    """
    Map every value in `data` to the name of the bin that contains it.

    A bin (start, end) contains a value when start <= value < end. The last bin
    additionally contains its own upper edge. The first matching bin wins, and a
    value that matches no bin is labelled 'Unknown'.

    Parameters:
    - data (list or np.array): The values to categorize.
    - bins (list of tuples): (start, end) intervals, one per category.
    - bin_names (list of str): Label for each interval, in the same order as bins.

    Returns:
    - categorical_data (list of str): One label per input value, in input order.

    Raises:
    - ValueError: If bins and bin_names have different lengths.
    """
    values = data if isinstance(data, np.ndarray) else np.array(data)

    if len(bins) != len(bin_names):
        raise ValueError("Number of bins and bin names should be the same.")

    final_idx = len(bins) - 1

    def _label_for(point):
        # Scan the bins in order and stop at the first one that claims the point.
        for idx, (lower, upper) in enumerate(bins):
            if lower <= point < upper:
                return bin_names[idx]
            # Only the last bin is closed on the right.
            if idx == final_idx and point == upper:
                return bin_names[idx]
        return 'Unknown'

    return [_label_for(point) for point in values]


In [10]:
# Load the Dolly closed-QA rows and attach both embedding columns.
dolly_cqa_prepared_df = load_df_with_embeddings(csv_path='./data/dolly_cqa.csv',
                                                emb_paths=['./data/embeddings_dolly_mini_lm.pickle', './data/embeddings_dolly_bge_large.pickle'],
                                                emb_cols=['embeddings_mini_lm', 'embeddings_bge_large'])

# Score every text; compute_readability returns None when scoring fails.
dolly_cqa_prepared_df['fk_score'] = dolly_cqa_prepared_df['text'].apply(compute_readability)

# .copy() makes the filtered frame independent, so the column assignment below
# cannot trigger pandas' SettingWithCopyWarning.
# NOTE(review): dropna() drops rows with a missing value in ANY column, not only
# fk_score - confirm that this is intended.
dolly_cqa_prepared_df = dolly_cqa_prepared_df.dropna().copy()

# Split the scores into three equally populated difficulty levels.
bins = equal_frequency_binning(dolly_cqa_prepared_df['fk_score'], num_bins=3)
dolly_cqa_prepared_df['fk_score_categ'] = bin_data(dolly_cqa_prepared_df['fk_score'], bins, ['easy', 'medium', 'hard'])

dolly_cqa_prepared_df.to_parquet('./data/dolly_cqa_prepared.parquet', index=False)

In [11]:
# Display the prepared Dolly frame as this cell's output.
dolly_cqa_prepared_df

Unnamed: 0,instruction,context,response,category,text,embeddings_mini_lm,embeddings_bge_large,fk_score,fk_score_categ
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa,"### CONTEXT ###\nVirgin Australia, the trading...","[0.07990753650665283, -0.04975801706314087, 0....","[0.5005730390548706, -0.46779343485832214, -1....",9.131896,easy
1,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa,### CONTEXT ###\nKomorida was born in Kumamoto...,"[-0.011252421885728836, -0.04460098594427109, ...","[0.64762282371521, 0.43752172589302063, -0.047...",7.643152,easy
2,"Given a reference text about Lollapalooza, whe...",Lollapalooza /ˌlɒləpəˈluːzə/ (Lolla) is an ann...,Lollapalooze is an annual musical festival hel...,closed_qa,### CONTEXT ###\nLollapalooza /ˌlɒləpəˈluːzə/ ...,"[0.06580415368080139, -0.0366082526743412, 0.0...","[0.5144393444061279, 0.18275967240333557, -0.4...",11.617224,medium
3,Who was Kyle Van Zyl playing against when he s...,Van Zyl joined the Eastern Province Kings Acad...,Kyle Van Zyl was playing against Boland U21 wh...,closed_qa,### CONTEXT ###\nVan Zyl joined the Eastern Pr...,"[0.019957538694143295, 0.04702803120017052, -0...","[1.1888214349746704, 0.058644916862249374, -0....",11.474969,medium
4,Who was the first to record the Needles and Pi...,"""Needles and Pins"" is a rock song credited to ...",The first artist to record Needles and Pins so...,closed_qa,"### CONTEXT ###\n""Needles and Pins"" is a rock ...","[-0.08916875720024109, 0.0042908587493002415, ...","[-0.08973768353462219, -0.17643515765666962, 0...",9.573611,easy
...,...,...,...,...,...,...,...,...,...
1767,How many world championships has Max Verstappe...,"After winning the 2021 Abu Dhabi Grand Prix, V...",Max Verstappen has won 2 Formula One World Cha...,closed_qa,### CONTEXT ###\nAfter winning the 2021 Abu Dh...,"[0.044786978513002396, 0.0190593209117651, -0....","[0.5898312330245972, 0.14761556684970856, -0.4...",7.991569,easy
1769,What is the history of coffee?,The history of coffee dates back to centuries ...,Much of the early history of coffee was tied t...,closed_qa,### CONTEXT ###\nThe history of coffee dates b...,"[0.019176138564944267, 0.02772548422217369, -0...","[0.10451232641935349, 0.22819367051124573, -0....",11.447779,medium
1770,What are common florals found in Zigalga Natio...,Zigalga National Park (Russian: Национальный п...,Zigalga National Park has the majority of its ...,closed_qa,### CONTEXT ###\nZigalga National Park (Russia...,"[-0.026473581790924072, 0.09908297657966614, -...","[0.1897222250699997, 0.41683098673820496, 0.02...",12.950833,medium
1771,What is linux Bootloader,"A bootloader, also spelled as boot loader or c...",A bootloader is a program written in machine c...,closed_qa,"### CONTEXT ###\nA bootloader, also spelled as...","[-0.05690043047070503, 0.05346180498600006, -0...","[0.32251039147377014, -0.2482924461364746, 0.0...",19.799659,hard


In [12]:
def prepare_race_dataset(csv_path, emb_paths, emb_cols):
    """
    Load one dataset split, score its readability and bucket it into difficulty levels.

    Parameters:
    - csv_path (str): CSV file of the split; it must contain a 'text' column.
    - emb_paths (list of str): Pickle files with the embeddings, one per column.
    - emb_cols (list of str): Column name for each entry of emb_paths.

    Returns:
    - race_prepared_df (pd.DataFrame): The loaded rows without missing values,
      extended with 'fk_score' and 'fk_score_categ' ('easy' / 'medium' / 'hard').
    """
    race_prepared_df = load_df_with_embeddings(csv_path, emb_paths, emb_cols)
    race_prepared_df['fk_score'] = race_prepared_df['text'].apply(compute_readability)

    # .copy() makes the filtered frame independent, so adding a column below
    # cannot trigger pandas' SettingWithCopyWarning.
    # NOTE(review): dropna() drops rows with a missing value in ANY column, not
    # only fk_score - confirm that this is intended.
    race_prepared_df = race_prepared_df.dropna().copy()

    # Three equally populated bins computed from this split's own scores.
    bins = equal_frequency_binning(race_prepared_df['fk_score'], num_bins=3)
    race_prepared_df['fk_score_categ'] = bin_data(race_prepared_df['fk_score'], bins, ['easy', 'medium', 'hard'])
    return race_prepared_df

def _prepare_and_save_race_split(split):
    """Prepare one RACE split ('train', 'validation' or 'test') and write it to parquet.

    All three splits use the same file naming scheme under ./data/, so only the
    split name varies. Returns the prepared DataFrame.
    """
    split_df = prepare_race_dataset(csv_path=f'./data/race_{split}.csv',
                                    emb_paths=[f'./data/embeddings_{split}_mini_lm.pickle',
                                               f'./data/embeddings_{split}_bge_large.pickle'],
                                    emb_cols=['embeddings_mini_lm', 'embeddings_bge_large'])
    split_df.to_parquet(f'./data/race_{split}_prepared.parquet', index=False)
    return split_df


# Each split is prepared and saved before the next one is loaded.
race_train_df = _prepare_and_save_race_split('train')
race_validation_df = _prepare_and_save_race_split('validation')
race_test_df = _prepare_and_save_race_split('test')

In [13]:
# Display the prepared RACE training frame as this cell's output.
race_train_df

Unnamed: 0,example_id,article,answer,question,options,difficulty,text,embeddings_mini_lm,embeddings_bge_large,fk_score,fk_score_categ
0,middle6454.txt,"Hans said to his friend Kurt, ""I'm going to ta...",3,Hans went to London by _ .,['car' 'sea' 'air' 'both sea and land'],M,"### CONTEXT ###\nHans said to his friend Kurt,...","[0.038055792450904846, -0.10164088755846024, 0...","[0.711676299571991, 0.4126167297363281, -0.044...",2.798810,easy
1,middle234.txt,Bob is six years old.He is old enough to go to...,0,What do you think of the boy?,['He is not clever.' 'He is helpful.' 'He is p...,M,### CONTEXT ###\nBob is six years old.He is ol...,"[-0.030051156878471375, 0.022464649751782417, ...","[-0.4282720983028412, 0.38826122879981995, -0....",4.000531,easy
2,middle3148.txt,A teacher stood in front of his history class ...,1,Why did some students stay in their seats?,['Because they were afraid to leave.'\n 'Becau...,M,### CONTEXT ###\nA teacher stood in front of h...,"[-0.008928709663450718, -0.0035763238556683064...","[0.293457567691803, -0.10545913875102997, -0.3...",3.613056,easy
3,middle1661.txt,A famous building in New York City is turning ...,0,Which of the following statements can we infer...,"[""The station won't be changed.""\n 'People wil...",M,### CONTEXT ###\nA famous building in New York...,"[0.012237755581736565, 0.018459275364875793, 0...","[0.7666807174682617, 0.4434956908226013, -0.30...",6.238592,medium
5,middle1266.txt,Some people think only school children do not ...,2,How many pieces of advice does the writer give...,['5.' '4.' '3.' '2.'],M,### CONTEXT ###\nSome people think only school...,"[0.09224450588226318, 0.0763128250837326, 0.05...","[1.0863529443740845, 0.5433937311172485, 0.304...",5.066129,easy
...,...,...,...,...,...,...,...,...,...,...,...
38101,3154.txt,Sometimes you'll hear people say that you can'...,2,What is the passage mainly about?,['How to prepare for your success.' 'How to fa...,C,### CONTEXT ###\nSometimes you'll hear people ...,"[-0.08145900815725327, 0.05152837559580803, 0....","[0.3955683708190918, 0.2593410909175873, 0.310...",22.915212,hard
38102,1616.txt,"A pretty, well-dressed young lady stopped a ta...",0,The young lady was,['clever at making excuse.' 'not late at all.'...,C,"### CONTEXT ###\nA pretty, well-dressed young ...","[-0.015297356992959976, -0.0675896480679512, 0...","[-0.2136244773864746, 0.2980509400367737, -0.6...",5.220553,easy
38103,850.txt,"A fluid is a substance, such as a liquid or ga...",3,"According to paragraph 2, all of the following...",['the breaking apart of water molecules by ult...,C,"### CONTEXT ###\nA fluid is a substance, such ...","[-0.08058983832597733, -0.053635209798812866, ...","[0.17320740222930908, 0.11267031729221344, 0.0...",12.120323,hard
38104,827.txt,The National Trust in Britain plays an increas...,1,The National Trust is _ .,['a rich government department' 'a charity sup...,C,### CONTEXT ###\nThe National Trust in Britain...,"[0.04407892003655434, 0.07030923664569855, 0.0...","[0.7470288276672363, 0.23737327754497528, 0.48...",13.167530,hard
