In [1]:
import pandas as pd
import random 
from sklearn.model_selection import train_test_split
import numpy as np
import torch

In [2]:
# Read the TREC train/test splits.
training_data = pd.read_csv(filepath_or_buffer="TREC_dataset/train.csv", sep=",")
test_data = pd.read_csv(filepath_or_buffer="TREC_dataset/test.csv", sep=",")

# All coarse labels present in the training split, as plain Python ints.
# Sorting makes the order (and hence the new numbering) independent of set
# iteration order.
labels_coarse = sorted(set(training_data["label-coarse"].tolist()))

# Choose two classes to merge into a single OTHERS class.
# MERGE_SEED = None draws a different pair on every run (the original
# behaviour); set it to an int to make the merged pair, and therefore the
# saved CSVs, reproducible.
MERGE_SEED = None
merged_old_labels = random.Random(MERGE_SEED).sample(labels_coarse, k=2)
print(f"merged classes: {merged_old_labels}")

# Mappings between label spaces:
#   old_to_new_labels: old coarse label (key) -> new coarse label (value)
#   new_to_old_labels: new coarse label (key) -> old coarse label (value);
#                      the OTHERS entry maps to the list of merged old labels.
# Kept classes are renumbered 0..others_label-1 and OTHERS takes the next free
# index, so the new labels stay contiguous for any number of classes
# (with 6 classes and 2 merged, OTHERS is 4).
others_label = len(labels_coarse) - len(merged_old_labels)

old_to_new_labels = {}
new_to_old_labels = {}

next_new_label = 0

for old_label in labels_coarse:
    if old_label in merged_old_labels:
        old_to_new_labels[old_label] = others_label
    else:
        old_to_new_labels[old_label] = next_new_label
        new_to_old_labels[next_new_label] = old_label
        next_new_label += 1

new_to_old_labels[others_label] = merged_old_labels


def _remap_coarse_labels(frame, mapping):
    """Return a copy of `frame` with remapped coarse labels and no fine labels.

    Parameters:
        frame: DataFrame with 'label-coarse' and 'label-fine' columns.
        mapping: dict from old coarse label to new coarse label.

    Returns:
        A new DataFrame; `frame` itself is not modified.

    Raises:
        KeyError: if `frame` contains a coarse label that `mapping` lacks
            (Series.map would otherwise silently turn it into NaN).
    """
    unknown = set(frame["label-coarse"].tolist()) - set(mapping)
    if unknown:
        raise KeyError(f"coarse labels without a mapping: {sorted(unknown)}")
    remapped = frame["label-coarse"].map(mapping)
    return frame.assign(**{"label-coarse": remapped}).drop(columns=["label-fine"])


# Replace the labels in both splits (vectorised; no row-by-row mutation).
training_data = _remap_coarse_labels(training_data, old_to_new_labels)
test_data = _remap_coarse_labels(test_data, old_to_new_labels)

# Save the modified splits.
training_data.to_csv(path_or_buf="TREC_dataset/modified_training_data.csv", index=False, sep=',')
test_data.to_csv(path_or_buf="TREC_dataset/modified_test_data.csv", index=False, sep=',')

merged classes: [3, 4]


In [3]:
# Display the mapping from each old coarse label to its new coarse label.
old_to_new_labels

{0: 0, 1: 1, 2: 2, 3: 4, 4: 4, 5: 3}

In [4]:
# Display the reverse mapping; the merged class maps to a list of old labels.
new_to_old_labels

{0: 0, 1: 1, 2: 2, 3: 5, 4: [3, 4]}