In [1]:
import pandas as pd
import random 
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw TREC train/test splits.
training_data = pd.read_csv(filepath_or_buffer="TREC_dataset/train.csv", sep=",")
test_data = pd.read_csv(filepath_or_buffer="TREC_dataset/test.csv", sep=",")

# Distinct coarse labels found in the training split, in ascending order so the
# new label ids are assigned in a well-defined order.
labels_coarse = sorted(set(training_data["label-coarse"]))

# Randomly pick the two classes that get merged into a single OTHERS class.
# NOTE(review): the choice is unseeded, so the generated CSVs differ from run to
# run; call random.seed(...) beforehand if a reproducible dataset is required.
choices = random.sample(labels_coarse, k=2)
print(f"merged classes: {choices}")

# Mapping of old coarse label (key) -> new coarse label (value).
# Classes that are kept are renumbered consecutively from 0; both merged classes
# share the next free id (OTHERS). With six original labels this gives
# 0, 1, 2, 3 for the kept classes and 4 for OTHERS.
kept_labels = [label for label in labels_coarse if label not in choices]
new_labels = {old: new for new, old in enumerate(kept_labels)}
others_label = len(kept_labels)
for merged in choices:
    new_labels[merged] = others_label

# Series.map silently yields NaN for values without a mapping, so fail loudly
# if the test split contains a label that never occurs in the training split.
unmapped = set(test_data["label-coarse"]) - set(new_labels)
if unmapped:
    raise KeyError(f"labels without a mapping in test data: {sorted(unmapped)}")

# Relabel each split from ITS OWN original labels (the test split was previously
# overwritten with values looked up in the training frame).
training_data["label-coarse"] = training_data["label-coarse"].map(new_labels)
test_data["label-coarse"] = test_data["label-coarse"].map(new_labels)

# The fine-grained labels are not part of the saved files; drop them and save.
training_data = training_data.drop(columns=['label-fine'])
test_data = test_data.drop(columns=['label-fine'])
training_data.to_csv(path_or_buf="TREC_dataset/modified_training_data.csv", index=False, sep=',')
test_data.to_csv(path_or_buf="TREC_dataset/modified_test_data.csv", index=False, sep=',')

merged classes: [0, 4]


# To add to Q2

In [3]:
# Reload the relabelled splits from the modified CSV files.
training_data = pd.read_csv(filepath_or_buffer="TREC_dataset/modified_training_data.csv", sep=",")
test_data = pd.read_csv(filepath_or_buffer="TREC_dataset/modified_test_data.csv", sep=",")

X = training_data["text"]
y = training_data["label-coarse"]
# Hold out 500 training examples as the validation set. A fixed random_state
# makes the split identical on every "Restart & Run All", so results computed
# on X_val are comparable between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=500, random_state=42)

# The test split is used as provided.
X_test = test_data["text"]
y_test = test_data["label-coarse"]

In [4]:
# Display the training texts returned by train_test_split as a quick visual check.
X_train

2966     What 's the most common name in nursery rhymes ?
4081    What was the largest city in the world to decl...
2253        Which area produces the least acidic coffee ?
4344                          Where was Pythagoras born ?
1166    How do you tell your parents you are dating an...
                              ...                        
1397    What is the name of the medical condition in w...
4350                       What is the capital of Italy ?
5097                What is a fear of home surroundings ?
2819               What time of year do most people fly ?
3568                                Who wrote the Bible ?
Name: text, Length: 4952, dtype: object

In [5]:
from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

# Quick check of the tokenizer on one sample question: lower-case it first,
# then split it into word tokens and show the result.
text = "Who wrote the hymn `` Amazing Grace '' ?"
sample_tokens = word_tokenize(text.lower())
print(sample_tokens)

['who', 'wrote', 'the', 'hymn', '``', 'amazing', 'grace', '``', '?']


In [10]:
# Convert the training texts from a pandas Series to a plain Python list for tokenization.
X_train_lst = X_train.to_list()
def tokenize_pd_series_to_lsit(list_of_text):
    """Lower-case and word-tokenize every text in ``list_of_text``.

    The function name keeps its original spelling ("lsit") because other
    cells call it by that name.

    Args:
        list_of_text: Iterable of strings, one text per element.

    Returns:
        A list with one entry per input text, in input order; each entry is
        the result of ``word_tokenize`` applied to the lower-cased text.
    """
    return [word_tokenize(entry.lower()) for entry in list_of_text]
# Tokenize each split from its OWN texts. Validation and test were previously
# tokenized from X_train_lst, which made them copies of the training tokens.
X_train_tokenized = tokenize_pd_series_to_lsit(X_train_lst)
X_val_tokenized = tokenize_pd_series_to_lsit(X_val.to_list())
X_test_tokenized = tokenize_pd_series_to_lsit(X_test.to_list())

# Each tokenized split must line up one-to-one with its source texts.
assert len(X_train_tokenized) == len(X_train)
assert len(X_val_tokenized) == len(X_val)
assert len(X_test_tokenized) == len(X_test)

[['what',
  "'s",
  'the',
  'most',
  'common',
  'name',
  'in',
  'nursery',
  'rhymes',
  '?'],
 ['what',
  'was',
  'the',
  'largest',
  'city',
  'in',
  'the',
  'world',
  'to',
  'declare',
  'martial',
  'law',
  'in',
  '1989',
  '?'],
 ['which', 'area', 'produces', 'the', 'least', 'acidic', 'coffee', '?'],
 ['where', 'was', 'pythagoras', 'born', '?'],
 ['how',
  'do',
  'you',
  'tell',
  'your',
  'parents',
  'you',
  'are',
  'dating',
  'an',
  'older',
  'man',
  '?'],
 ['what', 'kennedy', 'was', 'married', 'to', 'ethel', '?'],
 ['what', 'state', 'is', 'known', 'as', 'the', 'hawkeye', 'state', '?'],
 ['who', 'discovered', 'electricity', '?'],
 ['what', 'u.s.', 'state', 'borders', 'illinois', 'to', 'the', 'north', '?'],
 ['what',
  'film',
  'canine',
  'is',
  'buried',
  'in',
  'pere-lachaise',
  'cemetery',
  'in',
  'paris',
  '?'],
 ['what', 'does', 'the', 'name', 'gina', 'mean', '?'],
 ['what',
  'soft',
  'drink',
  'held',
  'a',
  'national',
  'flavor',
  'p