In [54]:
import pandas as pd
import random 
from sklearn.model_selection import train_test_split
import numpy as np
import torch

In [55]:
# read the csvs
training_data = pd.read_csv(filepath_or_buffer="TREC_dataset/train.csv", sep=",") 
test_data = pd.read_csv(filepath_or_buffer="TREC_dataset/test.csv", sep=",") 

# get all coarse labels, they are integers from 0 to 5
labels_coarse = list(set([x for x in training_data["label-coarse"]]))

# choosing two classes to merge
choices = random.sample(labels_coarse, k=2)
print(f"merged classes: {choices}")

# define the new class mappings old coarse labels (key): new coarse labels (key)
# new labels are 0, 1, 2, 3, 4, and 4 is OTHERS
new_labels = {}

curr = 0
for i in labels_coarse:
    if i not in new_labels:
        if i not in choices:
            new_labels[i] = curr
            curr += 1
        else:
            new_labels[i] = 4
            
# replace all the labels in the training and test dataset
for i, row in training_data.iterrows():
    training_data.at[i,'label-coarse'] = new_labels[training_data.at[i,'label-coarse']]
    
for i, row in test_data.iterrows():
    test_data.at[i,'label-coarse'] = new_labels[training_data.at[i,'label-coarse']]
    
# save the files
training_data.drop(columns=['label-fine'], inplace=True)
test_data.drop(columns=['label-fine'], inplace=True)
training_data.to_csv(path_or_buf="TREC_dataset/modified_training_data.csv", index=False, sep=',')
test_data.to_csv(path_or_buf="TREC_dataset/modified_test_data.csv", index=False, sep=',')

merged classes: [3, 0]


# To add to Q2

In [56]:
training_data = pd.read_csv(filepath_or_buffer="TREC_dataset/modified_training_data.csv", sep=",") 
test_data = pd.read_csv(filepath_or_buffer="TREC_dataset/modified_test_data.csv", sep=",")

X = training_data["text"]
y = training_data["label-coarse"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=500)

X_test = test_data["text"]
y_test = test_data["label-coarse"]

In [57]:
X_train

1428        What country 's northernmost city is Darwin ?
3826           What countries have the most auto thefts ?
4787    What card game derived its name from biritch ,...
2871    What Broadway musical featured the song , `` I...
3839          What is the most popular sports car color ?
                              ...                        
3519              Who is the Incredible Hulk in reality ?
1033    What does the Ouarterly Review of Doublespeak ...
1121            Who is the current UN Secretary General ?
3462             Name one of the major gods of Hinduism .
3689    How can a foreigner get a U.S. Social Security...
Name: text, Length: 4952, dtype: object

In [58]:
from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

text = "Who wrote the hymn `` Amazing Grace '' ?"
print(word_tokenize(text.lower()))

['who', 'wrote', 'the', 'hymn', '``', 'amazing', 'grace', '``', '?']


In [59]:
X_train_lst = X_train.to_list()
X_val_lst = X_val.to_list()
X_test_lst = X_test.to_list()

def tokenize_pd_series_to_lsit(list_of_text):
    tokenized = []
    for sentence in list_of_text:
        tokenized.append(word_tokenize(sentence.lower()))
    return tokenized
X_train_tokenized = tokenize_pd_series_to_lsit(X_train_lst)
X_val_tokenized = tokenize_pd_series_to_lsit(X_val_lst)
X_test_tokenized = tokenize_pd_series_to_lsit(X_test_lst)
X_test_tokenized = tokenize_pd_series_to_lsit(X_train_lst)

In [60]:
no_of_labels = max(y_train.to_list()) + 1
no_of_labels

5

In [61]:
def format_label(label):
    return torch.unsqueeze(torch.tensor(label.to_list()), axis=1).tolist()

y_train_formatted = format_label(y_train)
y_val_formatted = format_label(y_val)
y_test_formatted = format_label(y_test)

In [62]:
import gensim.downloader
word2vec_goog1e_news: gensim.models.keyedvectors.KeyedVectors = gensim.downloader.load('word2vec-google-news-300')
word2vec_goog1e_news.add_vector("<pad>", np.zeros(300))
pad_index = word2vec_goog1e_news.key_to_index["<pad>"]
embedding_weights = torch.FloatTensor(word2vec_goog1e_news.vectors)
vocab = word2vec_goog1e_news.key_to_index

def indexify(data):
    setences = []
    for sentence in data:
        s = [vocab[token] if token in vocab
            else vocab['UNK']
            for token in sentence]
        setences.append(s)
    return setences



In [63]:
X_val_tokenized_indexified = indexify(X_val_tokenized)
X_val_tokenized_indexified

[[83, 198, 98307, 59665, 219, 4, 644530, 98307],
 [83, 523, 21, 11, 106, 2685, 13331, 98307],
 [83, 2042, 94, 8782, 44, 521, 17, 98307, 98307, 29, 202515, 185298, 98307],
 [83,
  230950,
  3156,
  2723,
  11,
  2216,
  98307,
  98307,
  88,
  4501,
  37,
  98307,
  2242,
  251,
  98307,
  98307],
 [83, 4, 11, 106, 1141, 1213, 385, 2689, 98307],
 [83, 4, 11, 1567, 171, 119847, 1451, 98307, 84107, 1451, 98307],
 [48, 100, 1088, 11, 2973, 6972, 31336, 98307],
 [117,
  50,
  4501,
  359,
  181,
  5,
  11,
  98307,
  165895,
  98307,
  66,
  10809,
  1,
  98307,
  2530,
  98307],
 [83, 98307, 11, 157, 98307, 106, 941, 5617, 98307],
 [83, 4, 2689, 98307],
 [540, 4, 71204, 98307, 4162, 234, 845, 85, 1119, 98307],
 [31,
  4,
  11,
  89383,
  10610,
  907699,
  31,
  4,
  26424,
  2566,
  2,
  26,
  4694,
  6001,
  98307],
 [83, 638, 22681, 98307],
 [83,
  51856,
  12244,
  3304,
  58,
  98307,
  177,
  14,
  98307,
  98307,
  98307,
  98307,
  98307,
  288276,
  98307],
 [139, 131, 992, 19, 5,