In [None]:
from tools.DataLoader import DataLoader
# Load the labeled race data through the project's DataLoader helper.
dataLoader = DataLoader()
df = dataLoader.load_race_data()
# Label names derived from the loaded frame; their count is passed to the
# classifier constructor in a later cell.
race_names = dataLoader.get_label_names(df)
num_races = len(race_names)
# Cell output: tally of the values in the 'label' column
# (presumably a per-class row count — assumes df is a pandas DataFrame).
df['label'].value_counts()

In [None]:
# Load the unlabeled pool that the active-learning loop queries from.
unlabeled_data = dataLoader.load_unlabeled_data()
# Keep the filtered frame under the SAME name. The original cell stored it
# in a misspelled variable (`unlabled_data`), so the filter result was never
# used by anything downstream.
# NOTE(review): filter_unlabeled_data presumably removes rows that already
# appear in the labeled frame `df` — confirm against tools.DataLoader.
unlabeled_data = dataLoader.filter_unlabeled_data(unlabeled_data, df)
# Cell output: preview of the (filtered) unlabeled frame.
unlabeled_data.head()

In [None]:
from tools.DataPreprocessor import DataPreprocessor
dataPreprocessor = DataPreprocessor()
# Sample the labeled frame and split it into a train part and a test part.
train_df, test_df = dataPreprocessor.sample_and_split_data(df)
# Turn both parts into model inputs (X_*) and targets (y_*); these four names
# are consumed by the learner setup and the final prediction cell.
X_train, X_test, y_train, y_test = dataPreprocessor.prepare_data(train_df, test_df)

In [None]:
import os

from tools.HierarchicalBertClassifier import HierarchicalBertClassifier

# Path handed to HierarchicalBertClassifier (presumably a local BERT
# checkpoint directory — confirm). It can be overridden per machine through
# the BERT_VERSION_PATH environment variable; when the variable is unset the
# original hard-coded location is used, so existing setups keep working.
BERT_VERSION_PATH = os.environ.get(
    'BERT_VERSION_PATH',
    '/home/saveuser/S/projects/rawan2_project/Python Code/bertbase',
)
# Initialize the classifier with one output per race label.
classifier = HierarchicalBertClassifier(BERT_VERSION_PATH, num_races)


In [None]:
from tools.BatchActiveLearner import BatchActiveLearner
from modAL.uncertainty import uncertainty_sampling

# Collect the constructor arguments first: the classifier acts as the
# estimator, the training split is the initial labeled set, and modAL's
# uncertainty sampling is the query strategy.
learner_kwargs = {
    'estimator': classifier,
    'X_training': X_train,
    'y_training': y_train,
    'query_strategy': uncertainty_sampling,
}

# Build the batch active learner from those arguments.
learner = BatchActiveLearner(**learner_kwargs)

In [None]:
from tools.LabelingTool import LabelingTool
import numpy as np
from tqdm import tqdm

# Active learning loop.
# Each round takes one fixed-size window of `unlabeled_data` as the pool,
# asks the learner which pool rows to label, collects the labels through
# LabelingTool, teaches the learner with them, and removes the labeled rows
# from `unlabeled_data` before moving on to the next window.
n_queries = 10  # maximum number of query/teach rounds

# Prepare unlabeled data
start_idx = 24576  # positional row offset of the first pool window
step = 1024 * 4    # number of rows per pool window
label_type = 'race'
label_names = list(race_names)
labeler = LabelingTool()

# The context manager closes the progress bar even if a round raises.
with tqdm(total=n_queries, desc="Active Learning") as pbar:
    while pbar.n < n_queries:
        # Slice the current window and stop once the frame is exhausted,
        # instead of querying the learner with an empty pool.
        X_pool = unlabeled_data[start_idx:start_idx+step]['text'].values
        if len(X_pool) == 0:
            break

        # query_idx holds positions relative to X_pool (it indexes X_pool below).
        query_idx = learner.query(X_pool)
        query_instances = [X_pool[idx] for idx in query_idx]
        # NOTE(review): assign_labels receives pool-relative indices together
        # with the whole frame — confirm in tools.LabelingTool how it maps
        # them to rows.
        y = labeler.assign_labels(X_pool, query_idx, unlabeled_data, label_type, race_names)

        learner.teach(X=query_instances, y=y)

        # Remove the rows that were just labeled. query_idx is relative to
        # the window, so shift it by start_idx to get positions in the frame;
        # using it unshifted would drop unrelated rows near the top of the
        # frame.
        frame_positions = start_idx + np.asarray(query_idx, dtype=int)
        rows_before = len(unlabeled_data)
        unlabeled_data = unlabeled_data.drop(unlabeled_data.index[frame_positions]).reset_index(drop=True)
        rows_dropped = rows_before - len(unlabeled_data)

        pbar.update(1)
        # The drop shifted every later row up by rows_dropped, so advance by
        # that much less to start the next window on the first unseen row.
        start_idx = start_idx + step - rows_dropped

# Make predictions on the test set
print("Making final predictions on test set...")
predictions = learner.predict(X_test)

In [None]:
# Cell output: the label names (the list built from race_names in the
# active-learning cell).
label_names