In [None]:
from tools.DataLoader import DataLoader

# Load the labelled race dataset through the project's DataLoader.
dataLoader = DataLoader()
df = dataLoader.load_race_data()

# Label vocabulary reported by the loader for this frame, and its size
# (num_races is what later cells pass to the classifier constructor).
race_names = dataLoader.get_label_names(df)
num_races = len(race_names)

# Cell output: number of rows per distinct value of the 'label' column.
df["label"].value_counts()

In [None]:
# Load the unlabeled pool and keep only what the loader's filter returns.
# The filter result must be bound back to `unlabeled_data`: every later cell
# reads that name, so binding it to a differently spelled variable silently
# discards the filtering.
# NOTE(review): filter_unlabeled_data presumably removes rows that already
# appear in the labelled frame `df` — confirm in tools.DataLoader.
unlabeled_data = dataLoader.load_unlabeled_data()
unlabeled_data = dataLoader.filter_unlabeled_data(unlabeled_data, df)
unlabeled_data.head()

In [None]:
from tools.DataPreprocessor import DataPreprocessor

dataPreprocessor = DataPreprocessor()

# Stage 1: sample the labelled frame and split it into train / test frames.
(train_df, test_df) = dataPreprocessor.sample_and_split_data(df)

# Stage 2: turn both frames into the feature / target objects that the
# classifier cells consume.
(X_train, X_test, y_train, y_test) = dataPreprocessor.prepare_data(
    train_df,
    test_df,
)

In [None]:
import os

from tools.HierarchicalBertClassifier import HierarchicalBertClassifier

# Location of the BERT checkpoint directory handed to the classifier.
# The default is the original machine-specific path; set the
# BERT_VERSION_PATH environment variable to run the notebook elsewhere
# without editing this cell.
BERT_VERSION_PATH = os.environ.get(
    'BERT_VERSION_PATH',
    '/home/saveuser/S/projects/rawan2_project/Python Code/bertbase',
)

# Initialize the classifier with one output per race label.
classifier = HierarchicalBertClassifier(BERT_VERSION_PATH, num_races)


In [5]:
# from tools.BatchActiveLearner import BatchActiveLearner
# from modAL.uncertainty import uncertainty_sampling
# # Initialize the batch active learner
# learner = BatchActiveLearner(
#     estimator=classifier,
#     X_training=X_train,
#     y_training=y_train,
#     query_strategy=uncertainty_sampling
# )

In [None]:
# Initialize and train the classifier.
# A new HierarchicalBertClassifier is constructed here, so `classifier` is
# rebound to a fresh instance before fitting.
classifier = HierarchicalBertClassifier(BERT_VERSION_PATH, num_races)
# Fit on the current X_train / y_train.
classifier.fit(X_train, y_train)
# NOTE(review): magic offset (20480 + 1024 = 21504); where the two numbers
# come from is not visible in this notebook — document them or move the value
# to a configuration cell.
start_idx = 20480 + 1024

In [None]:
from tools.LabelingTool import LabelingTool
import numpy as np
from tqdm import tqdm

# --- Active-learning configuration ---------------------------------------
n_queries = 10  # number of sample -> predict -> label -> retrain rounds
step = 1024     # maximum number of unlabeled texts scored per round

label_type = 'race'
label_names = list(race_names)
labeler = LabelingTool()
with_confidence = True

# Each round:
#   1. draw a random pool of unlabeled texts,
#   2. let the current classifier predict on it,
#   3. send every text predicted as something other than 'absent' to the labeler,
#   4. append the labelled texts to the training set and retrain from scratch,
#   5. remove the labelled rows from the unlabeled pool.
pbar = tqdm(total=n_queries, desc="Active Learning")
try:
    while pbar.n < n_queries:
        # Never request more rows than remain: np.random.choice with
        # replace=False raises when size exceeds the population.
        pool_size = min(step, len(unlabeled_data))
        if pool_size == 0:
            break

        # Keep the sampled row positions. They are the only link between an
        # entry of X_pool and its row in unlabeled_data, and are needed below
        # to fetch and later drop exactly the rows that were labelled.
        pool_positions = np.random.choice(len(unlabeled_data), size=pool_size, replace=False)
        X_pool = unlabeled_data['text'].values[pool_positions]

        # Get predictions and probabilities for the batch
        print(f'predicting pool of size = {pool_size}...')
        predictions = classifier.predict(X_pool)
        probabilities = classifier.predict_proba(X_pool)

        # Find instances where race is predicted as present
        print('gathering results where race is non-absent...')
        race_present_indices = []
        for idx, (pred, prob) in enumerate(zip(predictions, probabilities)):
            if pred != 'absent':
                race_present_indices.append({
                    # Row position inside unlabeled_data (not inside X_pool).
                    # NOTE(review): assumes LabelingTool.assign_labels treats
                    # 'index' as a position into unlabeled_data — confirm.
                    'index': int(pool_positions[idx]),
                    'text': X_pool[idx],
                    'predicted_race': pred,
                    'confidence': np.max(prob)
                })

        # Least confident predictions first.
        race_present_indices.sort(key=lambda x: x['confidence'])

        print(f'Number of instances found with race present = {len(race_present_indices)}')

        # Row positions to label, in the same order as race_present_indices
        query_idx = [instance['index'] for instance in race_present_indices]
        print(query_idx)

        if query_idx:  # if there are instances with race present
            # Get labels using LabelingTool
            y = labeler.assign_labels(X_pool, race_present_indices, unlabeled_data, label_type, race_names, with_confidence)

            # X_train and y_train are parallel arrays; refuse to extend them
            # with a different number of texts and labels.
            if len(y) != len(query_idx):
                raise ValueError(
                    f'labeler returned {len(y)} labels for {len(query_idx)} queried instances'
                )

            # Fetch the queried texts by row position. Indexing the DataFrame
            # itself with a list of ints would be a column lookup.
            # NOTE(review): assumes X_train is a 1-D array of raw texts, like
            # the X_pool passed to predict — confirm against prepare_data.
            new_X = unlabeled_data['text'].values[query_idx]
            X_train = np.concatenate([X_train, new_X])
            y_train = np.concatenate([y_train, y])

            # Retrain Step: fresh model fitted on the enlarged training set
            classifier = HierarchicalBertClassifier(BERT_VERSION_PATH, num_races)
            classifier.fit(X_train, y_train)

            # Remove labeled instances from unlabeled_data
            unlabeled_data = unlabeled_data.drop(unlabeled_data.index[query_idx]).reset_index(drop=True)

        pbar.update(1)
finally:
    # Close the bar even if prediction, labelling or training is interrupted.
    pbar.close()

In [None]:
# Display the label list built in the labeling cell (list(race_names)).
label_names