In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm


In [2]:
# Read the serialized train/test bundle from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open 'train_test_data.pkl' if it comes from a trusted source.
with open('train_test_data.pkl', 'rb') as f:
    data = pickle.load(f)

# Unpack the four entries the rest of the notebook works with.
X_train_full, X_test = data['X_train'], data['X_test']
y_train_full, y_test = data['y_train'], data['y_test']

In [3]:
from sklearn.model_selection import train_test_split

# Seed the experiment: a stratified 10% of the training data becomes the
# initial labeled set, the remaining rows form the unlabeled pool.
X_labeled, X_pool, y_labeled, y_pool = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    train_size=0.1,
    random_state=42,
)

In [5]:
# Positional indices 0..n-1 for the labeled set and the pool
# (not referenced by the other cells visible in this notebook).
labeled_indices = [*range(len(y_labeled))]
pool_indices = [*range(len(y_pool))]

# Sampling schedule: number of rounds, and pool rows moved per round.
N_ITERATIONS = 20
BATCH_SIZE = 50

In [6]:
def uncertainty_sampling(model, X_pool, n_samples=50):
    """Select the pool rows the model is least confident about.

    The uncertainty of a row is ``1 - max(predicted class probability)``.

    Args:
        model: Classifier exposing ``predict_proba``.
        X_pool: Feature rows of the unlabeled pool; passed to
            ``model.predict_proba`` unchanged.
        n_samples: Maximum number of rows to select. Values ``<= 0`` select
            nothing; values larger than the pool select every row.

    Returns:
        1-D integer array of row positions into ``X_pool``, ordered from
        most to least uncertain.
    """
    # Guard first: a slice of [-0:] would otherwise return the *entire* pool.
    if n_samples <= 0:
        return np.empty(0, dtype=np.intp)

    probas = model.predict_proba(X_pool)
    uncertainties = 1 - np.max(probas, axis=1)

    # argsort is ascending, so the tail holds the most uncertain rows;
    # reverse it so the most uncertain row comes first.
    ranked = np.argsort(uncertainties)
    return ranked[-n_samples:][::-1]


In [7]:
# Per-iteration history of the active-learning run: one (initially empty)
# list per tracked column.
al_metrics = {
    column: [] for column in ('iteration', 'n_labeled', 'accuracy', 'f1')
}


In [9]:
from scipy.sparse import vstack

# The loop indexes the label vectors by *position*. Coerce them to plain
# ndarrays so that, if the pickle holds pandas Series, `y_pool[i]` is not
# silently interpreted as a label-based lookup.
y_labeled = np.asarray(y_labeled)
y_pool = np.asarray(y_pool)

for iteration in tqdm(range(N_ITERATIONS), desc="      Active Learning"):

    # Retrain from scratch on everything labeled so far.
    model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
    model.fit(X_labeled, y_labeled)

    # Score the current model on the held-out test set and log the round.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    al_metrics['iteration'].append(iteration)
    al_metrics['n_labeled'].append(len(y_labeled))
    al_metrics['accuracy'].append(accuracy)
    al_metrics['f1'].append(f1)

    # Stop once the pool can no longer supply a full batch.
    if len(y_pool) < BATCH_SIZE:
        print(f"\n      Pool exhausted at iteration {iteration}")
        break

    uncertain_indices = uncertainty_sampling(model, X_pool, n_samples=BATCH_SIZE)

    # Move the whole batch at once. Stacking and deleting row by row recopies
    # the full matrix for every single sample.
    # NOTE(review): scipy's vstack is used, so the feature matrices are
    # presumably scipy sparse matrices — confirm.
    X_labeled = vstack([X_labeled, X_pool[uncertain_indices]])
    y_labeled = np.concatenate([y_labeled, y_pool[uncertain_indices]])

    # Keep every pool row that was not selected, preserving original order.
    keep_mask = np.ones(len(y_pool), dtype=bool)
    keep_mask[uncertain_indices] = False
    keep_rows = np.flatnonzero(keep_mask)
    X_pool = X_pool[keep_rows]
    y_pool = y_pool[keep_rows]

      Active Learning: 100%|██████████| 20/20 [00:27<00:00,  1.39s/it]


In [10]:
# Fresh labeled/pool split for the random-sampling baseline: a stratified 10%
# of the training data with random_state=42.
X_labeled_random, X_pool_random, y_labeled_random, y_pool_random = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    train_size=0.1,
    random_state=42,
)

# Per-iteration history of the baseline run: one empty list per column.
random_metrics = {
    column: [] for column in ('iteration', 'n_labeled', 'accuracy', 'f1')
}

In [11]:
# Dedicated seeded generator so the random baseline draws the same batches on
# every run; the unseeded global np.random state made results irreproducible.
baseline_rng = np.random.default_rng(42)

# The loop indexes the label vectors by *position*. Coerce them to plain
# ndarrays so that, if they are pandas Series, `y[i]` is not silently
# interpreted as a label-based lookup.
y_labeled_random = np.asarray(y_labeled_random)
y_pool_random = np.asarray(y_pool_random)

for iteration in tqdm(range(N_ITERATIONS), desc="      Random Sampling"):

    # Retrain from scratch on everything labeled so far.
    model_random = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
    model_random.fit(X_labeled_random, y_labeled_random)

    # Score the current model on the held-out test set.
    y_pred_random = model_random.predict(X_test)
    accuracy_random = accuracy_score(y_test, y_pred_random)
    f1_random = f1_score(y_test, y_pred_random, average='weighted')

    # Log the round.
    random_metrics['iteration'].append(iteration)
    random_metrics['n_labeled'].append(len(y_labeled_random))
    random_metrics['accuracy'].append(accuracy_random)
    random_metrics['f1'].append(f1_random)

    # Stop once the pool can no longer supply a full batch.
    if len(y_pool_random) < BATCH_SIZE:
        break

    # Draw BATCH_SIZE distinct pool positions uniformly at random.
    random_indices = baseline_rng.choice(
        len(y_pool_random), size=BATCH_SIZE, replace=False
    )

    # Move the whole batch at once. Stacking and deleting row by row recopies
    # the full matrix for every single sample.
    # NOTE(review): scipy's vstack is used, so the feature matrices are
    # presumably scipy sparse matrices — confirm.
    X_labeled_random = vstack([X_labeled_random, X_pool_random[random_indices]])
    y_labeled_random = np.concatenate(
        [y_labeled_random, y_pool_random[random_indices]]
    )

    # Keep every pool row that was not drawn, preserving original order.
    keep_mask = np.ones(len(y_pool_random), dtype=bool)
    keep_mask[random_indices] = False
    keep_rows = np.flatnonzero(keep_mask)
    X_pool_random = X_pool_random[keep_rows]
    y_pool_random = y_pool_random[keep_rows]



      Random Sampling: 100%|██████████| 20/20 [00:29<00:00,  1.45s/it]


In [13]:
# Tabulate the per-iteration histories of both strategies.
al_df, random_df = (
    pd.DataFrame(history) for history in (al_metrics, random_metrics)
)

# Accuracy of the last recorded iteration of each run, in percent, and the
# gap between them (positive means active learning ended higher).
final_al_acc = al_df['accuracy'].iloc[-1] * 100
final_random_acc = random_df['accuracy'].iloc[-1] * 100
improvement = final_al_acc - final_random_acc


In [14]:
# Bundle both metric tables, the most recently trained active-learning model
# and the final accuracy gap into one object.
results = dict([
    ('active_learning', al_df),
    ('random_sampling', random_df),
    ('final_model', model),
    ('improvement', improvement),
])

# Persist the bundle next to the notebook.
with open('active_learning_results.pkl', 'wb') as f:
    pickle.dump(results, f)