In [None]:
!pip install modAL

In [None]:
!pip install polling

In [None]:
import getpass
import os
import time

import numpy as np
import polling
from sklearn.datasets import load_files

# Data preparation

In [None]:
# Read the corpus from disk; undecodable bytes are replaced rather than raising.
data = load_files("datasets/bbc/", encoding="utf-8", decode_error="replace")

# Number of documents per category id.
labels, counts = np.unique(data.target, return_counts=True)

# Category names as an array so they can be indexed with the id array.
labels_str = np.asarray(data.target_names)[labels]

print({name: count for name, count in zip(labels_str, counts)})

In [None]:
from sklearn.model_selection import train_test_split

# Fix the seed so the split (and therefore every downstream result) is the
# same after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=42
)

# Preview: first 80 characters of the first ten training documents.
[text[:80] for text in X_train[:10]]

# Configure vectorizer (text to TF-IDF features)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: English stop words removed, vocabulary capped at 1000
# features, undecodable bytes ignored.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=1000,
    decode_error="ignore",
)

# Learn the vocabulary and IDF weights from the training texts only.
vectorizer.fit(X_train)

In [43]:
# Transform the raw training texts into TF-IDF features using the vectorizer
# that was fitted on X_train in the previous cell.
X_train_vectorized = vectorizer.transform(X_train)

# Create classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default hyperparameters; this instance is
# passed to the ActiveLearner below as its estimator.
cls = MultinomialNB()

# Create Active learner

In [None]:
from modAL.models import ActiveLearner
#from modAL.batch import uncertainty_batch_sampling

In [None]:
# Seed set: the first two training examples of every class, so the estimator
# is fitted on all categories before the first query. Without an initial fit,
# learner.predict_proba() raises NotFittedError.
seed_idx = np.concatenate(
    [np.flatnonzero(y_train == label)[:2] for label in np.unique(y_train)]
)

# NOTE: `n_instances` is not an ActiveLearner constructor argument (unknown
# keyword arguments are forwarded to estimator.fit); the batch size belongs
# in `learner.query(X_pool, n_instances=...)`.
# The default query strategy (uncertainty sampling) is used; modAL.batch's
# uncertainty_batch_sampling is an alternative for batch queries.
learner = ActiveLearner(
    estimator=cls,
    X_training=X_train_vectorized[seed_idx],
    y_training=y_train[seed_idx],
)

# Set up Rubrix

In [1]:
import rubrix
from rubrix.sdk.models import *
from rubrix.sdk.api.text_classification import bulk_records, search_records

In [None]:
# Never commit credentials to the notebook: read the Rubrix token from the
# RUBRIX_API_KEY environment variable, or prompt for it (input is not echoed).
api_key = os.environ.get("RUBRIX_API_KEY") or getpass.getpass("Rubrix API key: ")

In [None]:
# Initialise Rubrix with the API token; the returned object is passed as
# `client` to search_records.sync() in the active-learning loop below.
cli = rubrix.init(token=api_key)

# Active learning loop

In [None]:
# Active-learning loop: in every round, ask the learner for the pool examples
# it is least certain about, log them to Rubrix for annotation, wait until the
# annotators are done, then teach the learner and shrink the pool.
n_queries = 10              # number of query / annotate / teach rounds
N_INSTANCES = 5             # examples requested from the learner per round
INDEX_REFRESH_SECONDS = 2   # grace period for the Rubrix index after logging

# Pool state is kept in three parallel arrays so that removing the queried
# rows keeps texts, labels and record ids aligned.
X_pool = np.asarray(X_test, dtype=object)
y_pool = np.asarray(y_test)
pool_ids = np.arange(len(X_pool))   # stable ids: position in the original X_test

for _ in range(n_queries):
    if len(X_pool) == 0:
        break   # nothing left to query

    # Vectorize pool examples
    vecs = vectorizer.transform(X_pool)

    # Query for uncertain examples (never ask for more than the pool holds)
    query_idx, _query_inst = learner.query(
        vecs, n_instances=min(N_INSTANCES, len(X_pool))
    )

    # Class probabilities shown to the annotator as a hint. The columns of
    # predict_proba follow estimator.classes_, which may be a subset of all
    # categories. An estimator that has not been fitted yet cannot predict,
    # so a uniform distribution is logged instead.
    if hasattr(learner.estimator, "classes_"):
        class_names = [data.target_names[c] for c in learner.estimator.classes_]
        probas = learner.predict_proba(vecs[query_idx])
    else:
        class_names = list(data.target_names)
        probas = np.full(
            (len(query_idx), len(class_names)), 1.0 / len(class_names)
        )

    # build rubrix records
    uncertain_records = [
        TextClassificationRecord.from_dict({
            # Use the stable id: the position in the shrinking pool would
            # repeat across rounds and collide with earlier records.
            "id": int(pool_ids[row]),
            "inputs": {"text": str(X_pool[row])},
            "prediction": {
                "agent": "active_learner",
                "labels": [
                    {"class": str(name), "confidence": float(proba)}
                    for name, proba in zip(class_names, row_probas)
                ],
            },
        })
        for row, row_probas in zip(query_idx, probas)
    ]

    # log query records
    rubrix.log(uncertain_records, dataset="active_learning_example")

    # Give the index a moment to pick up the new records; otherwise the first
    # poll can see zero pending records and return immediately.
    time.sleep(INDEX_REFRESH_SECONDS)

    # poll rubrix until no records left to annotate
    polling.poll(
        lambda: search_records.sync(
            client=cli,
            dataset_id="active_learning_example",
            json_body=TextClassificationSearchRequest.from_dict(
                {"query": {"status": ["Default"]}}
            ),
        ).total == 0,
        step=10,
        poll_forever=True,
    )

    # TODO: read the annotated labels from `results` and teach with those.
    # Until then the held-out labels of the pool stand in for the annotations.
    results = search_records.sync(
        client=cli,
        dataset_id="active_learning_example",
        json_body=TextClassificationSearchRequest.from_dict(
            {"query": {"status": ["Validated"]}}
        ),
    )
    # Labels must come from the same split as the pool (y_test, not y_train).
    learner.teach(vecs[query_idx], y_pool[query_idx])

    # remove the queried examples from every pool array
    X_pool = np.delete(X_pool, query_idx)
    y_pool = np.delete(y_pool, query_idx)
    pool_ids = np.delete(pool_ids, query_idx)