In [1]:
from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
import wandb
import yaml

Setting up location variables

In [2]:
# Root of the project checkout; the paths used in this notebook are built from it.
working_dir = "/home/saradindu/dev/mlops_pipeline_flair/"

# Parse the YAML configuration; its `fine_tuning` section supplies the
# training hyperparameters used further down.
config_path = f"{working_dir}/config/config.yaml"
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

Creating a corpus and generating label dictionary

In [6]:
# Column 0 of every CSV row is the text, column 1 is its label.
column_name_map = {0: "text", 1: "label"}
# Single source of truth for the label type shared by corpus, dictionary and model.
label_type = 'label'

# Build the corpus from the train/dev/test CSV splits in the data folder.
# The first row of each file is skipped as a header.
corpus: Corpus = CSVClassificationCorpus(
    data_folder=f'{working_dir}/data',
    column_name_map=column_name_map,
    train_file='train_small.csv',
    dev_file='dev_small.csv',
    test_file='test_small.csv',
    skip_header=True,
    delimiter=',',
    label_type=label_type,
)

# Collect the label vocabulary from the corpus and show it.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# DistilBERT document embeddings, created with fine_tune=True.
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)

# Classifier on top of the embeddings, and the trainer that will fit it on the corpus.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    label_type=label_type,
)
trainer = ModelTrainer(classifier, corpus)

2024-02-15 01:49:23,488 Reading data from /home/saradindu/dev/mlops_pipeline_flair/data
2024-02-15 01:49:23,491 Train: /home/saradindu/dev/mlops_pipeline_flair/data/train_small.csv
2024-02-15 01:49:23,492 Dev: /home/saradindu/dev/mlops_pipeline_flair/data/dev_small.csv
2024-02-15 01:49:23,492 Test: /home/saradindu/dev/mlops_pipeline_flair/data/test_small.csv
2024-02-15 01:49:23,904 Computing label dictionary. Progress:


0it [00:00, ?it/s]
100000it [00:13, 7318.96it/s]

2024-02-15 01:49:37,581 Dictionary created for label 'label' with 2 values: 0 (seen 66523 times), 1 (seen 33477 times)





Dictionary with 2 tags: 0, 1


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Initializing a Weights and Biases instance for model metrics logging and monitoring

Also fine-tuning document embeddings

In [None]:
# Hyperparameters come from the `fine_tuning` section of the YAML config.
fine_tuning_cfg = config['fine_tuning']

# Using the run as a context manager closes it on exit, including when
# training raises, so a failed fit does not leave an open W&B run in the kernel.
with wandb.init(
    # set the wandb project where this run will be logged
    project="mlops_pipeline_flair",

    # track hyperparameters and run metadata
    config={
        "learning_rate": fine_tuning_cfg['learning_rate'],
        # logged as well, since it is passed to fine_tune() below
        "mini_batch_size": fine_tuning_cfg['mini_batch_size'],
        "architecture": "TransformerDocumentEmbeddings",
        "dataset": "Custom",
        "epochs": fine_tuning_cfg['max_epochs'],
    },
):
    # Fine-tune the classifier; outputs are written under <working_dir>/model.
    trainer.fine_tune(f'{working_dir}/model',
                      learning_rate=fine_tuning_cfg['learning_rate'],
                      mini_batch_size=fine_tuning_cfg['mini_batch_size'],
                      max_epochs=fine_tuning_cfg['max_epochs'])

Logging the trained model as a model artifact in the Weights and Biases model registry

In [7]:
# Open a dedicated run for the upload. The context manager finishes the run
# when the block exits, instead of leaving it open in the kernel.
with wandb.init(project="mlops_pipeline_flair") as run:
    # Wrap the trained model file in a "model" artifact and attach it to the run.
    art = wandb.Artifact("flair_text_classifier", type="model")
    art.add_file(f"{working_dir}/model/final-model.pt")
    run.log_artifact(art)



Inference

In [3]:
from flair.data import Sentence

# Load the fine-tuned classifier from disk (working_dir ends with a slash).
model_path = f"{working_dir}model/final-model.pt"
classifier = TextClassifier.load(model_path)

# Classify a placeholder input; predict() annotates the sentence in place,
# so the labels are read from the sentence afterwards.
sentence = Sentence("string")
classifier.predict(sentence)
sentence.labels

['Sentence[1]: "string"'/'0' (0.5604)]

In [2]:
from flair.models import TextClassifier
from flair.data import Sentence
import os

# Choose the project root from the STAGE environment variable:
# "dev" points at the local checkout, anything else (including unset) at /root/.
stage = os.getenv("STAGE")
if stage == "dev":
    working_dir = "/home/saradindu/dev/mlops_pipeline_flair/"
else:
    working_dir = "/root/"

In [22]:
# Loaded classifiers keyed by model path. Re-running this cell resets the cache,
# which forces a fresh load (e.g. after re-training the model).
_classifier_cache = {}


def _get_classifier(model_path):
    """Return the classifier stored at model_path, loading it from disk only once."""
    if model_path not in _classifier_cache:
        _classifier_cache[model_path] = TextClassifier.load(model_path)
    return _classifier_cache[model_path]


def classify_text(text):

    """
    A small function to classify the incoming string.
    ------------------------
    Params:
    text: A string to classify.
    ------------------------
    Output:
    The Sentence built from `text`, after the classifier's predict() has
    been run on it; the predictions are available via its `labels` attribute.
    ------------------------
    Notes:
    The model is read from "<working_dir>model/final-model.pt" on the first
    call and reused afterwards, so repeated calls do not reload it from disk.
    """
    classifier = _get_classifier(f"{working_dir}model/final-model.pt")
    sentence = Sentence(text)
    classifier.predict(sentence)
    return sentence

In [23]:
# Smoke-test the helper on a placeholder input; `r` holds the value it returns.
r = classify_text("string")

In [35]:
# Serialize the classification result through its to_dict() method for inspection.
t = r.to_dict()

In [None]:
# Call keys() so the cell displays the available keys rather than the bound method.
t.keys()