In [None]:
! pip install ipykernel
! pip install -U pip setuptools wheel
! pip install -U spacy[transformers, lookups]==3.0.3
! python -m spacy download en_core_web_trf
! pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 torchaudio==0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch lts.html
! pip install cupy-cuda113
! pip install scispacy
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz

In [None]:
import random
import time

import pandas as pd
import scispacy
import spacy
from sklearn.metrics import precision_recall_fscore_support, classification_report
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load the CSV data.
# The raw.githubusercontent.com host serves the file itself; the
# github.com/.../blob/... address returns an HTML page that read_csv cannot parse.
data = pd.read_csv('https://raw.githubusercontent.com/uml-digital-health/Labs/main/Project3/data/train.csv')

# Load the SciBERT-based scispaCy pipeline installed in the setup cell and
# reuse its existing "ner" component.
# NOTE(review): assumes the package ships components named "transformer" and
# "ner" - confirm with nlp.pipe_names.
nlp = spacy.load("en_core_sci_scibert")
ner = nlp.get_pipe("ner")


def _non_overlapping(spans):
    """Return the spans sorted by start offset, skipping empty spans and any
    span that overlaps one already kept (overlapping entity offsets are
    rejected when the Example is built)."""
    kept = []
    for start, end, label in sorted(set(spans)):
        if end <= start:
            continue
        if kept and start < kept[-1][1]:
            continue
        kept.append((start, end, label))
    return kept


# Convert the CSV data into training examples.
# Rows are grouped by text so that each Example carries every annotated span
# of that text; a row with a missing start/end/sbdh value contributes no span.
examples = []
for text, rows in data.dropna(subset=['text']).groupby('text', sort=False):
    annotated = rows.dropna(subset=['start', 'end', 'sbdh'])
    spans = [
        (int(start), int(end), str(label))
        for start, end, label in zip(annotated['start'], annotated['end'], annotated['sbdh'])
    ]
    # Example.from_dict needs a Doc (not a Language object) as the prediction side.
    examples.append(Example.from_dict(nlp.make_doc(text), {"entities": _non_overlapping(spans)}))

# Add the entity labels
entity_labels = sorted(data['sbdh'].dropna().astype(str).unique())
for label in entity_labels:
    ner.add_label(label)

# Train the model
n_iter = 100
batch_size = 4
spacy.util.fix_random_seed(0)  # make shuffling and dropout repeatable
# resume_training() returns the optimizer that nlp.update() needs.
optimizer = nlp.resume_training()
# Only update the components the NER depends on; the remaining components
# stay in the pipeline but are not trained.
trainable = [name for name in nlp.pipe_names if name in ("transformer", "ner")]
with nlp.select_pipes(enable=trainable):
    for i in range(n_iter):
        losses = {}
        random.shuffle(examples)
        batches = minibatch(examples, size=compounding(batch_size, batch_size * 2, 1.001))
        for batch in batches:
            # spaCy v3 takes a batch of Example objects, not (texts, annotations).
            nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
        print('Iteration {}/{} - losses: {}'.format(i + 1, n_iter, losses))

# Save the model
nlp.to_disk('my_ner_model')

# Use the model to process input text and generate NER tags
doc = nlp("Some example text that contains a tobacco-related behavior.")
for ent in doc.ents:
    print(ent.text, ent.label_)


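## Evaluating the model

The next cell loads the saved pipeline and reports strict and partial precision, recall and F1 on the test set, together with the average processing time per document.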

In [None]:


# Restore the pipeline that was written to disk after training
trained_model_dir = 'my_ner_model'
nlp = spacy.load(trained_model_dir)

# Define the evaluation function
def _spans_overlap(first, second):
    """Return True when two (start, end, label) spans share a label and at least one character."""
    return first[2] == second[2] and first[0] < second[1] and second[0] < first[1]


def _precision_recall_f1(pred_hits, true_hits, n_pred, n_true):
    """Turn hit counts into (precision, recall, f1, None); a ratio with a zero denominator is 0.0."""
    precision = pred_hits / n_pred if n_pred else 0.0
    recall = true_hits / n_true if n_true else 0.0
    total = precision + recall
    f1 = 2 * precision * recall / total if total else 0.0
    # The trailing None keeps the 4-item (precision, recall, fscore, support)
    # layout that callers indexing [0], [1], [2] were written against.
    return precision, recall, f1, None


def evaluate_model(docs, model=None):
    """Score predicted entities against the entities stored on ``docs``.

    Each doc's text is re-processed by the pipeline and the predicted entity
    spans are compared with ``doc.ents`` as (start_char, end_char, label)
    triples, micro-averaged over all docs.

    Args:
        docs: iterable of objects exposing ``.text`` and ``.ents`` (each entity
            exposing ``.start_char``, ``.end_char`` and ``.label_``); the
            entities on these docs are treated as the gold annotation.
        model: optional callable mapping a text to a processed doc. Defaults to
            the module-level ``nlp`` pipeline.

    Returns:
        ``(strict_metrics, partial_metrics, processing_time)`` where each
        metrics value is ``(precision, recall, f1, None)``.
        Strict matching counts a prediction as correct only when start, end and
        label are all identical to a gold span. Partial matching counts a
        prediction as correct when it has the same label as, and overlaps by at
        least one character with, some gold span (and a gold span as found when
        some prediction overlaps it in that way).
        ``processing_time`` is the elapsed wall-clock seconds divided by the
        number of docs, or 0.0 when ``docs`` is empty.
    """
    pipeline = nlp if model is None else model
    start_time = time.time()
    n_docs = n_true = n_pred = 0
    strict_hits = partial_pred_hits = partial_true_hits = 0
    for doc in docs:
        n_docs += 1
        true_ents = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
        predicted = pipeline(doc.text)
        pred_ents = {(ent.start_char, ent.end_char, ent.label_) for ent in predicted.ents}
        n_true += len(true_ents)
        n_pred += len(pred_ents)
        strict_hits += len(true_ents & pred_ents)
        partial_pred_hits += sum(
            1 for pred in pred_ents if any(_spans_overlap(pred, gold) for gold in true_ents)
        )
        partial_true_hits += sum(
            1 for gold in true_ents if any(_spans_overlap(gold, pred) for pred in pred_ents)
        )
    # Calculate the evaluation metrics
    strict_metrics = _precision_recall_f1(strict_hits, strict_hits, n_pred, n_true)
    partial_metrics = _precision_recall_f1(partial_pred_hits, partial_true_hits, n_pred, n_true)
    # Calculate the processing time (guarding against an empty input)
    processing_time = (time.time() - start_time) / n_docs if n_docs else 0.0
    return strict_metrics, partial_metrics, processing_time

# Load the test data.
# The raw.githubusercontent.com host serves the CSV itself; the
# github.com/.../blob/... address returns an HTML page.
test_data = pd.read_csv('https://raw.githubusercontent.com/uml-digital-health/Labs/main/Project3/data/test.csv')

# Convert the test data into Doc objects that carry the gold entities.
# NOTE(review): assumes test.csv has the same 'start'/'end'/'sbdh' columns as
# the training CSV; when they are absent the docs are built without entities.
has_gold = {'start', 'end', 'sbdh'} <= set(test_data.columns)
docs = []
for text, rows in test_data.dropna(subset=['text']).groupby('text', sort=False):
    gold_doc = nlp.make_doc(text)
    if has_gold:
        annotated = rows.dropna(subset=['start', 'end', 'sbdh'])
        spans = []
        for start, end, label in zip(annotated['start'], annotated['end'], annotated['sbdh']):
            # "expand" snaps offsets that fall inside a token to the token
            # boundaries instead of returning None for them.
            span = gold_doc.char_span(int(start), int(end), label=str(label), alignment_mode="expand")
            if span is not None:
                spans.append(span)
        # doc.ents cannot hold overlapping spans, so overlaps are filtered first.
        gold_doc.ents = spacy.util.filter_spans(spans)
    docs.append(gold_doc)

# Evaluate the model
strict_metrics, partial_metrics, processing_time = evaluate_model(docs)

# Print the evaluation results
print('Strict matching:\nPrecision: {}\nRecall: {}\nF1 score: {}'.format(strict_metrics[0], strict_metrics[1], strict_metrics[2]))
print('Partial matching:\nPrecision: {}\nRecall: {}\nF1 score: {}'.format(partial_metrics[0], partial_metrics[1], partial_metrics[2]))
print('Processing time per document: {} seconds'.format(processing_time))