In [1]:
import pandas as pd
import glob
import warnings
warnings.filterwarnings('ignore')

In [2]:
path = './data/csv_files/'
all_files = glob.glob(f"{path}/*.csv")

In [3]:
all_files[1]

'./data/csv_files\\ActivityDefinition.csv'

In [4]:
# For all files in the path
# Create a consolidated dataframe
all_resources = pd.DataFrame()

for filename in all_files:
    # Read the file
    df = pd.read_csv(filename, index_col=None, header=0)
    # Get the name of the file
    file_name = filename.split('\\')[-1].split('.')[0]
    # Add a column with the name of the file
    df['resource'] = file_name
    # Append the dataframe to the consolidated dataframe
    all_resources = all_resources.append(df, ignore_index=True)
    # Drop the Unnamed column
    all_resources.drop(columns=['Unnamed: 0'], inplace=True)
# Save the consolidated dataframe
all_resources.to_csv('./data/all_resources.csv', index=False)

In [5]:
# Show 5 rows per resource
all_resources.groupby('resource').tail(5)


Unnamed: 0,Path,Name,Flag,Card,Type,Description,resource
12,Account.guarantor,guarantor,,0..*,BackboneElement,The parties ultimately responsible for balanci...,Account
13,Account.guarantor.party,party,,1..1,Reference(Patient | RelatedPerson | Organization),Responsible entity,Account
14,Account.guarantor.onHold,onHold,,0..1,boolean,Credit or other hold applied,Account
15,Account.guarantor.period,period,,0..1,Period,Guarantee account during,Account
16,Account.partOf,partOf,,0..1,Reference(Account),Reference to a parent Account,Account
...,...,...,...,...,...,...,...
4921,VisionPrescription.lensSpecification.diameter,diameter,,0..1,decimal,Contact lens diameter,VisionPrescription
4922,VisionPrescription.lensSpecification.duration,duration,,0..1,SimpleQuantity,Lens wear duration,VisionPrescription
4923,VisionPrescription.lensSpecification.color,color,,0..1,string,Color required,VisionPrescription
4924,VisionPrescription.lensSpecification.brand,brand,,0..1,string,Brand required,VisionPrescription


In [6]:
all_resources['resource'].nunique()

146

In [7]:
# Groupby the resource column and turn the name of the resource into a list and Description into a list
all_resources_grouped = all_resources.groupby('resource')['Name', 'Description'].agg(lambda x: list(x)).reset_index()

In [8]:
all_resources_grouped

Unnamed: 0,resource,Name,Description
0,Account,"[ Account , identifier , status , type , n...","[Tracks balance, charges, for patient or cost ..."
1,ActivityDefinition,"[ ActivityDefinition , url , identifier , v...",[The definition of a specific activity to be t...
2,AdverseEvent,"[ AdverseEvent , identifier , actuality , c...","[Medical care, research study or other healthc..."
3,AllergyIntolerance,"[ AllergyIntolerance , identifier , clinical...",[Allergy or Intolerance (generally: Risk of ad...
4,Appointment,"[ Appointment , identifier , status , cance...",[A booking of a healthcare event among patient...
...,...,...,...
141,TestReport,"[ TestReport , identifier , name , status ,...",[Describes the results of a TestScript executi...
142,TestScript,"[ TestScript , url , identifier , version ,...",[Describes a set of tests+ Warning: Name shoul...
143,ValueSet,"[ ValueSet , url , identifier , version , ...",[A set of codes drawn from one or more code sy...
144,VerificationResult,"[ VerificationResult , target , targetLocati...","[Describes validation requirements, source(s),..."


In [9]:
all_resources[all_resources.Name.str.contains("account")]

Unnamed: 0,Path,Name,Flag,Card,Type,Description,resource
478,ChargeItem.account,account,Σ,0..*,Reference(Account),Account to place this charge,ChargeItem
1600,Encounter.account,account,,0..*,Reference(Account),The set of accounts that may be used for billi...,Encounter
1664,EpisodeOfCare.account,account,,0..*,Reference(Account),The set of accounts that may be used for billi...,EpisodeOfCare
2445,Invoice.account,account,,0..1,Reference(Account),Account that is being balanced,Invoice


## Train a Few Shot Text Classification using SetFit

In [10]:
from datasets import Dataset

# Create a dataset from the dataframe
dataset = Dataset.from_pandas(all_resources)
# Concatenate the Name and Description columns
dataset = dataset.map(lambda x: {'text': str(x['Name']) + ' ' + str(x['Description'])})
# Rename the resource column to label
dataset = dataset.rename_column('resource', 'label')
# Convert the label and text to lowercase
dataset = dataset.map(lambda x: {'label': x['label'].lower(), 'text': x['text'].lower()})
# Remove the columns ['Path', 'Name', 'Flag', 'Card', 'Type', 'Description']
dataset = dataset.remove_columns(['Path', 'Name', 'Flag', 'Card', 'Type', 'Description'])

  0%|          | 0/4926 [00:00<?, ?ex/s]

  0%|          | 0/4926 [00:00<?, ?ex/s]

In [11]:
# Show 5 rows of the dataset
dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 4926
})

In [12]:
# Split the dataset into train and test splits
train_test_dataset = dataset.train_test_split(test_size=0.2)
train_test_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 3940
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 986
    })
})

In [13]:
train_test_dataset['train']

Dataset({
    features: ['label', 'text'],
    num_rows: 3940
})

In [14]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [15]:
# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_test_dataset['train'],
    eval_dataset=train_test_dataset['test'],
    loss_class=CosineSimilarityLoss,
    metric=["accuracy","f1"],
    batch_size=4,
    num_iterations=8, # The number of text pairs to generate for contrastive learning
    num_epochs=1, # The number of epochs to use for constrastive learning
    column_mapping={"text": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
)

In [16]:
# Train and evaluate
trainer.train()
metrics = trainer.evaluate()

Applying column mapping to training dataset
***** Running training *****
  Num examples = 63040
  Num epochs = 1
  Total optimization steps = 15760
  Total train batch size = 4


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/15760 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset


AttributeError: 'list' object has no attribute 'replace'

In [18]:
trainer.evaluate()

Applying column mapping to evaluation dataset


AttributeError: 'list' object has no attribute 'replace'