# Install prerequisites

In [None]:
! pip install datasets
! pip install huggingface_hub
! pip install -U sentence-transformers

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:

In [None]:
! huggingface-cli login --token YOUR_HF_TOKEN --add-to-git-credential
from datasets import load_dataset
import pandas as pd

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Load and filter data

In [None]:
# Fetch the DataFog medical-transcription dataset; the result is shown in the next cell.
dataset = load_dataset("DataFog/medical-transcription-instruct")

README.md:   0%|          | 0.00/3.16k [00:00<?, ?B/s]

(…)tafog-medical-transcription-instruct.csv:   0%|          | 0.00/138M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38924 [00:00<?, ? examples/s]

In [None]:
# Display the dataset summary: available splits, feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'task_output', 'transcription', 'description', 'medical_specialty', 'sample_name', 'keywords', 'derived_keywords', 'transcription_length', 'normalized_length', 'complexity_score'],
        num_rows: 38924
    })
})

In [None]:
# Drop the instruction-tuning columns, which the visible cells below do not use.
# Filtering against the current column names keeps this cell re-runnable:
# remove_columns raises a ValueError when asked to drop a column that is already gone.
columns_to_drop = [
    name for name in ('instruction', 'task_output')
    if name in dataset['train'].column_names
]
if columns_to_drop:
    dataset['train'] = dataset['train'].remove_columns(columns_to_drop)


In [None]:
# Convert the train split to a pandas DataFrame for filtering and inspection.
# Dataset.to_pandas() converts the underlying Arrow table directly instead of
# iterating over the rows one at a time in Python.
df = dataset['train'].to_pandas()

In [None]:
# Whitespace-delimited word count per transcription (a rough proxy, not model tokens).
# NOTE(review): the statistics recorded for this column further down (max 76 words)
# look small for full transcriptions — confirm whether `description` was intended.
def _count_words(text):
    return len(text.split())

df['num_tokens'] = df['transcription'].map(_count_words)

In [None]:
# Keep only the first row seen for each distinct transcription text.
# NOTE(review): a later cell rebuilds df_cleaned from df, so this result only feeds
# the row count displayed in the next cell.
is_repeat_transcription = df.duplicated(subset=['transcription'])
df_cleaned = df[~is_repeat_transcription]

In [None]:
# Number of rows left after de-duplicating on the transcription text.
len(df_cleaned)

2358

In [None]:
# Keep only the first row seen for each distinct sample_name.
# NOTE(review): this starts again from df rather than from the transcription-deduplicated
# frame, so the earlier de-duplication is not carried forward — confirm that is intended.
is_repeat_sample = df.duplicated(subset=['sample_name'])
df_cleaned = df[~is_repeat_sample]

In [None]:
# Summarize the specialty labels: row count, number of distinct values, most frequent value.
df_cleaned.medical_specialty.describe()

Unnamed: 0,medical_specialty
count,2377
unique,39
top,Surgery
freq,989


In [None]:
# Keep rows whose description is at least 50 characters long; rows with a missing
# description compare False and are dropped as well.
description_chars = df_cleaned['description'].str.len()
train_df = df_cleaned.loc[description_chars >= 50]


In [None]:
# Number of rows remaining after the description-length filter.
len(train_df)

2034

In [None]:
# Distribution of the whitespace word counts (num_tokens) across the filtered rows.
train_df.num_tokens.describe()

Unnamed: 0,num_tokens
count,2034.0
mean,20.253196
std,11.413323
min,4.0
25%,11.0
50%,17.0
75%,27.0
max,76.0


# Data split

In [None]:
from sklearn.model_selection import train_test_split

# Features are the free-text descriptions; targets are the specialty labels.
X, y = train_df['description'], train_df['medical_specialty']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Embed with the model

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding checkpoint used to encode descriptions.
EMBEDDING_MODEL_ID = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_ID)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Encoding options shared by both splits so they are embedded identically.
encode_options = {"batch_size": 64, "show_progress_bar": True}

print("Generating embeddings for training data...")
X_train_embeddings = model.encode(X_train.tolist(), **encode_options)

print("Generating embeddings for test data...")
X_test_embeddings = model.encode(X_test.tolist(), **encode_options)


Generating embeddings for training data...


Batches:   0%|          | 0/51 [00:00<?, ?it/s]

Generating embeddings for test data...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

# Train logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a logistic-regression classifier on the training embeddings; the fixed seed
# keeps the fit repeatable.
clf = LogisticRegression(random_state=42, max_iter=100)
clf.fit(X_train_embeddings, y_train)

# Predict specialties for the held-out embeddings.
y_pred = clf.predict(X_test_embeddings)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Some specialties are never predicted, so their precision is undefined.
# zero_division=0 reports those scores as 0.00 (the same values as before) without
# emitting an UndefinedMetricWarning for every affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.6413
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.00      0.00      0.00         3
    Consult - History and Phy.       0.67      0.25      0.36        16
                   Dermatology       0.00      0.00      0.00         1
             Discharge Summary       0.00      0.00      0.00         4
        Emergency Room Reports       0.00      0.00      0.00         2
              Gastroenterology       0.00      0.00      0.00         3
              General Medicine       0.30      0.43      0.36        30
         Hematology - Oncology       0.00      0.00      0.00         5
     Hospice - Palliative Care       0.00      0.00      0.00         1
        IME-QME-Work Comp etc.       0.00      0.00      0.00         3
                       Letters       0.00      0.00      0.00         1
                    Nephrology       1.00      0.25      0.40         4
                     Neurology       0.60     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
