In [3]:
import pandas as pd
from google.colab import drive
import re
import torch
import warnings
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [4]:
# Attach Google Drive to the Colab filesystem; the CSV read below lives under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [5]:
# Load the MTSamples CSV from the mounted Drive folder and preview the first rows.
MTSAMPLES_CSV_PATH = '/content/drive/MyDrive/Augnito/mtsamples.csv'
df = pd.read_csv(MTSAMPLES_CSV_PATH)
print(df.head(5))


   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   
3           3                             2-D M-Mode. Doppler.     
4           4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL 

In [6]:
# Report the size of the raw frame as a (rows, columns) tuple.
row_count, column_count = df.shape
print((row_count, column_count))

(4999, 6)


In [7]:
# Keep only the two text columns that the rest of the notebook works with.
wanted_columns = ["transcription", "keywords"]
df = df.loc[:, wanted_columns]
df.head()

Unnamed: 0,transcription,keywords
0,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [8]:
# Discard every row where either text column is missing, then show the new size.
required_columns = ['transcription', 'keywords']
df = df.dropna(subset=required_columns)
df.shape

(3898, 2)

In [9]:
# Flag rows whose (transcription, keywords) pair already appeared earlier in the
# frame and keep only the first occurrence of each pair, then show the new size.
is_repeat = df.duplicated(subset=['transcription', 'keywords'])
df = df[~is_repeat]
df.shape


(3852, 2)

In [10]:
def clean_text(text):
    """Lowercase ``text`` and delete every character that is not a-z, 0-9 or whitespace.

    Characters are removed, not replaced with a space, so hyphenated words are
    joined together (e.g. ``"23-year-old"`` becomes ``"23yearold"``).
    """
    lowered = text.lower()
    disallowed = re.compile(r'[^a-z0-9\s]')
    return disallowed.sub('', lowered)

In [11]:
# Normalise both text columns with clean_text, one column at a time.
# Series.map is used instead of DataFrame.applymap, which is deprecated since
# pandas 2.1; the per-column form gives the same result on old and new pandas.
for text_column in ('transcription', 'keywords'):
    df[text_column] = df[text_column].map(clean_text)
print(df.head()) # print cleaned data

                                       transcription  \
0  subjective  this 23yearold white female presen...   
1  past medical history he has difficulty climbin...   
2  history of present illness  i have seen abc to...   
3  2d mmode  1  left atrial enlargement with left...   
4  1  the left ventricular cavity size and wall t...   

                                            keywords  
0  allergy  immunology allergic rhinitis allergie...  
1  bariatrics laparoscopic gastric bypass weight ...  
2  bariatrics laparoscopic gastric bypass heart a...  
3  cardiovascular  pulmonary 2d mmode doppler aor...  
4  cardiovascular  pulmonary 2d doppler echocardi...  


In [12]:
!pip install sentence-transformers transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m51.2/86.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_trans

In [13]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
import torch

In [14]:
# Load the pretrained encoder and its tokenizer from the same checkpoint name
# so the two always match.
checkpoint_name = 'bert-base-uncased'
model = AutoModel.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
# Move the model to the GPU when one is present; otherwise leave it where it is.
gpu_present = torch.cuda.is_available()
if gpu_present:
    model = model.to('cuda')

In [16]:
class CustomDataset(Dataset):
    """Map-style dataset yielding tokenised (transcript, keywords) pairs from a DataFrame.

    Args:
        df: DataFrame holding the text columns. Its index may carry any labels
            (e.g. after filtering or a shuffled train/test split); rows are
            addressed by position, never by label.
        transcript: Name of the column containing the transcript text.
        keywords: Name of the column containing the keywords text.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')``
            that returns a mapping with an ``'input_ids'`` entry.
    """

    def __init__(self, df, transcript, keywords, tokenizer):
        self.df = df
        self.transcript = transcript
        self.keywords = keywords
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(transcript_tokens, keywords_tokens)`` for the row at position ``idx``.

        Each element is the first entry of the tokenizer's ``'input_ids'`` output,
        moved to the GPU when CUDA is available.

        Raises:
            IndexError: If ``idx`` is outside ``range(-len(self), len(self))``.
        """
        # Samplers hand out positions 0..len-1. The frame's index labels are
        # arbitrary once rows have been dropped or shuffled, so a label lookup
        # (.loc) would raise KeyError or return the wrong row; .iloc is positional.
        transcript_text = self.df[self.transcript].iloc[idx]
        keywords_text = self.df[self.keywords].iloc[idx]

        # padding='max_length' + truncation=True: presumably every sample comes
        # back with the same length so the default collate can stack them
        # (depends on the tokenizer defining a maximum length - TODO confirm).
        encoding_transcript = self.tokenizer(transcript_text, padding='max_length', truncation=True, return_tensors='pt')
        encoding_keywords = self.tokenizer(keywords_text, padding='max_length', truncation=True, return_tensors='pt')

        # return_tensors='pt' output is indexed with [0] to drop the leading
        # (batch) dimension of the single encoded text.
        transcript_tokens = encoding_transcript['input_ids'][0]
        keywords_tokens = encoding_keywords['input_ids'][0]

        # NOTE(review): the device transfer happens per sample inside the dataset.
        # This is fine with the default num_workers=0, but is likely to fail if the
        # DataLoader is given forked worker processes - confirm before raising it.
        if torch.cuda.is_available():
            transcript_tokens = transcript_tokens.to('cuda')
            keywords_tokens = keywords_tokens.to('cuda')

        return transcript_tokens, keywords_tokens


In [17]:
from sklearn.model_selection import train_test_split

# Number of samples per batch for the data loaders.
batch_size = 5

# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
df_train, df_test = train_test_split(
    df,
    train_size=0.9,
    random_state=32,
)

# Wrap each split so it yields tokenised (transcription, keywords) pairs.
train_data = CustomDataset(
    df_train,
    transcript='transcription',
    keywords='keywords',
    tokenizer=tokenizer,
)
test_data = CustomDataset(
    df_test,
    transcript='transcription',
    keywords='keywords',
    tokenizer=tokenizer,
)

In [18]:
# Batch both splits; each loader reshuffles its dataset on every pass.
train_data_loader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)
test_data_loader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=True,
)