### Reading Data

In [1]:
import pandas as pd

train_df = pd.read_csv("new_train.csv", index_col=0)
test_df = pd.read_csv("new_test.csv", index_col=0)

print("Train size", len(train_df))
print("Test size", len(test_df))
train_df.head(n=3)

Train size 3969
Test size 997


Unnamed: 0,medical_specialty,transcription,labels
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1


### Train Set Label Distribution

In [2]:
train_df["medical_specialty"].value_counts()

 Surgery                          863
 Consult - History and Phy.       410
 Cardiovascular / Pulmonary       309
 Orthopedic                       289
 Radiology                        213
 General Medicine                 209
 Gastroenterology                 176
 Neurology                        170
 SOAP / Chart / Progress Notes    135
 Urology                          134
 Obstetrics / Gynecology          123
 Discharge Summary                 87
 ENT - Otolaryngology              82
 Neurosurgery                      71
 Hematology - Oncology             68
 Ophthalmology                     67
 Emergency Room Reports            63
 Nephrology                        63
 Pediatrics - Neonatal             55
 Pain Management                   54
 Psychiatry / Psychology           45
 Office Notes                      38
 Podiatry                          35
 Dermatology                       21
 Dentistry                         21
 Cosmetic / Plastic Surgery        19
 Letters    

### Sample Transcription

In [3]:
from pprint import pprint
pprint(train_df.transcription[0])

('REASON FOR THE VISIT:,  Very high PT/INR.,HISTORY: , The patient is an '
 '81-year-old lady whom I met last month when she came in with pneumonia and '
 'CHF.  She was noticed to be in atrial fibrillation, which is a chronic '
 'problem for her.  She did not want to have Coumadin started because she said '
 'that she has had it before and the INR has had been very difficult to '
 'regulate to the point that it was dangerous, but I convinced her to restart '
 'the Coumadin again.  I gave her the Coumadin as an outpatient and then the '
 'INR was found to be 12.  So, I told her to come to the emergency room to get '
 'vitamin K to reverse the anticoagulation.,PAST MEDICAL HISTORY:,1.  '
 'Congestive heart failure.,2.  Renal insufficiency.,3.  Coronary artery '
 'disease.,4.  Atrial fibrillation.,5.  COPD.,6.  Recent pneumonia.,7.  '
 'Bladder cancer.,8.  History of ruptured colon.,9.  Myocardial '
 'infarction.,10.  Hernia repair.,11.  Colon resection.,12.  Carpal tunnel '
 'repair.,13

In [4]:
from platform import python_version

print(python_version())

3.9.13


### Sample Training

In [5]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
from torch import nn
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [6]:
unique_classes = train_df["medical_specialty"].unique()

idx_2_class = {i: s for i, s in enumerate(unique_classes)}
class_2_idx = {s: i for i, s in enumerate(unique_classes)}

In [7]:
train_df["labels"] = train_df["medical_specialty"].apply(lambda s: class_2_idx[s])

In [8]:
train_train_df, train_test_df = \
    train_test_split(
    train_df,
    test_size=0.3,
    random_state=42
)

In [9]:
ds_dict = {
    'train': Dataset.from_pandas(train_train_df),
    'val': Dataset.from_pandas(train_test_df),
    "test": Dataset.from_pandas(test_df)
}

ds = DatasetDict(ds_dict)

In [10]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_text(texts):
    return tokenizer(texts["transcription"], truncation=True, padding=True, max_length=256)

ds["train"] = ds["train"].map(tokenize_text, batched=True)
ds["val"] = ds["val"].map(tokenize_text, batched=True)
ds["test"] = ds["test"].map(tokenize_text, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_classes)
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

### Evaluation Metric

In [12]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="macro")
    return {"f1": f1}

In [13]:
batch_size = 32
logging_steps = len(train_train_df) // batch_size
output_dir = "hf_trainer"

training_args = TrainingArguments(
    output_dir=output_dir,
     num_train_epochs=5,
     learning_rate=2e-5,
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
     evaluation_strategy="epoch",
     logging_steps=logging_steps,
     push_to_hub=False
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    tokenizer=tokenizer
)

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: medical_specialty, transcription, __index_level_0__. If medical_specialty, transcription, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2778
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 435
  Number of trainable parameters = 66984232
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

### Making Inference on the Test Set

In [None]:
ds["test"]

In [None]:
pred_y = trainer.predict(ds["test"])

In [None]:
a = pd.Series(pred_y.predictions.argmax(axis=1))
a.name = "Expected"
a.to_csv("predictions.csv")

### Normalize Dataset

In [None]:
import pandas as pd
# from pathlib import Path

In [None]:
train_df = pd.read_csv(
    'new_train.csv',
    encoding='utf-8', 
    engine='python'
)

In [None]:
train_df.head()


In [None]:
# Remove special characters (non alphabetical char)
import re
# mystring = "      Ballenger       Center            Dr # 351     "
# mystring = re.sub('[^A-Za-z0-9]+ ', ' ', mystring).strip()
# mystring
# df['a'] = df['a'].apply(lambda x: x + 1)
# bank_accounts['Address Line 1'] = bank_accounts['Address Line 1'].apply(lambda x: re.sub('[^A-Za-z0-9]+ ', ' ', x).strip())
# bank_accounts.head()
train_df['transcription'] = train_df['transcription'].apply(lambda x: re.sub('[^A-Za-z]+ ', ' ', x).strip())
# train_df['transcription'] = train_df['transcription'].apply(lambda x: re.sub('the ', '', x).strip())
# train_df['transcription'] = train_df['transcription'].apply(lambda x: re.sub('and ', '', x).strip())
# train_df['transcription'] = train_df['transcription'].apply(lambda x: re.sub('is ', '', x).strip())
train_df.head()

In [None]:
from spellchecker import SpellChecker
# load in the spell checker library
spell = SpellChecker()

# function to check the spelling of a given text
def check_spelling(text):
    # convert the text to lowercase for comparison
    text = text.lower()

    # this is where we will store all the words with the wrong words corrected
    corrected_text = []

    # correct each word in the text and add it to corrected_text
    for word in text.split():
        # spell.correction corrects the spelling of a given word by comparing it to a dictionary of correctly spelt words
        corrected_word = spell.correction(word)
        corrected_text.append(corrected_word)
    
    # join together the words in the corrected_text array into a singular string and seperate each word with a space
    text = " ".join(corrected_text)

    return text

In [19]:
# Remove stopwords & Tokenization
# https://levelup.gitconnected.com/how-to-remove-stopwords-from-text-in-python-9e9fbfcbca8d
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# stopwords.words('english')

# # my_txt = "I'm George. I live in Athens! This is my blog, hopefully you enjoy this post! Look at this!"
# filtered_list = []
# stop_words = nltk.corpus.stopwords.words('english')
# # Tokenize the sentence
# # words = word_tokenize(my_txt)
# words = word_tokenize(train_df['transcription'])
# for w in words:
#     if w.lower() not in stop_words:
#         filtered_list.append(w)
        
# filtered_list
# my_clean_txt = " ".join(filtered_list)
# my_clean_txt


# stop_words = set(stopwords.words('english')) 
# file1 = train_df['transcription']
  
# # Use this to read file content as a stream: 
# # line = file1.read()
# words = line.split() 
# for r in words: 
#     if not r in stop_words: 
#         appendFile = open('filteredtext.txt','a') 
#         appendFile.write(" "+r) 
#         appendFile.close() 


import io 
import codecs
import csv
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english')) 
file1 = codecs.open('new_train.csv','r','utf-8') 
line = file1.read() 
words = line.split()
print("poop")
appendFile = open('stopwords_train.csv','a', encoding='utf-8') 
for r in words: 
    if not r in stop_words: 
        appendFile.write(r)
    appendFile.write("\n")
appendFile.close()

poop


In [None]:
# Tokenization
# import csv,numpy 
# from nltk import sent_tokenize, word_tokenize, pos_tag
# reader = csv.reader(open('Medium_Edited.csv', 'rU'), delimiter= ",",quotechar='|')
# tokenData = word_tokenize(reader)

# with open("Medium_Edited.csv", "rU") as csvfile:
# tokenData = nltk.word_tokenize(str(reader))

# # for line in reader:
# #     for field in line:
# #         tokens = word_tokenize(field)
