In [15]:
# -*- coding: utf-8 -*-
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from collections import Counter
import matplotlib as mpl
from itertools import chain
import ast
from collections import Counter
import re

warnings.filterwarnings('ignore')
%matplotlib inline
np.random.seed(2023)

# Read recipe inputs
df_train = pd.read_csv('new_train.csv')
df_test = pd.read_csv('new_test.csv')

# Setting
mpl.rcParams['text.color'] = 'k'
mpl.rcParams['xtick.color'] = 'k'
mpl.rcParams['ytick.color'] = 'k'
mpl.rcParams['axes.labelcolor'] = 'k'

# Variables
label_txt_col = 'medical_specialty'
label_col = 'labels'
txt_col = 'transcription'
drop_cols = ['Unnamed: 0']
trans_wordbag = 'transcription_words'
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
label_to_medical_specialty_dict = dict(df_train[['labels', 'medical_specialty']].value_counts().index)

In [16]:
# Rename index columns
df_train['idx'] = df_train[drop_cols[0]]
df_test['idx'] = df_test[drop_cols[0]]
df_train = df_train.drop(columns = drop_cols, axis = 1)
df_test = df_test.drop(columns = drop_cols, axis = 1)


In [17]:
# Process transcription length & word count
df_train['transcription_len'] = df_train['transcription'].str.len()
df_train['transcription_word_count'] = df_train['transcription'].str.split().apply(len)
df_test['transcription_len'] = df_test['transcription'].str.len()
df_test['transcription_word_count'] = df_test['transcription'].str.split().apply(len)

df_train['processed_transcription'] = df_train['transcription'].str.lower()
df_test['processed_transcription'] = df_test['transcription'].str.lower()

df_train['processed_transcription'] = df_train['processed_transcription'].apply(lambda x : ' '.join([word for word in x.split() if not word in stopwords]))
df_test['processed_transcription'] = df_test['processed_transcription'].apply(lambda x : ' '.join([word for word in x.split() if not word in stopwords]))

df_train['processed_transcription'] = df_train['processed_transcription'].str.replace('[^a-z0-9 ]', ' ').str.replace('[ ]+', ' ').str.replace('\d+', 'N')
df_test['processed_transcription'] = df_test['processed_transcription'].str.replace('[^a-z0-9 ]', ' ').str.replace('[ ]+', ' ').str.replace('\d+', 'N')

df_train['processed_transcription_len'] = df_train['processed_transcription'].str.len()
df_train['processed_transcription_word_count'] = df_train['processed_transcription'].str.split().apply(len)
df_test['processed_transcription_len'] = df_test['processed_transcription'].str.len()
df_test['processed_transcription_word_count'] = df_test['processed_transcription'].str.split().apply(len)

In [18]:
df_train[trans_wordbag] = df_train['processed_transcription'].str.strip().apply(lambda x : x.split())
df_test[trans_wordbag] = df_test['processed_transcription'].str.strip().apply(lambda x : x.split())

In [19]:
BW_each_label = {}
for label in range(40):
    df_label = df_train[df_train['labels'] == label]
    _count = float(df_label.shape[0])
    lst_of_words = set(chain.from_iterable(list(df_label['transcription_words'])))
    lst_of_collections = [None] * len(lst_of_words)
    for i, word in enumerate(lst_of_words):
        lst_of_collections[i] = [word, round(100 * df_label['transcription_words'].apply(lambda x : word in x).sum() / _count, 2)]
    BW_each_label[label] = lst_of_collections
    
only_words_label = {}
for label in range(40):
    cur_set_words = set([x[0] for x in BW_each_label[label]])
    other_set_words = set([])
    for label2 in list(range(label)) + list(range(label + 1, 40)):
        label2_words = set([x[0] for x in BW_each_label[label2]])
        other_set_words.update(label2_words)
    _word_diff = cur_set_words - other_set_words
    only_words_label[label] = sorted([x for x in BW_each_label[label] if x[0] in _word_diff], key = lambda x : x[1], reverse = True)

In [20]:
label_unique_words_df = pd.DataFrame(data = {'label': range(40), 'percent_exist': [only_words_label[i] for i in range(40)]})

label_df = pd.DataFrame.from_dict(label_to_medical_specialty_dict, orient = 'index').reset_index()
label_df.columns = ['label', 'medical_specialty']
label_unique_words_df = pd.merge(label_unique_words_df, label_df, on = 'label', how = 'inner')
label_unique_words_df['words'] = label_unique_words_df['percent_exist'].apply(lambda x: [word for word, percent in x])

label_unique_words_df.to_csv('label_unique_words.csv')

In [21]:
# Gender Extraction
male_words = [' him ', ' his ', ' he ', ' hes ', ' male ', ' man ', ' gentleman ', ' sir ', ' mr ', ' mister ', ' guy ', ' boy ', ' penis ']
female_words = [' her ', ' she ', ' shes ', 'female', ' woman ', ' lady ', ' madam ', ' mrs ', ' miss ', ' ms ', ' girl ', 'pregnancy']

df_train['gender_male_count'] = df_train['processed_transcription'].str.count('|'.join(male_words))
df_train['gender_female_count'] = df_train['processed_transcription'].str.count('|'.join(female_words))
df_test['gender_male_count'] = df_test['processed_transcription'].str.count('|'.join(male_words))
df_test['gender_female_count'] = df_test['processed_transcription'].str.count('|'.join(female_words))

df_train['male'] = np.where(df_train['gender_male_count'] > df_train['gender_female_count'], 1, 0)
df_train['female'] = np.where(df_train['gender_male_count'] < df_train['gender_female_count'], 1, 0)
df_test['male'] = np.where(df_test['gender_male_count'] > df_test['gender_female_count'], 1, 0)
df_test['female'] = np.where(df_test['gender_male_count'] < df_test['gender_female_count'], 1, 0)

In [22]:
df_temp = df_train[(df_train['gender_male_count'] == 0) & (df_train['gender_female_count'] == 0)].reset_index()
df_aaaaaaa = df_test[(df_test['gender_male_count'] == 0) & (df_test['gender_female_count'] == 0)].reset_index()

print(df_temp.shape)
print(df_aaaaaaa.shape)

df_train

(1528, 15)
(390, 13)


Unnamed: 0,medical_specialty,transcription,labels,idx,transcription_len,transcription_word_count,processed_transcription,processed_transcription_len,processed_transcription_word_count,transcription_words,gender_male_count,gender_female_count,male,female
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0,0,2354,366,reason visit high pt inr history patient N yea...,1530,232,"[reason, visit, high, pt, inr, history, patien...",0,5,0,1
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1,1,2697,405,preoperative diagnosis acetabular fracture lef...,1989,244,"[preoperative, diagnosis, acetabular, fracture...",0,0,0,0
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1,2,5222,821,name procedure N selective coronary angiograph...,3834,536,"[name, procedure, N, selective, coronary, angi...",1,0,1,0
3,Radiology,"REFERRING DIAGNOSIS: , Motor neuron disease.,P...",2,3,3029,457,referring diagnosis motor neuron disease perti...,2279,287,"[referring, diagnosis, motor, neuron, disease,...",0,1,0,1
4,Emergency Room Reports,"CHIEF COMPLAINT: , Dental pain.,HISTORY OF PRE...",0,4,3391,534,chief complaint dental pain history present il...,2354,330,"[chief, complaint, dental, pain, history, pres...",0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3964,Neurology,"PROBLEMS AND ISSUES:,1. Headaches, nausea, an...",4,3995,4132,631,problems issues N headaches nausea dizziness c...,2922,386,"[problems, issues, N, headaches, nausea, dizzi...",0,6,0,1
3965,Surgery,"PREOPERATIVE DIAGNOSIS: , Anemia.,PROCEDURE:, ...",1,3996,932,136,preoperative diagnosis anemia procedure upper ...,688,89,"[preoperative, diagnosis, anemia, procedure, u...",0,0,0,0
3966,Surgery,"1. Odynophagia.,2. Dysphagia.,3. Gastroesop...",1,3997,1702,242,N odynophagia N dysphagia N gastroesophageal r...,1212,153,"[N, odynophagia, N, dysphagia, N, gastroesopha...",0,1,0,1
3967,Gastroenterology,The patient's abdomen was prepped and draped i...,5,3998,1235,183,patient s abdomen prepped draped usual sterile...,913,118,"[patient, s, abdomen, prepped, draped, usual, ...",0,0,0,0


In [23]:
# Columns with unique words in each label
unique_word_dict = dict(zip(label_unique_words_df['label'], label_unique_words_df['words']))

In [24]:
# Unique words count assign
for label in range(40):
    df_train['word_count_unique_in_' + str(label)] = df_train['processed_transcription'].str.count(' | '.join(unique_word_dict[label]))
    df_test['word_count_unique_in_' + str(label)] = df_test['processed_transcription'].str.count(' | '.join(unique_word_dict[label]))

In [25]:
def choose_one_age(age_lst):
    one_dim_list = [item for tuple in age_lst for item in tuple if item != '']
    if len(set(one_dim_list)) == 1:
        return int(one_dim_list[0])
    return None

In [26]:
# Age Extraction
age_regex = ['(\d{1,2})-year-old', '(\d{1,2}) y/o', '(\d{1,2})y/o', '(\d{1,2}) years old']

df_train['age_extractions'] = df_train['transcription'].str.findall('|'.join(age_regex))
df_test['age_extractions'] = df_test['transcription'].str.findall('|'.join(age_regex))

df_train['age'] = df_train['age_extractions'].apply(lambda x : choose_one_age(x))
df_test['age'] = df_test['age_extractions'].apply(lambda x : choose_one_age(x))

df_train['age'] = df_train['age'].fillna(0)
df_test['age'] = df_test['age'].fillna(0)

# less than 1 year old case:
young_age_regex =  ['(\d{1,3})-day-old']
baby_words = ['premature baby']

df_train['young_age_extractions'] = df_train['transcription'].str.findall('|'.join(young_age_regex))
df_test['young_age_extractions'] = df_test['transcription'].str.findall('|'.join(young_age_regex))

df_train['younger_than_1'] = np.where(df_train['young_age_extractions'].apply(lambda x : len(x)) > 0, 1, 0)
df_test['younger_than_1'] = np.where(df_test['young_age_extractions'].apply(lambda x : len(x)) > 0, 1, 0)

df_train['younger_than_1'] = np.where(df_train['transcription'].str.count('|'.join(baby_words)) > 0, 1, df_train['younger_than_1'])
df_test['younger_than_1'] = np.where(df_test['transcription'].str.count('|'.join(baby_words)) > 0, 1, df_test['younger_than_1'])

In [27]:
df_temp = df_train[(df_train['age'].isnull()) & (df_train['younger_than_1'] == 0)].reset_index()
df_temp_2 = df_test[(df_test['age'].isnull()) & (df_test['younger_than_1'] == 0)].reset_index()

print(df_temp.shape)
print(df_temp_2.shape)

(0, 59)
(0, 57)


In [28]:
# Number of History & Medical records count
history_regex = [', *(\d+)\.[^\d]']

df_train['count_listing_nums'] = df_train['transcription'].str.findall('|'.join(history_regex)).apply(lambda x : len(x))
df_test['count_listing_nums'] = df_test['transcription'].str.findall('|'.join(history_regex)).apply(lambda x : len(x))

In [30]:
drop_cols = ['gender_male_count', 'gender_female_count', 'age_extractions', 'young_age_extractions', 'word_count_unique_in_31', 'word_count_unique_in_36']
df_train = df_train.drop(columns = drop_cols, axis = 1)
df_test = df_test.drop(columns = drop_cols, axis = 1)

df_train.to_csv('df_train_processed.csv')
df_test.to_csv('df_test_processed.csv')