# Data Preprocessing and Analysis

In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/171.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (1

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [3]:
df = pd.read_csv("/content/BEA 2024 Task Data Extended_new.csv")
df.head()

Unnamed: 0,ItemNum,ItemStem_Text,Answer__A,Answer__B,Answer__C,Answer__D,Answer__E,Answer__F,Answer__G,Answer__H,Answer__I,Answer__J,Answer_Key,Answer_Text,ItemType,EXAM,Difficulty,Response_Time
0,622,A 27-year-old man comes to the emergency depar...,Choristoma,Ciliary epithelium,Endothelial cells,Gastric mucosa,Striated muscle,,,,,,D,Gastric mucosa,Text,STEP 1,0.38,123.96
1,440,"A 39-year-old woman, gravida 2, para 2, comes ...",Future fertility plans,Hypertension,Obesity,Patient age,Stage of disease,,,,,,E,Stage of disease,Text,STEP 3,0.5,77.53
2,12,An 18-year-old primigravid woman comes for her...,Repeat measurement of MSAFP concentration,"Triple screening for MSAFP, serum ÃŽÂ²-hCG, an...",Ultrasonography,Amniocentesis for measurement of ÃŽÂ±-fetoprot...,Amniocentesis for chromosomal analysis,,,,,,C,Ultrasonography,Text,STEP 2,0.58,65.15
3,5,"A 26-year-old man, who is admitted to the hosp...",There is no evidence of a familial coagulation...,Half of their daughters will have a clinically...,The sons of their daughters will be at risk fo...,Their sons will be at risk for a clinically ev...,Their sons and daughters will be at risk for a...,,,,,,C,The sons of their daughters will be at risk fo...,Text,STEP 3,0.56,105.24
4,221,A 60-year-old man had a total thyroidectomy an...,Babinski sign present bilaterally,Chvostek sign,Deviation of the tongue to the left side,A drooping left shoulder,Hyporeflexia,,,,,,B,Chvostek sign,Text,STEP 3,0.22,57.02


In [4]:
df.fillna(0, inplace=True)
df.head(2)

Unnamed: 0,ItemNum,ItemStem_Text,Answer__A,Answer__B,Answer__C,Answer__D,Answer__E,Answer__F,Answer__G,Answer__H,Answer__I,Answer__J,Answer_Key,Answer_Text,ItemType,EXAM,Difficulty,Response_Time
0,622,A 27-year-old man comes to the emergency depar...,Choristoma,Ciliary epithelium,Endothelial cells,Gastric mucosa,Striated muscle,0,0,0,0,0,D,Gastric mucosa,Text,STEP 1,0.38,123.96
1,440,"A 39-year-old woman, gravida 2, para 2, comes ...",Future fertility plans,Hypertension,Obesity,Patient age,Stage of disease,0,0,0,0,0,E,Stage of disease,Text,STEP 3,0.5,77.53


In [5]:
cols = df.columns
cols

Index(['ItemNum', 'ItemStem_Text', 'Answer__A', 'Answer__B', 'Answer__C',
       'Answer__D', 'Answer__E', 'Answer__F', 'Answer__G', 'Answer__H',
       'Answer__I', 'Answer__J', 'Answer_Key', 'Answer_Text', 'ItemType',
       'EXAM', 'Difficulty', 'Response_Time'],
      dtype='object')

In [6]:
df.ItemType.value_counts()

ItemType
Text    595
PIX      72
Name: count, dtype: int64

In [7]:
df.EXAM.value_counts()

EXAM
STEP 1    273
STEP 2    220
STEP 3    174
Name: count, dtype: int64

In [8]:
X_cols = ['ItemNum', 'ItemStem_Text', 'Answer__A', 'Answer__B', 'Answer__C',
          'Answer__D', 'Answer__E', 'Answer__F', 'Answer__G', 'Answer__H',
          'Answer__I', 'Answer__J', 'Answer_Key', 'Answer_Text']
y_1 = df['Difficulty']
y_2 = df['Response_Time']

In [9]:
df[df['ItemNum'] == 285]

df[X_cols] = df[X_cols].astype('str')

# Dividing data into train and test set

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[X_cols], df.index, test_size=0.2, random_state=1)

y_train1, y_test1 = y_1[y_train], y_1[y_test]
y_train2, y_test2 = y_2[y_train], y_2[y_test]

X_train1, X_test1 = X_train, X_test
X_train2, X_test2 = X_train, X_test

# Vectorizing input

In [11]:
# reference: https://huggingface.co/pritamdeka/S-PubMedBert-MS-MARCO
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/461k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
def vectorizer(data):
    data.columns = data.columns.str.strip()
    data_vec = []
    for index, row in tqdm(data.iterrows(), total=data.shape[0]):
        ItemStem_Text_vec = model.encode(row['ItemStem_Text'])
        cnt = 0
        temp_vec = np.zeros(768)
        for option in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
            if row['Answer__' + option] != '0':
                cnt += 1
                try:
                    temp_vec += model.encode(row['Answer__' + option])
                except:
                    print(f"Error in {row['Answer__' + option]}")
                    cnt -= 1
        temp_vec /= cnt
        Answer_vec = model.encode(row['Answer_Text'])
        final_vec = ItemStem_Text_vec + temp_vec + Answer_vec
        data_vec.append(final_vec)
    return np.array(data_vec)

X_vec = vectorizer(df[X_cols])
X_vec.shape

100%|██████████| 667/667 [12:42<00:00,  1.14s/it]


(667, 768)

In [13]:
scaler = StandardScaler()
X_vec_scaled = scaler.fit_transform(X_vec)

# Gated recurrent units (GRUs) Model

In [24]:
def build_gru_model():
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(1, 768)),
        layers.GRU(256, activation='relu', return_sequences=True),  #
        layers.Dropout(0.5),  # Added dropout layer
        layers.GRU(128, activation='relu', return_sequences=True),
        layers.Dropout(0.5),  # Added another dropout layer
        layers.GRU(64, activation='relu', return_sequences=True),
        layers.GRU(32, activation='relu', return_sequences=False),
        layers.Dense(16, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

# Implement K-Fold Cross-Validation

In [25]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
y1_predict_full = np.zeros(len(X_vec_scaled))
y2_predict_full = np.zeros(len(X_vec_scaled))

In [26]:
import numpy as np

X_vec_scaled = np.random.rand(667, 768)

print("Original shape:", X_vec_scaled.shape)

total_elements = np.prod(X_vec_scaled.shape)

target_shape = (X_vec_scaled.shape[0], 1, X_vec_scaled.shape[1])

X_vec_scaled = X_vec_scaled.reshape(target_shape)

print("New shape:", X_vec_scaled.shape)


Original shape: (667, 768)
New shape: (667, 1, 768)


# GRU - Predicting Difficulty

In [28]:
rmse_scores_1 = []

for train_index, test_index in kfold.split(X_vec_scaled):
    X_train, X_test = X_vec_scaled[train_index], X_vec_scaled[test_index]
    y_train1, y_test1 = y_1[train_index], y_1[test_index]
    y_train2, y_test2 = y_2[train_index], y_2[test_index]

    gru_model_1 = build_gru_model()
    gru_model_1.fit(X_train, y_train1, epochs=100, verbose=1)

    y1_predict = gru_model_1.predict(X_test)
    y1_predict_full[test_index] = y1_predict.flatten()
    mse = mean_squared_error(y_test1, y1_predict)
    rmse = np.sqrt(mse)
    rmse_scores_1.append(rmse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [29]:
print("RMSE for Difficulty with K-Fold Cross-Validation:", np.mean(rmse_scores_1))
print('_________________________________________________________________')
gru_model_1.summary()

RMSE for Difficulty with K-Fold Cross-Validation: 0.35938053397054004
_________________________________________________________________
Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_50 (GRU)                (None, 1, 256)            787968    
                                                                 
 dropout_10 (Dropout)        (None, 1, 256)            0         
                                                                 
 gru_51 (GRU)                (None, 1, 128)            148224    
                                                                 
 dropout_11 (Dropout)        (None, 1, 128)            0         
                                                                 
 gru_52 (GRU)                (None, 1, 64)             37248     
                                                                 
 gru_53 (GRU)                (None, 32)          

# GRU - Predicting Response Time

In [30]:
rmse_scores_2 = []

for train_index, test_index in kfold.split(X_vec_scaled):
    X_train, X_test = X_vec_scaled[train_index], X_vec_scaled[test_index]
    y_train2, y_test2 = y_2[train_index], y_2[test_index]

    gru_model_2 = build_gru_model()
    gru_model_2.fit(X_train, y_train2, epochs=100, verbose=1)

    y2_predict = gru_model_2.predict(X_test)
    y2_predict_full[test_index] = y2_predict.flatten()
    mse = mean_squared_error(y_test2, y2_predict)
    rmse = np.sqrt(mse)
    rmse_scores_2.append(rmse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [31]:
print("RMSE for Response Time with K-Fold Cross-Validation:", np.mean(rmse_scores_2))
print('_________________________________________________________________')
gru_model_2.summary()

RMSE for Response Time with K-Fold Cross-Validation: 34.85644216265443
_________________________________________________________________
Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_70 (GRU)                (None, 1, 256)            787968    
                                                                 
 dropout_20 (Dropout)        (None, 1, 256)            0         
                                                                 
 gru_71 (GRU)                (None, 1, 128)            148224    
                                                                 
 dropout_21 (Dropout)        (None, 1, 128)            0         
                                                                 
 gru_72 (GRU)                (None, 1, 64)             37248     
                                                                 
 gru_73 (GRU)                (None, 32)         

# Add predicted values to dataframe

In [32]:
df['Predicted_Difficulty'] = y1_predict_full
df['Predicted_Response_Time'] = y2_predict_full

In [33]:
df

Unnamed: 0,ItemNum,ItemStem_Text,Answer__A,Answer__B,Answer__C,Answer__D,Answer__E,Answer__F,Answer__G,Answer__H,Answer__I,Answer__J,Answer_Key,Answer_Text,ItemType,EXAM,Difficulty,Response_Time,Predicted_Difficulty,Predicted_Response_Time
0,622,A 27-year-old man comes to the emergency depar...,Choristoma,Ciliary epithelium,Endothelial cells,Gastric mucosa,Striated muscle,0,0,0,0,0,D,Gastric mucosa,Text,STEP 1,0.38,123.96,0.661971,124.965218
1,440,"A 39-year-old woman, gravida 2, para 2, comes ...",Future fertility plans,Hypertension,Obesity,Patient age,Stage of disease,0,0,0,0,0,E,Stage of disease,Text,STEP 3,0.50,77.53,0.592130,80.350830
2,12,An 18-year-old primigravid woman comes for her...,Repeat measurement of MSAFP concentration,"Triple screening for MSAFP, serum ÃŽÂ²-hCG, an...",Ultrasonography,Amniocentesis for measurement of ÃŽÂ±-fetoprot...,Amniocentesis for chromosomal analysis,0,0,0,0,0,C,Ultrasonography,Text,STEP 2,0.58,65.15,0.483724,64.304619
3,5,"A 26-year-old man, who is admitted to the hosp...",There is no evidence of a familial coagulation...,Half of their daughters will have a clinically...,The sons of their daughters will be at risk fo...,Their sons will be at risk for a clinically ev...,Their sons and daughters will be at risk for a...,0,0,0,0,0,C,The sons of their daughters will be at risk fo...,Text,STEP 3,0.56,105.24,0.189600,58.340805
4,221,A 60-year-old man had a total thyroidectomy an...,Babinski sign present bilaterally,Chvostek sign,Deviation of the tongue to the left side,A drooping left shoulder,Hyporeflexia,0,0,0,0,0,B,Chvostek sign,Text,STEP 3,0.22,57.02,0.213452,86.225845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,612,A 45-year-old man is brought to the clinic by ...,Atrophy,Decreased visual acuity,Loss of proprioception,Presence of palmomental reflex,Ptosis,0,0,0,0,0,A,Atrophy,Text,STEP 1,0.22,95.17,0.595106,68.791489
663,315,A 64-year-old man with non-Hodgkin lymphoma co...,Bleomycin,Cyclophosphamide,Cytarabine,Doxorubicin,Fluorouracil,Methotrexate,Vincristine,0,0,0,G,Vincristine,Text,STEP 1,0.14,42.72,0.512005,83.412666
664,509,A 9-month-old boy is brought to the office by ...,Abdominal ultrasonography,Antiâ€“Saccharomyces cerevisiae antibody testing,Nitroblue tetrazolium testing,Rectal swab for group B streptococcus,Skeletal survey,Stool culture for Salmonella enteritidis,0,0,0,0,E,Skeletal survey,Text,STEP 2,0.94,122.73,0.350289,54.325935
665,550,A 32-year-old man comes to the office because ...,Collagen,Double-stranded DNA,Nucleolar protein,Phospholipid,Proteins in neutrophil cytoplasm,0,0,0,0,0,A,Collagen,Text,STEP 1,0.30,86.13,0.213742,56.468472


In [23]:
df.to_csv("Predicted_Difficulty_Response_time_GRU.csv")