# Data Preprocessing and Analysis

In [30]:
!pip install sentence-transformers



In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [32]:
df = pd.read_csv("/content/BEA 2024 Task Data Extended_new.csv")
df.head()

Unnamed: 0,ItemNum,ItemStem_Text,Answer__A,Answer__B,Answer__C,Answer__D,Answer__E,Answer__F,Answer__G,Answer__H,Answer__I,Answer__J,Answer_Key,Answer_Text,ItemType,EXAM,Difficulty,Response_Time
0,622,A 27-year-old man comes to the emergency depar...,Choristoma,Ciliary epithelium,Endothelial cells,Gastric mucosa,Striated muscle,,,,,,D,Gastric mucosa,Text,STEP 1,0.38,123.96
1,440,"A 39-year-old woman, gravida 2, para 2, comes ...",Future fertility plans,Hypertension,Obesity,Patient age,Stage of disease,,,,,,E,Stage of disease,Text,STEP 3,0.5,77.53
2,12,An 18-year-old primigravid woman comes for her...,Repeat measurement of MSAFP concentration,"Triple screening for MSAFP, serum ÃŽÂ²-hCG, an...",Ultrasonography,Amniocentesis for measurement of ÃŽÂ±-fetoprot...,Amniocentesis for chromosomal analysis,,,,,,C,Ultrasonography,Text,STEP 2,0.58,65.15
3,5,"A 26-year-old man, who is admitted to the hosp...",There is no evidence of a familial coagulation...,Half of their daughters will have a clinically...,The sons of their daughters will be at risk fo...,Their sons will be at risk for a clinically ev...,Their sons and daughters will be at risk for a...,,,,,,C,The sons of their daughters will be at risk fo...,Text,STEP 3,0.56,105.24
4,221,A 60-year-old man had a total thyroidectomy an...,Babinski sign present bilaterally,Chvostek sign,Deviation of the tongue to the left side,A drooping left shoulder,Hyporeflexia,,,,,,B,Chvostek sign,Text,STEP 3,0.22,57.02


In [33]:
df.fillna(0, inplace=True)
df.head(2)


Unnamed: 0,ItemNum,ItemStem_Text,Answer__A,Answer__B,Answer__C,Answer__D,Answer__E,Answer__F,Answer__G,Answer__H,Answer__I,Answer__J,Answer_Key,Answer_Text,ItemType,EXAM,Difficulty,Response_Time
0,622,A 27-year-old man comes to the emergency depar...,Choristoma,Ciliary epithelium,Endothelial cells,Gastric mucosa,Striated muscle,0,0,0,0,0,D,Gastric mucosa,Text,STEP 1,0.38,123.96
1,440,"A 39-year-old woman, gravida 2, para 2, comes ...",Future fertility plans,Hypertension,Obesity,Patient age,Stage of disease,0,0,0,0,0,E,Stage of disease,Text,STEP 3,0.5,77.53


In [34]:
cols = df.columns
cols


Index(['ItemNum', 'ItemStem_Text', 'Answer__A', 'Answer__B', 'Answer__C',
       'Answer__D', 'Answer__E', 'Answer__F', 'Answer__G', 'Answer__H',
       'Answer__I', 'Answer__J', 'Answer_Key', 'Answer_Text', 'ItemType',
       'EXAM', 'Difficulty', 'Response_Time'],
      dtype='object')

In [35]:
df.ItemType.value_counts()

ItemType
Text    595
PIX      72
Name: count, dtype: int64

In [36]:
df.EXAM.value_counts()

EXAM
STEP 1    273
STEP 2    220
STEP 3    174
Name: count, dtype: int64

In [37]:
X_cols = ['ItemNum', 'ItemStem_Text', 'Answer__A', 'Answer__B', 'Answer__C',
          'Answer__D', 'Answer__E', 'Answer__F', 'Answer__G', 'Answer__H',
          'Answer__I', 'Answer__J', 'Answer_Key', 'Answer_Text']
y_1 = df['Difficulty']
y_2 = df['Response_Time']


In [38]:
df[df['ItemNum'] == 285]

df[X_cols] = df[X_cols].astype('str')

# Dividing data into train and test set

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[X_cols], df.index, test_size=0.2, random_state=1)

y_train1, y_test1 = y_1[y_train], y_1[y_test]
y_train2, y_test2 = y_2[y_train], y_2[y_test]

X_train1, X_test1 = X_train, X_test
X_train2, X_test2 = X_train, X_test

#  Vectorizing input

In [40]:
# reference: https://huggingface.co/pritamdeka/S-PubMedBert-MS-MARCO
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')



In [41]:
def vectorizer(data):
    data.columns = data.columns.str.strip()
    data_vec = []
    for index, row in tqdm(data.iterrows(), total=data.shape[0]):
        ItemStem_Text_vec = model.encode(row['ItemStem_Text'])
        cnt = 0
        temp_vec = np.zeros(768)
        for option in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
            if row['Answer__' + option] != '0':
                cnt += 1
                try:
                    temp_vec += model.encode(row['Answer__' + option])
                except:
                    print(f"Error in {row['Answer__' + option]}")
                    cnt -= 1
        temp_vec /= cnt
        Answer_vec = model.encode(row['Answer_Text'])
        final_vec = ItemStem_Text_vec + temp_vec + Answer_vec
        data_vec.append(final_vec)
    return np.array(data_vec)

X_vec = vectorizer(df[X_cols])
X_vec.shape

100%|██████████| 667/667 [13:31<00:00,  1.22s/it]


(667, 768)

In [42]:
scaler = StandardScaler()
X_vec_scaled = scaler.fit_transform(X_vec)

In [43]:
# LSTM expects the input data to be of shape (samples, timesteps, features)
# Since our data is not a sequence, we reshape it to (samples, 1, features)
X_vec_scaled = X_vec_scaled.reshape((X_vec_scaled.shape[0], 1, X_vec_scaled.shape[1]))

# LSTM Model

In [44]:
def build_lstm_model():
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(1, 768)),
        layers.LSTM(256, activation='relu', return_sequences=True),
        layers.Dropout(0.2),  # Adding dropout regularization
        layers.LSTM(128, activation='relu', return_sequences=True),
        layers.Dropout(0.2),  # Adding dropout regularization
        layers.LSTM(64, activation='relu', return_sequences=False),
        layers.Dense(1)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model


# Implement K-Fold Cross-Validation

In [45]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
y1_predict_full = np.zeros(len(X_vec_scaled))
y2_predict_full = np.zeros(len(X_vec_scaled))
rmse_scores_1 = []

# LSTM Model - Predicting Difficulty

In [46]:
for train_index, test_index in kfold.split(X_vec_scaled):
    X_train, X_test = X_vec_scaled[train_index], X_vec_scaled[test_index]
    y_train1, y_test1 = y_1[train_index], y_1[test_index]
    y_train2, y_test2 = y_2[train_index], y_2[test_index]

    lstm_model_1 = build_lstm_model()
    lstm_model_1.fit(X_train, y_train1, epochs=100, verbose=1)

    y1_predict = lstm_model_1.predict(X_test)
    y1_predict_full[test_index] = y1_predict.flatten()
    mse = mean_squared_error(y_test1, y1_predict)
    rmse = np.sqrt(mse)
    rmse_scores_1.append(rmse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [47]:
print("RMSE for Difficulty with K-Fold Cross-Validation:", np.mean(rmse_scores_1))
print('_________________________________________________________________')
lstm_model_1.summary()

RMSE for Difficulty with K-Fold Cross-Validation: 0.33671605045449876
_________________________________________________________________
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_12 (LSTM)              (None, 1, 256)            1049600   
                                                                 
 dropout_8 (Dropout)         (None, 1, 256)            0         
                                                                 
 lstm_13 (LSTM)              (None, 1, 128)            197120    
                                                                 
 dropout_9 (Dropout)         (None, 1, 128)            0         
                                                                 
 lstm_14 (LSTM)              (None, 64)                49408     
                                                                 
 dense_4 (Dense)             (None, 1)            

# LSTM Model - Predicting Response Time

In [48]:
rmse_scores_2 = []

for train_index, test_index in kfold.split(X_vec_scaled):
    X_train, X_test = X_vec_scaled[train_index], X_vec_scaled[test_index]
    y_train2, y_test2 = y_2[train_index], y_2[test_index]

    lstm_model_2 = build_lstm_model()
    lstm_model_2.fit(X_train, y_train2, epochs=100, verbose=1)

    y2_predict = lstm_model_2.predict(X_test)
    y2_predict_full[test_index] = y2_predict.flatten()
    mse = mean_squared_error(y_test2, y2_predict)
    rmse = np.sqrt(mse)
    rmse_scores_2.append(rmse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [49]:
print("RMSE for Response Time with K-Fold Cross-Validation:", np.mean(rmse_scores_2))
print('_________________________________________________________________')
lstm_model_2.summary()

RMSE for Response Time with K-Fold Cross-Validation: 32.76319252359342
_________________________________________________________________
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_27 (LSTM)              (None, 1, 256)            1049600   
                                                                 
 dropout_18 (Dropout)        (None, 1, 256)            0         
                                                                 
 lstm_28 (LSTM)              (None, 1, 128)            197120    
                                                                 
 dropout_19 (Dropout)        (None, 1, 128)            0         
                                                                 
 lstm_29 (LSTM)              (None, 64)                49408     
                                                                 
 dense_9 (Dense)             (None, 1)           

# Add predicted values to dataframe

In [50]:
df['Predicted_Difficulty'] = y1_predict_full
df['Predicted_Response_Time'] = y2_predict_full

In [51]:
df

Unnamed: 0,ItemNum,ItemStem_Text,Answer__A,Answer__B,Answer__C,Answer__D,Answer__E,Answer__F,Answer__G,Answer__H,Answer__I,Answer__J,Answer_Key,Answer_Text,ItemType,EXAM,Difficulty,Response_Time,Predicted_Difficulty,Predicted_Response_Time
0,622,A 27-year-old man comes to the emergency depar...,Choristoma,Ciliary epithelium,Endothelial cells,Gastric mucosa,Striated muscle,0,0,0,0,0,D,Gastric mucosa,Text,STEP 1,0.38,123.96,0.606428,77.637856
1,440,"A 39-year-old woman, gravida 2, para 2, comes ...",Future fertility plans,Hypertension,Obesity,Patient age,Stage of disease,0,0,0,0,0,E,Stage of disease,Text,STEP 3,0.50,77.53,0.583111,116.979691
2,12,An 18-year-old primigravid woman comes for her...,Repeat measurement of MSAFP concentration,"Triple screening for MSAFP, serum ÃŽÂ²-hCG, an...",Ultrasonography,Amniocentesis for measurement of ÃŽÂ±-fetoprot...,Amniocentesis for chromosomal analysis,0,0,0,0,0,C,Ultrasonography,Text,STEP 2,0.58,65.15,0.582178,57.811470
3,5,"A 26-year-old man, who is admitted to the hosp...",There is no evidence of a familial coagulation...,Half of their daughters will have a clinically...,The sons of their daughters will be at risk fo...,Their sons will be at risk for a clinically ev...,Their sons and daughters will be at risk for a...,0,0,0,0,0,C,The sons of their daughters will be at risk fo...,Text,STEP 3,0.56,105.24,0.658254,99.390640
4,221,A 60-year-old man had a total thyroidectomy an...,Babinski sign present bilaterally,Chvostek sign,Deviation of the tongue to the left side,A drooping left shoulder,Hyporeflexia,0,0,0,0,0,B,Chvostek sign,Text,STEP 3,0.22,57.02,0.540666,63.288338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,612,A 45-year-old man is brought to the clinic by ...,Atrophy,Decreased visual acuity,Loss of proprioception,Presence of palmomental reflex,Ptosis,0,0,0,0,0,A,Atrophy,Text,STEP 1,0.22,95.17,0.469390,87.029457
663,315,A 64-year-old man with non-Hodgkin lymphoma co...,Bleomycin,Cyclophosphamide,Cytarabine,Doxorubicin,Fluorouracil,Methotrexate,Vincristine,0,0,0,G,Vincristine,Text,STEP 1,0.14,42.72,0.372097,58.109558
664,509,A 9-month-old boy is brought to the office by ...,Abdominal ultrasonography,Antiâ€“Saccharomyces cerevisiae antibody testing,Nitroblue tetrazolium testing,Rectal swab for group B streptococcus,Skeletal survey,Stool culture for Salmonella enteritidis,0,0,0,0,E,Skeletal survey,Text,STEP 2,0.94,122.73,0.742279,54.500820
665,550,A 32-year-old man comes to the office because ...,Collagen,Double-stranded DNA,Nucleolar protein,Phospholipid,Proteins in neutrophil cytoplasm,0,0,0,0,0,A,Collagen,Text,STEP 1,0.30,86.13,0.368165,75.518097


In [52]:
df.to_csv("Predicted_Difficulty_Response_time_LSTM.csv")