<a href="https://colab.research.google.com/github/revatishelat/DST_A2/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
# Load the data from CSV
train_data =pd.read_csv("https://raw.githubusercontent.com/sebischair/Medical-Abstracts-TC-Corpus/main/medical_tc_train.csv")
test_data =pd.read_csv("https://raw.githubusercontent.com/sebischair/Medical-Abstracts-TC-Corpus/main/medical_tc_test.csv")
labels = pd.read_csv("https://raw.githubusercontent.com/sebischair/Medical-Abstracts-TC-Corpus/main/medical_tc_labels.csv")


In [3]:
train_data.head()

Unnamed: 0,condition_label,medical_abstract
0,5,Tissue changes around loose prostheses. A cani...
1,1,Neuropeptide Y and neuron-specific enolase lev...
2,2,"Sexually transmitted diseases of the colon, re..."
3,1,Lipolytic factors associated with murine and h...
4,3,Does carotid restenosis predict an increased r...


In [4]:
test_data.head(10)

Unnamed: 0,condition_label,medical_abstract
0,3,Obstructive sleep apnea following topical orop...
1,5,Neutrophil function and pyogenic infections in...
2,5,A phase II study of combined methotrexate and ...
3,1,Flow cytometric DNA analysis of parathyroid tu...
4,4,Paraneoplastic vasculitic neuropathy: a treata...
5,1,Treatment of childhood angiomatous diseases wi...
6,1,Expression of major histocompatibility complex...
7,1,Questionable role of CNS radioprophylaxis in t...
8,5,Reversibility of hepatic fibrosis in experimen...
9,2,Current status of duplex Doppler ultrasound in...


In [5]:
# Split the data into training and testing sets
train_texts, train_labels = train_data['medical_abstract'], train_data['condition_label']
test_texts, test_labels = test_data['medical_abstract'], test_data['condition_label']



In [6]:
# Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)



In [7]:
# Convert text data to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)



In [8]:
train_sequences

[[185,
  167,
  2027,
  7737,
  2879,
  7,
  2322,
  377,
  8,
  1320,
  2,
  129,
  3,
  22,
  6844,
  938,
  2,
  1,
  1,
  1991,
  1088,
  7,
  1025,
  12,
  7477,
  2,
  5,
  761,
  4,
  5,
  565,
  175,
  3,
  2,
  61,
  63,
  6,
  2,
  4857,
  1028,
  5,
  249,
  667,
  2,
  61,
  11,
  570,
  4,
  1226,
  5,
  1069,
  12,
  2090,
  1569,
  3,
  110,
  1023,
  15,
  125,
  2515,
  175,
  274,
  26,
  165,
  24,
  205,
  24,
  2,
  144,
  3,
  261,
  620,
  2117,
  8,
  2,
  61,
  5,
  761,
  13,
  5,
  565,
  2,
  2515,
  159,
  24,
  459,
  16,
  1490,
  19,
  4,
  1736,
  2553,
  175,
  5787,
  2,
  5317,
  1205,
  3,
  4857,
  4,
  2,
  2449,
  1073,
  4,
  1301,
  610,
  21,
  2,
  110,
  3,
  2214,
  99,
  2,
  451,
  58,
  59,
  5317,
  533,
  4,
  1520,
  4857,
  1871,
  15,
  2,
  2322,
  377,
  14,
  2434,
  12,
  7477,
  2,
  673,
  3,
  2144,
  192,
  7,
  2322,
  377,
  5318,
  2,
  42,
  3,
  380,
  4858,
  707,
  1674,
  6,
  2,
  5537,
  7237,
  3,
  5319,
  13,
  

In [9]:
test_sequences

[[1239,
  776,
  1998,
  243,
  3423,
  6380,
  1216,
  5,
  1,
  4525,
  468,
  126,
  535,
  2,
  225,
  3,
  22,
  554,
  1619,
  2114,
  759,
  15,
  4014,
  8,
  2,
  1336,
  3,
  554,
  1619,
  1271,
  43,
  776,
  38,
  653,
  2,
  1352,
  15,
  6124,
  6,
  23,
  2114,
  759,
  4014,
  8,
  2,
  294,
  3,
  1239,
  776,
  1998,
  248,
  2504,
  832,
  4525,
  249,
  773,
  4,
  44,
  989,
  85,
  818,
  37,
  36,
  1289,
  72,
  2278,
  220,
  6780,
  776,
  126,
  28,
  65,
  5814,
  6322,
  22,
  1,
  2820,
  10,
  358,
  16,
  41,
  42,
  6322,
  1432,
  1626,
  8,
  98,
  273,
  4,
  6380,
  1216,
  1,
  28,
  2,
  1,
  2820,
  3423,
  1216,
  10,
  154,
  127,
  64,
  1994,
  1,
  4,
  18,
  235,
  2782,
  1,
  7,
  1204,
  393,
  10,
  117,
  28,
  2820,
  273,
  47,
  163,
  1,
  205,
  28,
  67,
  42,
  6322,
  72,
  776,
  372,
  10,
  50,
  29,
  337,
  28,
  67,
  42,
  6322,
  4,
  776,
  256,
  803,
  10,
  216,
  28,
  67,
  6322,
  1239,
  6959,
  4,
  1,
  1,
  

In [10]:
# Pad sequences to ensure consistent length
max_length = 100
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')



In [11]:
# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_length))
model.add(LSTM(128))
model.add(Dense(5, activation='softmax'))



In [12]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In order to correct the out of bounds error, we substract 1 from each label. So, now we have the corresponding classes:
0 : Neoplasms\
1 : Digestive system diseases\
2 : Nervous system diseases\
3 : Cardiovascular diseases\
4 : General pathological conditions

In [18]:

train_labels -= 1
test_labels -= 1

In [16]:
#

4

In [19]:
# Train the model
model.fit(train_padded, train_labels, epochs=5)

# model.compile(loss = CategoricalCrossentropy(), optimizer = Adam(), metrics=['accuracy'])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7e2fb03b56f0>

In [20]:
# Evaluate the model on test data
predictions = model.predict(test_padded)





In [21]:
# Convert predictions to labels
predicted_labels = predictions.argmax(axis=1)



In [22]:
# Compute confusion matrix
conf_matrix = confusion_matrix(test_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)



Confusion Matrix:
[[456  30   2   6 139]
 [ 40  50   2   7 200]
 [ 30  40  14  27 274]
 [ 10   7   1 321 271]
 [142 103  20 164 532]]


In [23]:
# classification report
class_report = classification_report(test_labels, predicted_labels)
print("\nClassification Report:")
print(class_report)



Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.72      0.70       633
           1       0.22      0.17      0.19       299
           2       0.36      0.04      0.07       385
           3       0.61      0.53      0.57       610
           4       0.38      0.55      0.45       961

    accuracy                           0.48      2888
   macro avg       0.45      0.40      0.39      2888
weighted avg       0.47      0.48      0.45      2888



https://coderzcolumn.com/tutorials/artificial-intelligence/pytorch-rnn-for-text-classification-tasks