In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('St_Paul_hospital_train.csv')  # Adjust path if necessary

In [3]:
data.head()

Unnamed: 0,medical_text,diagnosis
0,Sensory ataxic hemiparesis in thalamic hemorrh...,5
1,An analysis of abnormalities of the retinoblas...,5
2,Enteric neuronal autoantibodies in pseudoobstr...,2
3,Scintigraphic measurement of oropharyngeal tra...,3
4,The tissue origin of low back pain and sciatic...,5


In [4]:
# Partition the data: 70% training, 15% validation, 15% test
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [5]:
# Display the sizes of each partition
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 7074
Validation set size: 1516
Test set size: 1516


In [6]:
#full implementation for training the Char-RNN

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Combine the text data into a single string
text = " ".join(train_data['medical_text']) 

In [8]:
# Step 1: Create character-to-index and index-to-character mappings
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

# Convert the entire text into a sequence of integer indices
input_text = [char_to_idx[char] for char in text]

In [None]:
# Step 2: Prepare sequences and labels for training
seq_length = 100  # Length of each sequence for training
sequences = [input_text[i: i + seq_length + 1] for i in range(len(input_text) - seq_length)]
X = np.array([sequence[:-1] for sequence in sequences])  # Input sequences
y = np.array([sequence[-1] for sequence in sequences])   # Next character as label


In [None]:
# Step 3: Define the Char-RNN Model
model = Sequential([
    Embedding(input_dim=len(chars), output_dim=64, input_length=seq_length),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(len(chars), activation="softmax")
])

In [None]:
# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")


In [None]:
model.summary()

In [None]:
# Step 4: Train the model
history = model.fit(X, y, epochs=10, batch_size=128)


In [None]:
# Step 5: Generate synthetic text
def generate_text(model, start_text, gen_length=200):
    # Convert start_text to character indices
    input_eval = [char_to_idx[char] for char in start_text]
    generated_text = start_text

    for _ in range(gen_length):
        # Pad the input sequence to the required length
        input_eval_padded = pad_sequences([input_eval], maxlen=seq_length, padding="pre")
        
        # Predict the next character index
        predictions = model.predict(input_eval_padded, verbose=0)
        predicted_id = np.argmax(predictions[-1])
        
        # Append the predicted character to generated_text
        generated_text += idx_to_char[predicted_id]
        
        # Update input sequence for the next prediction
        input_eval.append(predicted_id)
    
    return generated_text


In [None]:
# Generate a sample of synthetic medical text
start_text = "Patient presents with "
print(generate_text(model, start_text=start_text, gen_length=200))

In [None]:
#Question 2: Diagnosis Classification with Sequential RNN Models

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import GRU


In [None]:
# Text Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text_column'])
X_train = tokenizer.texts_to_sequences(train_data['text_column'])
X_test = tokenizer.texts_to_sequences(test_data['text_column'])


In [None]:
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train, maxlen=max_length, padding="post")
X_test_padded = pad_sequences(X_test, maxlen=max_length, padding="post")


In [None]:

# Model definition
model_rnn = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length),
    GRU(64),
    Dense(1, activation="sigmoid")
])


In [None]:
model_rnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


In [None]:
# Training
history_rnn = model_rnn.fit(X_train_padded, train_data['label_column'], epochs=5, validation_split=0.2)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_data['text_column']).toarray()
X_test_tfidf = vectorizer.transform(test_data['text_column']).toarray()

# Train a Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, train_data['label_column'])

# Predict on the test set
y_pred = rf_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(test_data['label_column'], y_pred)
print("Bag-of-Words Model Accuracy:", accuracy)


In [None]:
#combine predictions from the RNN model and Bag-of-Words model using a simple ensemble approach.

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Train ensemble model with Logistic Regression and Random Forest
ensemble_clf = VotingClassifier(estimators=[
    ('rnn', LogisticRegression()),  # Use logistic regression for simplicity
    ('rf', rf_classifier)
], voting='soft')

ensemble_clf.fit(X_train_tfidf, train_data['label_column'])

# Evaluate ensemble model
ensemble_pred = ensemble_clf.predict(X_test_tfidf)
ensemble_accuracy = accuracy_score(test_data['label_column'], ensemble_pred)
print("Ensemble Model Accuracy:", ensemble_accuracy)
