In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import sentence_bleu
from jiwer import wer
from collections import Counter

# Load the textual data from a file
# The encoding is pinned so the corpus decodes identically on every platform
# instead of following the locale default.
# NOTE(review): assumes the file is UTF-8 encoded; adjust if it was exported
# with a different encoding.
with open('snomed-ct-terminologies.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# Minimal whitespace tokenizer (NumPy-backed padding)
class SimpleTokenizer:
    """Whitespace tokenizer that maps words to 1-based integer ids.

    Id 0 is never assigned to a word, so it is free to act as the padding
    value written by :meth:`pad_sequences`.
    """

    def __init__(self):
        self.word_index = {}
        self.index_word = {}

    def fit_on_texts(self, texts):
        """Build the vocabulary from an iterable of strings.

        Words are split on whitespace and numbered from 1 in order of first
        appearance. Any previously fitted vocabulary is replaced.
        """
        # dict.fromkeys keeps first-appearance order and drops duplicates.
        vocabulary = dict.fromkeys(" ".join(texts).split())
        self.word_index = {word: idx + 1 for idx, word in enumerate(vocabulary)}
        self.index_word = {idx: word for word, idx in self.word_index.items()}

    def texts_to_sequences(self, texts):
        """Convert each text to a list of word ids.

        Words that are not in the fitted vocabulary are silently skipped.
        """
        return [
            [self.word_index[word] for word in text.split() if word in self.word_index]
            for text in texts
        ]

    def pad_sequences(self, sequences, maxlen, padding='pre'):
        """Return a ``(len(sequences), maxlen)`` integer array padded with 0.

        With ``padding='pre'`` the zeros go in front and over-long sequences
        keep their last ``maxlen`` items; with any other value the zeros go
        at the end and over-long sequences keep their first ``maxlen`` items.
        Empty sequences produce an all-zero row.
        """
        padded = np.zeros((len(sequences), maxlen), dtype=int)
        for i, seq in enumerate(sequences):
            kept = min(len(seq), maxlen)
            if kept == 0:
                # Nothing to copy. Slicing with ``-0:`` would select the whole
                # row and make the assignment fail, so skip explicitly.
                continue
            if padding == 'pre':
                padded[i, maxlen - kept:] = seq[len(seq) - kept:]
            else:
                padded[i, :kept] = seq[:kept]
        return padded

# Initialize and fit the tokenizer
# The whole corpus is passed as one text, so the vocabulary spans every line.
tokenizer = SimpleTokenizer()
tokenizer.fit_on_texts([text_data])

# Word ids start at 1; the extra slot accounts for id 0, which pad_sequences
# writes as the padding value.
total_words = len(tokenizer.word_index) + 1

# Dataset class
class TextDataset(Dataset):
    """Next-word prediction samples built from the prefixes of each text line.

    Every line contributes one sample per prefix of two or more tokens; the
    last token of the padded prefix is the target and the rest is the input.
    """

    def __init__(self, text, tokenizer, max_sequence_len):
        self.tokenizer = tokenizer
        self.max_sequence_len = max_sequence_len
        self.input_sequences = self.create_sequences(text)

    def create_sequences(self, text):
        """Return all token-id prefixes of length >= 2, line by line."""
        prefixes = []
        for line in text.split('\n'):
            ids = self.tokenizer.texts_to_sequences([line])[0]
            prefixes.extend(ids[:end] for end in range(2, len(ids) + 1))
        return prefixes

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_id)`` as ``torch.long`` tensors."""
        padded = self.tokenizer.pad_sequences(
            [self.input_sequences[idx]], self.max_sequence_len
        )[0]
        # Everything but the final position is context; the final id is the label.
        context, target = padded[:-1], padded[-1]
        return (
            torch.tensor(context, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

# Create dataset and dataloader
# The padded length has to cover the longest line of the whole corpus.
# Measuring only the first line would cut the left context off every longer
# line when its n-grams are padded.
max_sequence_len = max(
    len(seq) for seq in tokenizer.texts_to_sequences(text_data.split('\n'))
)
dataset = TextDataset(text_data, tokenizer, max_sequence_len)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the LSTM model with regularization to prevent overfitting
class LSTMModel(nn.Module):
    """Embedding -> 2-layer LSTM (dropout 0.2 between layers) -> linear head.

    ``forward`` takes a batch-first tensor of token ids and returns one row of
    ``total_words`` scores per batch item, computed from the last time step.
    """

    def __init__(self, total_words, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(total_words, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=2,
            dropout=0.2,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, total_words)

    def forward(self, x):
        embedded = self.embedding(x)
        hidden_states, _state = self.lstm(embedded)
        # Only the output at the final time step feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

model = LSTMModel(total_words, embedding_dim=100, hidden_dim=150)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    # Accumulate a sample-weighted loss so the logged value describes the whole
    # epoch rather than whichever mini-batch happened to come last.
    epoch_loss = 0.0
    samples_seen = 0
    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        batch_size = y_batch.size(0)
        epoch_loss += loss.item() * batch_size
        samples_seen += batch_size
    # samples_seen is 0 when the dataloader yields nothing; skip the report
    # instead of referencing an undefined loss.
    if (epoch + 1) % 10 == 0 and samples_seen:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / samples_seen:.4f}')


In [None]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    Uses the module-level ``model`` and ``tokenizer``. On every step the
    current text is tokenized, left-padded to the context length the model
    was trained on, and the highest-scoring word id is appended.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append.
        max_sequence_len: Padded n-gram length used when building the
            training samples (context plus target).

    Returns:
        The seed text followed by the generated words, space-separated.
    """
    # Training samples are padded n-grams whose final token is split off as
    # the target, so the model's context is max_sequence_len - 1 tokens long.
    # Padding straight to that length keeps the newest seed word in the
    # context; padding to max_sequence_len and slicing [:-1] would drop it.
    context_len = max(max_sequence_len - 1, 1)
    model.eval()
    with torch.no_grad():
        for _ in range(next_words):
            token_ids = tokenizer.texts_to_sequences([seed_text])[0]
            context = tokenizer.pad_sequences([token_ids], context_len)[0]
            batch = torch.tensor(context, dtype=torch.long).unsqueeze(0)
            scores = model(batch)
            predicted_word_index = torch.argmax(scores, dim=-1).item()
            output_word = tokenizer.index_word.get(predicted_word_index, "")
            if not output_word:
                # The padding id (no word) was predicted. The input would not
                # change on the next step, so further steps would only repeat.
                break
            seed_text += " " + output_word
    return seed_text

# Generate new text data samples
# `ref` is the target phrase and `seed_text` is its leading part, used as the
# prompt for generation.
ref = "Labelled drug-genetic interaction medication error"
seed_text = "Labelled drug-genetic"
reference_texts = ref.split()#text_data.split('\n')[0]
# NOTE(review): this requests len(ref.split()) new words on top of the seed, so
# the output can be longer than the reference — confirm that is intended.
length = len(reference_texts)
generated_text = generate_text(seed_text, length, max_sequence_len)
print("Generated Text: ", generated_text)

# Bias detection and correction
def detect_bias(text):
    """Count the words 'he' and 'she' in ``text``, ignoring case.

    Words are whitespace-delimited and must match exactly after lowercasing.
    Returns a dict with the keys ``'he'`` and ``'she'``.
    """
    tally = {'he': 0, 'she': 0}
    for token in map(str.lower, text.split()):
        if token in tally:
            tally[token] += 1
    return tally

def correct_bias(text):
    """Balance 'he' and 'she' by swapping whole words of the commoner pronoun.

    Counts come from ``detect_bias``, so a pronoun is a whitespace-delimited
    word equal to 'he' or 'she' ignoring case. The earliest occurrences of
    the over-represented pronoun are replaced until the counts differ by at
    most one. Capitalisation of the replaced word is carried over and all
    other text, including whitespace, is left untouched.

    Returns:
        The rebalanced text (the input itself when nothing needs swapping).
    """
    import re  # local import so this function does not rely on file-level imports

    gender_words = detect_bias(text)
    surplus = gender_words['he'] - gender_words['she']
    # Each swap lowers one count and raises the other, closing the gap by two.
    # Swapping the full difference would merely invert the imbalance.
    swaps_left = abs(surplus) // 2
    if swaps_left == 0:
        return text
    source, target = ('he', 'she') if surplus > 0 else ('she', 'he')

    def swap(match):
        nonlocal swaps_left
        token = match.group(0)
        if swaps_left == 0 or token.lower() != source:
            return token
        swaps_left -= 1
        if token.isupper():
            return target.upper()
        if token[0].isupper():
            return target.capitalize()
        return target

    # Matching whole whitespace-delimited tokens keeps words such as "the" or
    # "her" intact, which a plain substring replace would corrupt.
    return re.sub(r'\S+', swap, text)
def remove_repeated_words(text):
    """Collapse each run of identical consecutive words into a single word.

    Words are whitespace-delimited and compared exactly; the result is joined
    with single spaces.
    """
    kept = []
    for token in text.split():
        # The last kept word always equals the previous input word, so this
        # comparison is the same as comparing with the predecessor.
        if not kept or kept[-1] != token:
            kept.append(token)
    return ' '.join(kept)



# Evaluate the synthesized data using Word Error Rate (WER) and BLEU metrics
generated_text = remove_repeated_words(generated_text)
# jiwer compares sentence with sentence: a list would be read as several
# reference sentences, so the reference words are joined back into one string.
wer_score = wer(" ".join(reference_texts), generated_text)
# sentence_bleu takes a list of references, each a list of tokens. Passing the
# flat word list would treat every word as a character-tokenized reference.
bleu_score = sentence_bleu([reference_texts], generated_text.split())

print(f"Word Error Rate (WER): {wer_score}")
print(f"BLEU Score: {bleu_score}")