In [35]:
# Complete Fixed Implementation - No Masking Required

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed, Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import pickle

# Load the CSV dataset
df = pd.read_csv('pharmacy_abbreviations.csv')
print(f"Total abbreviations: {len(df)}")

# Create input and output pairs
abbreviations = df['Abbreviation'].str.lower().values
full_forms = df['Full Form'].values

# Create character-level mappings with special tokens
all_text = ''.join(abbreviations) + ''.join(full_forms)
chars = sorted(list(set(all_text)))

# Add special tokens
char_to_idx = {c: i+3 for i, c in enumerate(chars)}
char_to_idx['<PAD>'] = 0  # Padding token
char_to_idx['<START>'] = 1  # Start token
char_to_idx['<END>'] = 2  # End token
idx_to_char = {i: c for c, i in char_to_idx.items()}

vocab_size = len(char_to_idx)
print(f"Vocabulary size: {vocab_size}")

# Prepare input sequences (abbreviations)
X = [[char_to_idx.get(char.lower(), 0) for char in abbr] for abbr in abbreviations]

# Prepare output sequences (full forms) with start and end tokens
y_input = [[char_to_idx['<START>']] + [char_to_idx.get(char.lower(), 0) for char in full] for full in full_forms]
y_output = [[char_to_idx.get(char.lower(), 0) for char in full] + [char_to_idx['<END>']] for full in full_forms]

# Find maximum lengths
max_abbr_len = max(len(seq) for seq in X)
max_full_len = max(max(len(seq) for seq in y_input), max(len(seq) for seq in y_output))

print(f"Maximum abbreviation length: {max_abbr_len}")
print(f"Maximum full form length: {max_full_len + 1}")  # +1 for END token

# Pad sequences
X_padded = pad_sequences(X, maxlen=max_abbr_len, padding='post')
y_input_padded = pad_sequences(y_input, maxlen=max_full_len, padding='post')
y_output_padded = pad_sequences(y_output, maxlen=max_full_len, padding='post')

# Split into training and validation sets
X_train, X_val, y_input_train, y_input_val, y_output_train, y_output_val = train_test_split(
    X_padded, y_input_padded, y_output_padded, test_size=0.2, random_state=42
)

# Reshape y_output_train and y_output_val for sparse_categorical_crossentropy
y_output_train = y_output_train.reshape(y_output_train.shape[0], y_output_train.shape[1], 1)
y_output_val = y_output_val.reshape(y_output_val.shape[0], y_output_val.shape[1], 1)

# Define the model - encoder-decoder model
# Encoder
encoder_inputs = Input(shape=(max_abbr_len,))
encoder_embedding = Embedding(vocab_size, 128, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True, dropout=0.2)(encoder_embedding)
encoder_outputs, state_h, state_c = encoder_lstm
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_full_len,))
decoder_embedding = Embedding(vocab_size, 128, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, dropout=0.2)(decoder_embedding, initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder_lstm)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_dense)

# Compile the model - without any sample_weight_mode
model.compile(
    optimizer='adam', 
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Print model summary
model.summary()

# Define callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.0001)
]

# Train the model - without sample_weight
history = model.fit(
    [X_train, y_input_train], y_output_train,
    validation_data=([X_val, y_input_val], y_output_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks
)

# Save the model
model.save('pharmacy_abbreviation_model_fixed.h5')

# Save the character mappings
with open('char_mappings_fixed.pkl', 'wb') as f:
    pickle.dump({'char_to_idx': char_to_idx, 'idx_to_char': idx_to_char}, f)

# Save the parameters
with open('processed_data_fixed.pkl', 'wb') as f:
    pickle.dump({
        'max_abbr_len': max_abbr_len,
        'max_full_len': max_full_len
    }, f)

# Plot training history
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.tight_layout()
plt.savefig('training_history_fixed.png')
plt.close()

# Function to predict full form
def predict_full_form(abbreviation, model, char_to_idx, idx_to_char, max_abbr_len, max_full_len):
    # Encode and pad the input abbreviation
    encoded_abbr = [char_to_idx.get(char.lower(), 0) for char in abbreviation]
    padded_abbr = pad_sequences([encoded_abbr], maxlen=max_abbr_len, padding='post')
    
    # Initialize decoder input with START token
    decoder_input = np.zeros((1, max_full_len))
    decoder_input[0, 0] = char_to_idx['<START>']
    
    # Generate output sequence
    output_text = []
    
    for i in range(1, max_full_len):
        # Get model prediction
        predictions = model.predict([padded_abbr, decoder_input], verbose=0)[0]
        sampled_token_index = np.argmax(predictions[i-1])
        
        # Stop if END token is predicted or we reach max length
        if sampled_token_index == char_to_idx['<END>']:
            break
        
        # Add predicted character to output (skip padding)
        if sampled_token_index > 0 and sampled_token_index != char_to_idx['<PAD>']:
            output_text.append(idx_to_char[sampled_token_index])
        
        # Update decoder input for next prediction
        decoder_input[0, i] = sampled_token_index
    
    return ''.join(output_text)

# Test with some examples
print("\n----- Testing Predictions -----")
test_abbreviations = ['RTF', 'RTPB', 'DAW', 'NP', 'RTBF', 'MD', 'DO']
for abbr in test_abbreviations:
    try:
        actual = df[df['Abbreviation'].str.lower() == abbr.lower()]['Full Form'].values[0]
        predicted = predict_full_form(abbr, model, char_to_idx, idx_to_char, max_abbr_len, max_full_len)
        match = "✓" if predicted.lower() == actual.lower() else "✗"
        
        print(f"{abbr:5} → Predicted: {predicted}")
        print(f"{' ':5}   Actual:    {actual}")
        print(f"{' ':5}   Match:     {match}")
        print()
    except:
        print(f"Error predicting {abbr}")

Total abbreviations: 256
Vocabulary size: 63
Maximum abbreviation length: 7
Maximum full form length: 56


Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 115ms/step - accuracy: 0.0309 - loss: 4.1067 - val_accuracy: 0.0399 - val_loss: 3.6487 - learning_rate: 0.0010
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.0399 - loss: 3.4577 - val_accuracy: 0.0444 - val_loss: 3.2002 - learning_rate: 0.0010
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - accuracy: 0.0405 - loss: 3.0868 - val_accuracy: 0.0406 - val_loss: 3.0939 - learning_rate: 0.0010
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 75ms/step - accuracy: 0.0412 - loss: 3.0259 - val_accuracy: 0.0406 - val_loss: 3.0592 - learning_rate: 0.0010
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/step - accuracy: 0.0408 - loss: 2.9941 - val_accuracy: 0.0406 - val_loss: 3.0476 - learning_rate: 0.0010
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/




----- Testing Predictions -----
RTF   → Predicted: refill too dor
        Actual:    Ready To Fill
        Match:     ✗

RTPB  → Predicted: real-tion rentrint restrint
        Actual:    Real-Time Prescription Benefit
        Match:     ✗

DAW   → Predicted: diseate dister ant
        Actual:    Dispense As Written
        Match:     ✗

NP    → Predicted: ne troc act
        Actual:    New Prescription
        Match:     ✗

RTBF  → Predicted: reall-tion restrint
        Actual:    Real-Time Benefit Format
        Match:     ✗

MD    → Predicted: medical disease
        Actual:    Medical Doctor
        Match:     ✗

DO    → Predicted: dreat intert
        Actual:    Doctor of Osteopathy
        Match:     ✗



In [None]:
#version 2
# Enhanced Pharmacy Abbreviation Predictor - Complete Implementation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Input, Dropout, concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import pickle
import os
import random
import string

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

# Load the CSV dataset
df = pd.read_csv('pharmacy_abbreviations.csv')
print(f"Total abbreviations: {len(df)}")
print(df.head())

# Create input and output pairs
abbreviations = df['Abbreviation'].str.lower().values
full_forms = df['Full Form'].values

# Data augmentation function
def augment_data(abbreviations, full_forms, augment_factor=2):
    """Create augmented versions of the data by:
    1. Changing case (uppercase/lowercase/title case)
    2. Adding slight misspellings
    3. Adding small variations
    """
    aug_abbreviations = list(abbreviations)
    aug_full_forms = list(full_forms)
    
    for i in range(len(abbreviations)):
        abbr = abbreviations[i]
        full = full_forms[i]
        
        for _ in range(augment_factor - 1):  # -1 because we already have the original
            # Decide on augmentation type
            aug_type = random.choice(['case', 'misspell', 'variation'])
            
            if aug_type == 'case':
                # Change case of full form
                case_type = random.choice(['lower', 'upper', 'title'])
                if case_type == 'lower':
                    aug_full = full.lower()
                elif case_type == 'upper':
                    aug_full = full.upper()
                else:
                    aug_full = full.title()
                aug_abbr = abbr
                
            elif aug_type == 'misspell':
                # Add a minor misspelling to full form (1 character change)
                if len(full) > 3:  # Only if long enough
                    pos = random.randint(0, len(full)-1)
                    chars = list(full)
                    
                    # Decide what type of misspelling
                    misspell_type = random.choice(['swap', 'substitute', 'insert', 'delete'])
                    
                    if misspell_type == 'swap' and pos < len(full) - 1:
                        # Swap two adjacent characters
                        chars[pos], chars[pos+1] = chars[pos+1], chars[pos]
                    elif misspell_type == 'substitute':
                        # Substitute a character
                        chars[pos] = random.choice(string.ascii_letters + ' -')
                    elif misspell_type == 'insert' and len(full) < 50:
                        # Insert a character
                        chars.insert(pos, random.choice(string.ascii_letters + ' -'))
                    elif misspell_type == 'delete' and len(full) > 2:
                        # Delete a character
                        chars.pop(pos)
                    
                    aug_full = ''.join(chars)
                else:
                    aug_full = full
                    
                aug_abbr = abbr
                
            else:  # variation
                # Create a variation by adding/removing spaces or changing separators
                if ' ' in full:
                    if random.random() < 0.5:
                        # Remove a space
                        parts = full.split(' ')
                        if len(parts) > 1:
                            join_idx = random.randint(0, len(parts)-2)
                            parts[join_idx] = parts[join_idx] + parts[join_idx+1]
                            parts.pop(join_idx+1)
                        aug_full = ' '.join(parts)
                    else:
                        # Change a separator
                        sep = random.choice(['-', '/', '&'])
                        aug_full = full.replace(' ', sep, 1)
                elif '-' in full:
                    # Change hyphen to space or another separator
                    sep = random.choice([' ', '/', '&'])
                    aug_full = full.replace('-', sep, 1)
                else:
                    # No easy variation, just use original
                    aug_full = full
                
                aug_abbr = abbr
            
            # Add augmented pair
            aug_abbreviations.append(aug_abbr)
            aug_full_forms.append(aug_full)
    
    return np.array(aug_abbreviations), np.array(aug_full_forms)

# Apply data augmentation
print("Applying data augmentation...")
aug_abbreviations, aug_full_forms = augment_data(abbreviations, full_forms, augment_factor=3)
print(f"Data size after augmentation: {len(aug_abbreviations)} (was {len(abbreviations)})")

# Display some augmented examples
for i in range(5):
    orig_idx = i
    aug_idx = len(abbreviations) + i
    print(f"Original: {abbreviations[orig_idx]} -> {full_forms[orig_idx]}")
    print(f"Augmented: {aug_abbreviations[aug_idx]} -> {aug_full_forms[aug_idx]}")
    print()

# Create character-level mappings with special tokens
all_text = ''.join(aug_abbreviations) + ''.join(aug_full_forms)
chars = sorted(list(set(all_text)))

# Add special tokens
char_to_idx = {c: i+3 for i, c in enumerate(chars)}
char_to_idx['<PAD>'] = 0  # Padding token
char_to_idx['<START>'] = 1  # Start token
char_to_idx['<END>'] = 2  # End token
idx_to_char = {i: c for c, i in char_to_idx.items()}

vocab_size = len(char_to_idx)
print(f"Vocabulary size: {vocab_size}")

# Prepare input sequences (abbreviations)
X = [[char_to_idx.get(char.lower(), 0) for char in abbr] for abbr in aug_abbreviations]

# Prepare output sequences (full forms) with start and end tokens
y_input = [[char_to_idx['<START>']] + [char_to_idx.get(char.lower(), 0) for char in full] for full in aug_full_forms]
y_output = [[char_to_idx.get(char.lower(), 0) for char in full] + [char_to_idx['<END>']] for full in aug_full_forms]

# Find maximum lengths
max_abbr_len = max(len(seq) for seq in X)
max_full_len = max(max(len(seq) for seq in y_input), max(len(seq) for seq in y_output))

print(f"Maximum abbreviation length: {max_abbr_len}")
print(f"Maximum full form length: {max_full_len}")

# Pad sequences
X_padded = pad_sequences(X, maxlen=max_abbr_len, padding='post')
y_input_padded = pad_sequences(y_input, maxlen=max_full_len, padding='post')
y_output_padded = pad_sequences(y_output, maxlen=max_full_len, padding='post')

# Split into training and validation sets
X_train, X_val, y_input_train, y_input_val, y_output_train, y_output_val = train_test_split(
    X_padded, y_input_padded, y_output_padded, test_size=0.2, random_state=42
)

# Reshape y_output for sparse_categorical_crossentropy
y_output_train = y_output_train.reshape(y_output_train.shape[0], y_output_train.shape[1], 1)
y_output_val = y_output_val.reshape(y_output_val.shape[0], y_output_val.shape[1], 1)

# Enhanced model with bidirectional encoder and increased capacity
def create_enhanced_model(vocab_size, max_abbr_len, max_full_len):
    # Encoder
    encoder_inputs = Input(shape=(max_abbr_len,))
    encoder_embedding = Embedding(vocab_size, 256, mask_zero=True)(encoder_inputs)
    encoder_dropout1 = Dropout(0.2)(encoder_embedding)
    
    # Bidirectional LSTM for the encoder
    encoder_bilstm = Bidirectional(LSTM(512, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2))(encoder_dropout1)
    
    # Bidirectional LSTM returns sequences and states: [output_sequences, forward_h, forward_c, backward_h, backward_c]
    encoder_outputs = encoder_bilstm[0]
    
    # Combine forward and backward states
    state_h = concatenate([encoder_bilstm[1], encoder_bilstm[3]])
    state_c = concatenate([encoder_bilstm[2], encoder_bilstm[4]])
    encoder_states = [state_h, state_c]
    
    # Decoder
    decoder_inputs = Input(shape=(max_full_len,))
    decoder_embedding = Embedding(vocab_size, 256, mask_zero=True)(decoder_inputs)
    decoder_dropout1 = Dropout(0.2)(decoder_embedding)
    
    # Decoder LSTM - with doubled units to match concatenated bidirectional encoder states
    decoder_lstm = LSTM(1024, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(decoder_dropout1, initial_state=encoder_states)
    decoder_dropout2 = Dropout(0.2)(decoder_lstm)
    
    # Output layer
    decoder_dense = Dense(vocab_size, activation='softmax')(decoder_dropout2)
    
    # Create model
    model = Model([encoder_inputs, decoder_inputs], decoder_dense)
    
    # Compile with a slightly lower learning rate for stability
    optimizer = Adam(learning_rate=0.0005)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Create the enhanced model
model = create_enhanced_model(vocab_size, max_abbr_len, max_full_len)

# Print model summary
model.summary()

# Define enhanced callbacks
model_path = 'best_pharmacy_model_enhanced.h5'
callbacks = [
    ModelCheckpoint(model_path, save_best_only=True, monitor='val_loss'),
    EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001, verbose=1)
]

# Train the model with more epochs
history = model.fit(
    [X_train, y_input_train], y_output_train,
    validation_data=([X_val, y_input_val], y_output_val),
    epochs=200,  # Increased epochs
    batch_size=16,  # Smaller batch size
    callbacks=callbacks,
    verbose=1
)

# Save the final model if not already saved by callbacks
if not os.path.exists(model_path):
    model.save(model_path)

# Save the character mappings and parameters
with open('char_mappings_enhanced.pkl', 'wb') as f:
    pickle.dump({'char_to_idx': char_to_idx, 'idx_to_char': idx_to_char}, f)

with open('processed_data_enhanced.pkl', 'wb') as f:
    pickle.dump({
        'max_abbr_len': max_abbr_len,
        'max_full_len': max_full_len
    }, f)

# Plot training history
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.tight_layout()
plt.savefig('training_history_enhanced.png')
plt.close()

# Beam search implementation for improved prediction
def predict_with_beam_search(model, abbreviation, char_to_idx, idx_to_char, max_abbr_len, max_full_len, beam_width=3):
    """Predict using beam search to consider multiple character possibilities."""
    # Encode and pad the input abbreviation
    encoded_abbr = [char_to_idx.get(char.lower(), 0) for char in abbreviation]
    padded_abbr = pad_sequences([encoded_abbr], maxlen=max_abbr_len, padding='post')
    
    # Initialize with START token
    initial_state = np.zeros((1, max_full_len))
    initial_state[0, 0] = char_to_idx['<START>']
    
    # Initialize beam with (sequence, score, state)
    # sequence is a list of character indices
    # score is the log probability of the sequence
    # state is the decoder input state
    beams = [([], 0.0, initial_state)]
    
    # Generate sequence
    for i in range(1, max_full_len):
        all_candidates = []
        
        # Expand each beam
        for seq, score, decoder_input in beams:
            # Skip if sequence already ended
            if seq and seq[-1] == char_to_idx['<END>']:
                all_candidates.append((seq, score, decoder_input))
                continue
                
            # Get predictions
            predictions = model.predict([padded_abbr, decoder_input], verbose=0)[0]
            next_char_probs = predictions[i-1]
            
            # Get top beam_width probabilities and indices
            top_indices = np.argsort(next_char_probs)[-beam_width:]
            
            # Create new candidates
            for idx in top_indices:
                # Skip PAD token
                if idx == char_to_idx['<PAD>']:
                    continue
                    
                # Create new sequence, score, and state
                new_seq = seq + [idx]
                # Use log probabilities to prevent underflow
                new_score = score + np.log(next_char_probs[idx] + 1e-10)  # Add small epsilon to prevent log(0)
                
                # Create new decoder input
                new_decoder_input = decoder_input.copy()
                new_decoder_input[0, i] = idx
                
                all_candidates.append((new_seq, new_score, new_decoder_input))
        
        # Keep only the top beam_width candidates
        # Sort by score (higher is better)
        all_candidates.sort(key=lambda x: x[1], reverse=True)
        beams = all_candidates[:beam_width]
        
        # Check if all beams have ended
        if all(beam[0] and beam[0][-1] == char_to_idx['<END>'] for beam in beams):
            break
    
    # Get best sequence
    best_seq, best_score, _ = beams[0]
    
    # Convert to string, removing END token if present
    if best_seq and best_seq[-1] == char_to_idx['<END>']:
        best_seq = best_seq[:-1]
        
    result = ''.join(idx_to_char[idx] for idx in best_seq if idx > 0 and idx != char_to_idx['<PAD>'])
    return result

# Load the best model (which might have been saved by callbacks)
best_model = load_model(model_path)

# Test with different prediction methods
print("\n----- Testing Predictions -----")
test_abbreviations = ['RTF', 'RTPB', 'DAW', 'NP', 'RTBF', 'MD', 'DO']

# Function for regular prediction (greedy search)
def predict_greedy(abbreviation, model, char_to_idx, idx_to_char, max_abbr_len, max_full_len):
    # Encode and pad the input abbreviation
    encoded_abbr = [char_to_idx.get(char.lower(), 0) for char in abbreviation]
    padded_abbr = pad_sequences([encoded_abbr], maxlen=max_abbr_len, padding='post')
    
    # Initialize decoder input with START token
    decoder_input = np.zeros((1, max_full_len))
    decoder_input[0, 0] = char_to_idx['<START>']
    
    # Generate output sequence
    output_text = []
    
    for i in range(1, max_full_len):
        # Get model prediction
        predictions = model.predict([padded_abbr, decoder_input], verbose=0)[0]
        sampled_token_index = np.argmax(predictions[i-1])
        
        # Stop if END token is predicted
        if sampled_token_index == char_to_idx['<END>']:
            break
        
        # Add predicted character to output (skip padding)
        if sampled_token_index > 0 and sampled_token_index != char_to_idx['<PAD>']:
            output_text.append(idx_to_char[sampled_token_index])
        
        # Update decoder input for next prediction
        decoder_input[0, i] = sampled_token_index
    
    return ''.join(output_text)

# Compare prediction methods
for abbr in test_abbreviations:
    try:
        actual = df[df['Abbreviation'].str.lower() == abbr.lower()]['Full Form'].values[0]
        
        # Get predictions
        greedy_result = predict_greedy(abbr, best_model, char_to_idx, idx_to_char, max_abbr_len, max_full_len)
        beam_result = predict_with_beam_search(best_model, abbr, char_to_idx, idx_to_char, max_abbr_len, max_full_len, beam_width=3)
        
        # Calculate match percentages
        def calc_match_percent(pred, actual):
            min_len = min(len(pred), len(actual))
            if min_len == 0:
                return 0
            matches = sum(p.lower() == a.lower() for p, a in zip(pred[:min_len], actual[:min_len]))
            return (matches / max(len(pred), len(actual))) * 100
            
        greedy_match = "✓" if greedy_result.lower() == actual.lower() else "✗"
        beam_match = "✓" if beam_result.lower() == actual.lower() else "✗"
        greedy_percent = calc_match_percent(greedy_result, actual)
        beam_percent = calc_match_percent(beam_result, actual)
        
        print(f"{abbr:5} → Actual:      {actual}")
        print(f"{' ':5}   Greedy:      {greedy_result}")
        print(f"{' ':5}   Beam Search: {beam_result}")
        print(f"{' ':5}   Greedy Match: {greedy_match} ({greedy_percent:.1f}%)")
        print(f"{' ':5}   Beam Match:   {beam_match} ({beam_percent:.1f}%)")
        print()
    except Exception as e:
        print(f"Error predicting {abbr}: {e}")

# Evaluate on the full dataset
print("\n===== Full Dataset Evaluation =====")

# Test on entire dataset
all_results = []
for i, row in df.iterrows():
    abbr = row['Abbreviation']
    actual = row['Full Form']
    
    try:
        # Get predictions
        greedy_result = predict_greedy(abbr, best_model, char_to_idx, idx_to_char, max_abbr_len, max_full_len)
        beam_result = predict_with_beam_search(best_model, abbr, char_to_idx, idx_to_char, max_abbr_len, max_full_len, beam_width=3)
        
        # Calculate match percentages
        def calc_match_percent(pred, actual):
            min_len = min(len(pred), len(actual))
            if min_len == 0:
                return 0
            matches = sum(p.lower() == a.lower() for p, a in zip(pred[:min_len], actual[:min_len]))
            return (matches / max(len(pred), len(actual))) * 100
            
        exact_greedy_match = greedy_result.lower() == actual.lower()
        exact_beam_match = beam_result.lower() == actual.lower()
        greedy_percent = calc_match_percent(greedy_result, actual)
        beam_percent = calc_match_percent(beam_result, actual)
        
        all_results.append({
            'Abbreviation': abbr,
            'Actual': actual,
            'Greedy Prediction': greedy_result,
            'Beam Prediction': beam_result,
            'Exact Greedy Match': exact_greedy_match,
            'Exact Beam Match': exact_beam_match,
            'Greedy Match Percent': greedy_percent,
            'Beam Match Percent': beam_percent,
            'Category': row['Category'] if 'Category' in row else 'Unknown'
        })
    except Exception as e:
        print(f"Error evaluating {abbr}: {e}")

# Create results dataframe
results_df = pd.DataFrame(all_results)

# Calculate overall metrics
print(f"Total abbreviations evaluated: {len(results_df)}")
print(f"Exact greedy match rate: {results_df['Exact Greedy Match'].mean():.4f}")
print(f"Exact beam match rate: {results_df['Exact Beam Match'].mean():.4f}")
print(f"Average greedy match percent: {results_df['Greedy Match Percent'].mean():.2f}%")
print(f"Average beam match percent: {results_df['Beam Match Percent'].mean():.2f}%")

# Analyze by category
if 'Category' in results_df.columns:
    category_metrics = results_df.groupby('Category').agg({
        'Exact Greedy Match': 'mean',
        'Exact Beam Match': 'mean',
        'Greedy Match Percent': 'mean',
        'Beam Match Percent': 'mean',
        'Abbreviation': 'count'
    }).rename(columns={'Abbreviation': 'Count'})
    
    print("\n----- Performance by Category -----")
    print(category_metrics)
    
    # Save detailed results
    results_df.to_csv('prediction_results_enhanced.csv', index=False)
    
    print("\nDetailed results saved to prediction_results_enhanced.csv")

Total abbreviations: 256
  Abbreviation                       Full Form      Category  \
0          RTF                   Ready To Fill      Workflow   
1         RTPB  Real-Time Prescription Benefit        System   
2         RTBF        Real-Time Benefit Format        System   
3          DAW             Dispense As Written  Prescription   
4           NP                New Prescription      Workflow   

                                             Context  
0  Used to indicate a prescription that has been ...  
1  Electronic system that delivers patient-specif...  
2  Format for delivering patient benefits informa...  
3  Indicates that the exact pharmaceutical produc...  
4  Indicates a new prescription that has been rec...  
Applying data augmentation...
Data size after augmentation: 768 (was 256)
Original: rtf -> Ready To Fill
Augmented: rtf -> Ready ToFill

Original: rtpb -> Real-Time Prescription Benefit
Augmented: rtf -> ready to fill

Original: rtbf -> Real-Time Benefit Forma

Epoch 1/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 401ms/step - accuracy: 0.0393 - loss: 3.6342



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 441ms/step - accuracy: 0.0393 - loss: 3.6261 - val_accuracy: 0.0298 - val_loss: 3.0269 - learning_rate: 5.0000e-04
Epoch 2/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 413ms/step - accuracy: 0.0431 - loss: 3.0083



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 446ms/step - accuracy: 0.0433 - loss: 3.0074 - val_accuracy: 0.0638 - val_loss: 2.8804 - learning_rate: 5.0000e-04
Epoch 3/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 440ms/step - accuracy: 0.0651 - loss: 2.8373



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 478ms/step - accuracy: 0.0652 - loss: 2.8361 - val_accuracy: 0.0795 - val_loss: 2.7022 - learning_rate: 5.0000e-04
Epoch 4/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 495ms/step - accuracy: 0.0803 - loss: 2.6893



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 532ms/step - accuracy: 0.0804 - loss: 2.6885 - val_accuracy: 0.0914 - val_loss: 2.5973 - learning_rate: 5.0000e-04
Epoch 5/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489ms/step - accuracy: 0.0917 - loss: 2.5835



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 525ms/step - accuracy: 0.0918 - loss: 2.5827 - val_accuracy: 0.1018 - val_loss: 2.5180 - learning_rate: 5.0000e-04
Epoch 6/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 493ms/step - accuracy: 0.1000 - loss: 2.5034



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 532ms/step - accuracy: 0.1001 - loss: 2.5026 - val_accuracy: 0.1061 - val_loss: 2.4454 - learning_rate: 5.0000e-04
Epoch 7/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488ms/step - accuracy: 0.1085 - loss: 2.4184



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 524ms/step - accuracy: 0.1086 - loss: 2.4176 - val_accuracy: 0.1149 - val_loss: 2.3748 - learning_rate: 5.0000e-04
Epoch 8/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497ms/step - accuracy: 0.1173 - loss: 2.3349



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 533ms/step - accuracy: 0.1175 - loss: 2.3340 - val_accuracy: 0.1233 - val_loss: 2.2972 - learning_rate: 5.0000e-04
Epoch 9/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step - accuracy: 0.1300 - loss: 2.2379



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 537ms/step - accuracy: 0.1301 - loss: 2.2369 - val_accuracy: 0.1344 - val_loss: 2.2044 - learning_rate: 5.0000e-04
Epoch 10/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 494ms/step - accuracy: 0.1450 - loss: 2.1285



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 531ms/step - accuracy: 0.1452 - loss: 2.1274 - val_accuracy: 0.1458 - val_loss: 2.1090 - learning_rate: 5.0000e-04
Epoch 11/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 501ms/step - accuracy: 0.1575 - loss: 2.0079



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 537ms/step - accuracy: 0.1577 - loss: 2.0067 - val_accuracy: 0.1581 - val_loss: 1.9930 - learning_rate: 5.0000e-04
Epoch 12/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497ms/step - accuracy: 0.1763 - loss: 1.8591



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 534ms/step - accuracy: 0.1765 - loss: 1.8579 - val_accuracy: 0.1751 - val_loss: 1.8728 - learning_rate: 5.0000e-04
Epoch 13/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 500ms/step - accuracy: 0.1969 - loss: 1.7020



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 539ms/step - accuracy: 0.1972 - loss: 1.7006 - val_accuracy: 0.1948 - val_loss: 1.7360 - learning_rate: 5.0000e-04
Epoch 14/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 500ms/step - accuracy: 0.2179 - loss: 1.5219



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 539ms/step - accuracy: 0.2181 - loss: 1.5206 - val_accuracy: 0.2138 - val_loss: 1.5893 - learning_rate: 5.0000e-04
Epoch 15/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497ms/step - accuracy: 0.2392 - loss: 1.3587



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 537ms/step - accuracy: 0.2395 - loss: 1.3573 - val_accuracy: 0.2333 - val_loss: 1.4270 - learning_rate: 5.0000e-04
Epoch 16/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 492ms/step - accuracy: 0.2592 - loss: 1.1662



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 528ms/step - accuracy: 0.2595 - loss: 1.1654 - val_accuracy: 0.2490 - val_loss: 1.2866 - learning_rate: 5.0000e-04
Epoch 17/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 496ms/step - accuracy: 0.2864 - loss: 0.9916



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 537ms/step - accuracy: 0.2867 - loss: 0.9906 - val_accuracy: 0.2660 - val_loss: 1.1602 - learning_rate: 5.0000e-04
Epoch 18/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 500ms/step - accuracy: 0.3071 - loss: 0.8312



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 539ms/step - accuracy: 0.3073 - loss: 0.8306 - val_accuracy: 0.2818 - val_loss: 1.0192 - learning_rate: 5.0000e-04
Epoch 19/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 491ms/step - accuracy: 0.3243 - loss: 0.6998



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 531ms/step - accuracy: 0.3245 - loss: 0.6991 - val_accuracy: 0.2982 - val_loss: 0.9173 - learning_rate: 5.0000e-04
Epoch 20/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497ms/step - accuracy: 0.3392 - loss: 0.5888



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 536ms/step - accuracy: 0.3394 - loss: 0.5883 - val_accuracy: 0.3119 - val_loss: 0.8079 - learning_rate: 5.0000e-04
Epoch 21/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 506ms/step - accuracy: 0.3480 - loss: 0.4915



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 544ms/step - accuracy: 0.3482 - loss: 0.4911 - val_accuracy: 0.3244 - val_loss: 0.7232 - learning_rate: 5.0000e-04
Epoch 22/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 503ms/step - accuracy: 0.3581 - loss: 0.4108



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 542ms/step - accuracy: 0.3584 - loss: 0.4104 - val_accuracy: 0.3306 - val_loss: 0.6508 - learning_rate: 5.0000e-04
Epoch 23/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 500ms/step - accuracy: 0.3653 - loss: 0.3458



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 540ms/step - accuracy: 0.3655 - loss: 0.3456 - val_accuracy: 0.3361 - val_loss: 0.5921 - learning_rate: 5.0000e-04
Epoch 24/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 516ms/step - accuracy: 0.3708 - loss: 0.2958



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 555ms/step - accuracy: 0.3710 - loss: 0.2956 - val_accuracy: 0.3414 - val_loss: 0.5508 - learning_rate: 5.0000e-04
Epoch 25/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509ms/step - accuracy: 0.3759 - loss: 0.2452



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 548ms/step - accuracy: 0.3761 - loss: 0.2452 - val_accuracy: 0.3446 - val_loss: 0.5297 - learning_rate: 5.0000e-04
Epoch 26/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 506ms/step - accuracy: 0.3771 - loss: 0.2254



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 545ms/step - accuracy: 0.3773 - loss: 0.2253 - val_accuracy: 0.3443 - val_loss: 0.5263 - learning_rate: 5.0000e-04
Epoch 27/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 511ms/step - accuracy: 0.3786 - loss: 0.2018



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 551ms/step - accuracy: 0.3788 - loss: 0.2017 - val_accuracy: 0.3470 - val_loss: 0.4880 - learning_rate: 5.0000e-04
Epoch 28/200
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 519ms/step - accuracy: 0.3805 - loss: 0.1810



[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 559ms/step - accuracy: 0.3807 - loss: 0.1809 - val_accuracy: 0.3468 - val_loss: 0.4858 - learning_rate: 5.0000e-04
Epoch 29/200
[1m 3/39[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m18s[0m 506ms/step - accuracy: 0.3681 - loss: 0.1539