In [None]:
# Load utterances, slot annotations and intents from three line-aligned files
import ast

with open('path', 'r') as slots_file:
    # Each line is parsed as a Python literal (presumably a list of slot
    # labels -- TODO confirm against the data files). ast.literal_eval only
    # accepts literals, so a malformed or malicious line raises an error
    # instead of executing arbitrary code the way eval() would.
    slots = [ast.literal_eval(line.strip()) for line in slots_file]

with open('path', 'r') as intents_file:
    intents = [line.strip() for line in intents_file]

with open('path', 'r') as utterance_file:
    utterances = [line.strip() for line in utterance_file]

# Combine into a list of (utterance, slots, intent) tuples, one per line.
# NOTE: zip() stops at the shortest input, so the three files must have the
# same number of lines or trailing entries are silently dropped.
dataset = list(zip(utterances, slots, intents))

In [5]:
# Utterances whose share of 'O' slot labels exceeds this value are skipped
o_ratio_threshold = 0.5

# Step 1: Collect the slot-sequence length of every dataset entry; the
# smallest and largest of these are used to normalize length scores
lengths = [len(record[1]) for record in dataset]
min_length, max_length = min(lengths), max(lengths)

# Step 2: Pre-filter utterances based on O ratio and compute combined score
filtered_utterances = []

# The normalization denominator is identical for every utterance, so compute
# it once. The 1e-9 term keeps it non-zero when all sequences share one length
# (in which case every length score is 0).
length_range = max_length - min_length + 1e-9

for utterance, utterance_slots, utterance_intent in dataset:
    total_tokens = len(utterance_slots)

    # An empty slot sequence has no defined O ratio (it would divide by zero)
    # and carries no slot information, so skip it.
    if total_tokens == 0:
        continue

    o_count = utterance_slots.count('O')
    o_ratio = o_count / total_tokens

    # Filter out utterances with high O ratio
    if o_ratio > o_ratio_threshold:
        continue

    # Min-max normalize the sequence length to [0, 1)
    length_score = (total_tokens - min_length) / length_range

    # Combined score: longer sequences with a smaller share of 'O' labels rank higher
    combined_score = length_score * (1 - o_ratio)

    # Keep the scored utterance as (utterance, slots, intent, score)
    filtered_utterances.append((utterance, utterance_slots, utterance_intent, combined_score))

# Step 3: Sort by combined score, highest first (ties keep their dataset order
# because sorted() is stable)
sorted_utterances = sorted(filtered_utterances, key=lambda item: item[3], reverse=True)

# Step 4: Apply diversity selection -- walk the ranked utterances and keep the
# best-scoring example of each intent until k examples have been chosen
selected_utterances = []
selected_intents = set()
selected_slots = set()

k = 5  # Maximum number of examples to select

for utterance, utterance_slots, utterance_intent, combined_score in sorted_utterances:
    # Stop once the desired number is reached (selects nothing when k <= 0)
    if len(selected_utterances) >= k:
        break

    # Only intent diversity gates selection: an utterance is added when its
    # intent has not been selected yet. Slot-based diversity is deliberately
    # not part of the condition.
    if utterance_intent in selected_intents:
        continue

    selected_utterances.append((utterance, utterance_slots, utterance_intent, combined_score))
    selected_intents.add(utterance_intent)

    # Record the non-'O' slot labels covered so far; this set does not
    # influence selection and is kept for inspection only.
    selected_slots.update(slot for slot in utterance_slots if slot != 'O')

# Step 5 (currently disabled): Handle cases where diversity criteria can't fill k
# if len(selected_utterances) < k:
#     # Remove already-selected utterances from the sorted list
#     remaining_utterances = [
#         (utterance, utterance_slots, utterance_intent, combined_score)
#         for utterance, utterance_slots, utterance_intent, combined_score in sorted_utterances
#         if (utterance, utterance_slots, utterance_intent, combined_score) not in selected_utterances
#     ]

#     # Add the remaining top-scoring examples to meet the k requirement
#     for utterance, utterance_slots, utterance_intent, combined_score in remaining_utterances:
#         selected_utterances.append((utterance, utterance_slots, utterance_intent, combined_score))
#         if len(selected_utterances) == k:
#             break


# Report the chosen examples in selection order, numbering ranks from 1
print("\nSelected top-k utterances based on combined scoring and diversity:")
for rank, (utterance, utterance_slots, utterance_intent, score) in enumerate(selected_utterances, start=1):
    # Build the six-line record first, then emit it with a single print call
    report_lines = (
        f"Rank {rank}:",
        f"Utterance: {utterance}",
        f"Slots: {utterance_slots}",
        f"Intent: {utterance_intent}",
        f"Combined score: {score:.4f}",
        "---",
    )
    print("\n".join(report_lines))