In [None]:
# Step 10: perturb the dev data for the robustness study
import random
import re
def perturbate_dev_data(dev_data, seed=None, synonyms=None):
    """Return a copy of ``dev_data`` with noisy text in columns ``3`` and ``4``.

    Column ``3`` goes through: synonym replacement -> "is" to "are" ->
    one-character typo.
    Column ``4`` goes through: one-character typo -> "is" to "are" ->
    one inserted noise token.

    Parameters
    ----------
    dev_data : pandas.DataFrame
        Frame with columns labelled ``3`` and ``4`` that are expected to hold
        sentence strings. The input frame is not modified.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` drives every random
        choice, so the same seed yields the same perturbed frame and the
        global ``random`` state is left untouched. When ``None`` (default)
        the module-level ``random`` functions are used, as before.
    synonyms : dict or None, optional
        Mapping ``word -> replacement`` used for column ``3``. Defaults to the
        built-in table below.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dev_data`` whose columns ``3`` and ``4`` are perturbed.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Both the ``random`` module and a ``random.Random`` instance expose
    # ``randint`` and ``choice``, so the helpers can use either one.
    rng = random if seed is None else random.Random(seed)

    def add_typo(sentence):
        """Replace one character of one randomly chosen word with a different letter."""
        words = sentence.split()
        if not words:
            return sentence
        word_idx = rng.randint(0, len(words) - 1)
        word = words[word_idx]
        char_idx = rng.randint(0, len(word) - 1)
        # Exclude the current character so the "typo" can never be a no-op.
        candidates = letters.replace(word[char_idx], '')
        words[word_idx] = word[:char_idx] + rng.choice(candidates) + word[char_idx + 1:]
        return ' '.join(words)

    def replace_with_synonym(sentence, synonyms_dict):
        """Swap every whitespace-separated word that is an exact key of ``synonyms_dict``."""
        return ' '.join(synonyms_dict.get(word, word) for word in sentence.split())

    def introduce_grammar_error(sentence):
        """Replace every standalone, lower-case "is" with "are"."""
        return re.sub(r'\bis\b', 'are', sentence)

    def add_random_noise(sentence):
        """Insert one noise token ('xx', 'yy', 'zz' or a random letter) at a random position."""
        words = sentence.split()
        noise_words = ['xx', 'yy', 'zz', rng.choice(letters)]
        insert_pos = rng.randint(0, len(words))
        words.insert(insert_pos, rng.choice(noise_words))
        return ' '.join(words)

    default_synonyms = {
        "board": "plank",
        "circulate": "distribute",
        "hook": "catch",
        "recreation": "leisure",
        "domesticity": "homeliness",
        "acquisition": "purchase",
        "meeting": "gathering",
        "nude": "bare",
        "mark": "impression",
        "association": "connection",
        "inclination": "tendency",
        "glaze": "coat",
        "piggyback": "carry",
        "pick": "choose",
        "lecture": "talk",
        "bondage": "captivity",
    }
    synonyms_dict = default_synonyms if synonyms is None else synonyms

    def perturb_first_sentence(text):
        """Pipeline for column 3; non-string cells (e.g. NaN) are returned unchanged."""
        if not isinstance(text, str):
            return text
        return add_typo(introduce_grammar_error(replace_with_synonym(text, synonyms_dict)))

    def perturb_second_sentence(text):
        """Pipeline for column 4; non-string cells (e.g. NaN) are returned unchanged."""
        if not isinstance(text, str):
            return text
        return add_random_noise(introduce_grammar_error(add_typo(text)))

    # Work on a copy so the caller's frame keeps the clean sentences.
    perturbed_dev = dev_data.copy()
    perturbed_dev[3] = perturbed_dev[3].apply(perturb_first_sentence)
    perturbed_dev[4] = perturbed_dev[4].apply(perturb_second_sentence)

    return perturbed_dev


In [None]:
# Build the perturbed copy of the dev set used by the robustness study.
perturbed_dev_data = perturbate_dev_data(dev_data2)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
perturbed_dev_data.info()

# Last expression of the cell: rich display of the first 15 perturbed rows.
perturbed_dev_data.head(15)



In [None]:
def _content_tokens(text):
    """Tokenize ``text`` with gensim, keeping tokens of at least two characters."""
    return gensim.utils.simple_preprocess(text, min_len=2)


def _mean_log2_frequency(text):
    """Mean of log2 over the first ``record`` entry of each token in ``text``.

    Tokens are looked up under the key ``(token, 'NOUN', 'NON_STOP')``; a
    missing key falls back to ``[1]``, which contributes log2(1) = 0.
    NOTE(review): ``record`` values are presumably frequency counts - confirm.
    """
    log_values = []
    for token in _content_tokens(text):
        entry = record.get((token, 'NOUN', 'NON_STOP'), [1])
        log_values.append(np.log2(entry[0]))
    return np.mean(log_values)


def _contains_stop_word(text):
    """True when at least one token of ``text`` is in ``stop_words``."""
    for token in _content_tokens(text):
        if token in stop_words:
            return True
    return False


def _embed_sentence(text):
    """Embedding of ``text`` as returned by ``emb``."""
    return emb(text)


# (target column, source column, per-sentence feature function), listed in
# the order in which the columns are added to the frame.
_feature_plan = (
    ('log2freq1', 3, _mean_log2_frequency),
    ('log2freq2', 4, _mean_log2_frequency),
    ('is_stop_word1', 3, _contains_stop_word),
    ('is_stop_word2', 4, _contains_stop_word),
    ('emb1', 3, _embed_sentence),
    ('emb2', 4, _embed_sentence),
)
for _target, _source, _builder in _feature_plan:
    perturbed_dev_data[_target] = perturbed_dev_data[_source].apply(_builder)

In [None]:
def _discounted_similarity_for_row(row):
    """Score one row with ``discounted_cosine_similarity`` using ``best_parameters``.

    Passes the row's two embeddings, two log2-frequency values and two
    stop-word flags, in that order.
    """
    return discounted_cosine_similarity(
        row['emb1'], row['emb2'],
        row['log2freq1'], row['log2freq2'],
        row['is_stop_word1'], row['is_stop_word2'],
        best_parameters,
    )


# axis=1 -> the function receives one row at a time.
perturbed_dev_data['cosine_similarity_discounted'] = perturbed_dev_data.apply(
    _discounted_similarity_for_row, axis=1
)


In [None]:
from sklearn.metrics import accuracy_score


def _label_from_similarity(score):
    """Return 'T' when ``score`` reaches ``best_parameters['threshold']``, otherwise 'F'."""
    if score >= best_parameters['threshold']:
        return 'T'
    return 'F'


# Turn each discounted similarity into a 'T'/'F' prediction.
perturbed_dev_data['predictions'] = (
    perturbed_dev_data['cosine_similarity_discounted'].apply(_label_from_similarity)
)

# Compare the predictions with the gold labels of the dev set.
perturbed_accuracy = accuracy_score(dev_gold_labels, perturbed_dev_data['predictions'])
print(f"Discounted Accuracy on Perturbed Dev Set: {perturbed_accuracy:.2%}")


In [None]:
# Boolean mask: True where the prediction equals the gold label.
# NOTE(review): assumes dev_gold_labels lines up element-wise with the frame - confirm.
_is_correct = perturbed_dev_data['predictions'] == dev_gold_labels

success_cases = perturbed_dev_data[_is_correct]
failure_cases = perturbed_dev_data[~_is_correct]

# Show the first two rows of each group.
print("Examples of Success Cases:")
print(success_cases.head(2))

print("\nExamples of Failure Cases:")
print(failure_cases.head(2))


In [None]:
# Recall and F1 on the perturbed dev set, with 'T' as the positive class.
_perturbed_predictions = perturbed_dev_data['predictions']
perturbed_recall = recall_score(dev_gold_labels, _perturbed_predictions, pos_label='T')
perturbed_f1 = f1_score(dev_gold_labels, _perturbed_predictions, pos_label='T')

print(
    "Perturbed Dev Set Metrics:",
    f"Recall: {perturbed_recall:.2%}",
    f"F1-Score: {perturbed_f1:.2%}\n",
    sep="\n",
)

# Confusion matrix with rows/columns fixed to the order 'T', 'F'.
cm_perturbed = confusion_matrix(
    dev_gold_labels,
    _perturbed_predictions,
    labels=['T', 'F'],
)
disp_perturbed = ConfusionMatrixDisplay(
    confusion_matrix=cm_perturbed,
    display_labels=['T (Similar)', 'F (Different)'],
)
disp_perturbed.plot(cmap='Oranges')
plt.title('Confusion Matrix (Perturbed Dev Set)')
plt.show()
