In [31]:
# Required Imports
import re
from pathlib import Path

import contractions
import nltk
import numpy as np
import pandas as pd
import stanza
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Load Datasets
# Every split goes through the same four steps: read the JSON file, keep only
# rows whose task is 'PG', drop the columns preprocessing does not use, and
# renumber the index from zero. The dropped columns differ per split.
train_df = (
    pd.read_json('dataset/train.model-agnostic.json')
    .loc[lambda frame: frame['task'].isin(['PG'])]
    .drop(columns=['tgt', 'model', 'ref', 'task'])
    .reset_index(drop=True)
)

test_df = (
    pd.read_json('dataset/test.model-agnostic.json')
    .loc[lambda frame: frame['task'].isin(['PG'])]
    .drop(columns=['tgt', 'task', 'labels', 'label', 'p(Hallucination)', 'id'])
    .reset_index(drop=True)
)

val_df = (
    pd.read_json('dataset/val.model-agnostic.json')
    .loc[lambda frame: frame['task'].isin(['PG'])]
    .drop(columns=['tgt', 'model', 'ref', 'task', 'labels', 'label', 'p(Hallucination)'])
    .reset_index(drop=True)
)

In [32]:
# Preview the filtered training frame; only `hyp` and `src` remain after the column drop.
train_df

Unnamed: 0,hyp,src
0,"You're not alone, claire- -","You're not alone, Claire."
1,"Who told you to throw acid at Vargas, hmmm?","Who told you to throw acid at Vargas, hmm?"
2,♪ Where the pure angel merges with the antic s...,Where the pure angel merges with the antic Sphinx
3,Where is it written what is it I'm meant to be?,Where is it written what is it I'm meant to be
4,We'll find the skipper and then we'll go home.,We'll find the skipper and then we'll go home.
...,...,...
9995,"Yeah, I'm listening.","Yeah, I'm listening."
9996,Time?,The time?
9997,Plague?,A plague?
9998,"Tango, Tango.",Tango.


In [24]:
# Preview the filtered test frame; only `src` and `hyp` remain after the column drop.
test_df

Unnamed: 0,src,hyp
0,Here she comes.,"Here she comes, here she comes."
1,Everything will be allright.,Everythings fine.
2,I'm not familiar with who that is.,I am unfamiliar with who that is.
3,It's turning me into a crazy person.,It turns me into madness.
4,I'm joking.,I'm--I'm joking.
...,...,...
370,What's the rush?,Wh-What's the rush?
371,"Fortunately, I got a plan.","But luckily, I've got a plan."
372,I'll get my things.,I'll get my trinkets.
373,Can anyone back you up on that?,Can anyone corroborate that?


In [25]:
# Preview the filtered validation frame; only `hyp` and `src` remain after the column drop.
val_df

Unnamed: 0,hyp,src
0,I have not been contacted.,I haven't been contacted by anybody.
1,I thought you'd be surprised at me too.,"I thought so, too."
2,Is she gonna be okay?,Is she gonna be okay?
3,How long before you're making that happen?,How long before you make that happen?
4,You've got a customer.,You've got a client.
...,...,...
120,"We don't have the money to risk it, all right?",We can't afford to risk it.
121,"Uh, just for a couple of days.","Eh, just a few days."
122,I'm not in any of this at all.,I'm not involved in this.
123,"Just breathe deep, and I'll be right there.",Just breathe deep.


In [6]:
# Load Stanza NLP models
# Fetch the default English model package. The log below reports "File exists"
# when the archive is already cached, so only resources.json is re-downloaded.
stanza.download('en')
# Build one shared pipeline that requests tokenization and lemmatization only.
# The load log shows Stanza also loading the `mwt` processor and using the CPU.
nlp_en = stanza.Pipeline('en', processors='tokenize,lemma')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-08 12:17:00 INFO: Downloaded file to C:\Users\Admin\stanza_resources\resources.json
2024-11-08 12:17:00 INFO: Downloading default packages for language: en (English) ...
2024-11-08 12:17:01 INFO: File exists: C:\Users\Admin\stanza_resources\en\default.zip
2024-11-08 12:17:03 INFO: Finished downloading models and saved to C:\Users\Admin\stanza_resources
2024-11-08 12:17:03 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-08 12:17:04 INFO: Downloaded file to C:\Users\Admin\stanza_resources\resources.json
2024-11-08 12:17:04 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2024-11-08 12:17:04 INFO: Using device: cpu
2024-11-08 12:17:04 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-08 12:17:04 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-08 12:17:04 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-08 12:17:04 INFO: Done loading processors!


In [26]:
# Text Segmentation and Normalization Function
def normalize_text(text):
    """Return a lowercase, punctuation-free, single-spaced version of ``text``.

    Processing order:
      1. Expand English contractions via ``contractions.fix`` and lowercase.
      2. Delete apostrophes (straight and curly) so possessives and any
         contraction left unexpanded stay a single token.
      3. Replace every other non-word, non-whitespace character with a space.
         Deleting it outright would fuse words that are separated only by
         punctuation, e.g. ``"I'm--I'm joking"`` -> ``"i ami am joking"``.
      4. Collapse whitespace runs to one space and strip both ends, so removed
         punctuation does not leave leading, trailing or double spaces behind.

    Args:
        text (str): Raw input string.

    Returns:
        str: The normalized string (may be empty).
    """
    expanded_text = contractions.fix(text).lower()  # Expand contractions and lowercase
    without_apostrophes = re.sub(r"['\u2019]", '', expanded_text)
    spaced = re.sub(r'[^\w\s]', ' ', without_apostrophes)  # Punctuation -> space, never fuse words
    return re.sub(r'\s+', ' ', spaced).strip()

# Lemmatization Function
def preprocess_text(text):
    """Run ``text`` through the shared ``nlp_en`` pipeline and collect lemmas.

    Args:
        text (str): Text to lemmatize.

    Returns:
        list: The ``lemma`` attribute of every word, in document order, with
        sentence boundaries flattened away.
    """
    collected = []
    for parsed_sentence in nlp_en(text).sentences:
        collected.extend(token.lemma for token in parsed_sentence.words)
    return collected

# Apply Preprocessing Steps to Dataset
def _split_normalized_sentences(text):
    """Split raw ``text`` into sentences, then normalize each sentence.

    Segmentation has to see the original punctuation: once ``normalize_text``
    has removed it, ``sent_tokenize`` has no boundary markers left and every
    row collapses into a single pseudo-sentence.
    """
    normalized = (normalize_text(sentence) for sentence in sent_tokenize(text))
    # Drop fragments that were pure punctuation and normalized to nothing.
    return [sentence for sentence in normalized if sentence.strip()]


def preprocess_dataset(df):
    """Add normalized, sentence, token and lemma columns for ``hyp`` and ``src``.

    New columns, added in this order for each stage (``hyp`` first, then ``src``):
      * ``*_normalized`` - ``normalize_text`` applied to the whole string.
      * ``*_sentences``  - list of normalized sentences, segmented on the RAW text.
      * ``*_tokens``     - one ``word_tokenize`` token list per sentence.
      * ``*_lemmas``     - flat lemma list from ``preprocess_text`` run on the
        normalized string.

    Args:
        df (pandas.DataFrame): Frame with string columns ``hyp`` and ``src``.

    Returns:
        pandas.DataFrame: The same ``df`` object; columns are added in place.
    """
    text_columns = ('hyp', 'src')

    # Normalization
    for column in text_columns:
        df[f'{column}_normalized'] = df[column].apply(normalize_text)

    # Sentence Segmentation (on the raw text, before punctuation is stripped)
    for column in text_columns:
        df[f'{column}_sentences'] = df[column].apply(_split_normalized_sentences)

    # Tokenization
    for column in text_columns:
        df[f'{column}_tokens'] = df[f'{column}_sentences'].apply(
            lambda sentences: [word_tokenize(sentence) for sentence in sentences]
        )

    # Lemmatization
    for column in text_columns:
        df[f'{column}_lemmas'] = df[f'{column}_normalized'].apply(preprocess_text)

    return df

# Preprocess each dataset
# Splits are processed in train, test, validation order and rebound to the same names.
train_df, test_df, val_df = [
    preprocess_dataset(split) for split in (train_df, test_df, val_df)
]

### Columns to Keep
1. **hyp** and **src**: Original texts, important as the base for comparison.
2. **hyp_normalized** and **src_normalized**: Text with contractions expanded, lowercased, and punctuation removed; useful for a quick comparison that ignores case, punctuation, and contracted vs. expanded forms.
3. **hyp_lemmas** and **src_lemmas**: Lemmatized versions of the text, useful for semantic comparisons and to see if any new concepts are introduced in `hyp` that weren’t in `src`.

### Optional Columns
1. **hyp_sentences** and **src_sentences**: If the data typically consists of multiple sentences per row, these could be helpful for sentence-level comparison. However, if the text is mostly single-sentence, they might be redundant.
2. **hyp_tokens** and **src_tokens**: These columns are useful if you plan to do token-level analysis, such as checking for specific word overlaps. If your focus is mainly on lemmas (which are conceptually higher-level), you may not need tokens.


In [27]:
# Function for Similarity Analysis
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return a SentenceTransformer for ``model_name``, loading it at most once."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def _join_lemmas(lemmas):
    """Join a lemma list into one string, skipping missing or empty lemmas."""
    return ' '.join(lemma for lemma in lemmas if lemma)


def calculate_similarity(df, model_name='distiluse-base-multilingual-cased-v2'):
    """Add row-wise ``cosine_similarity`` and ``semantic_similarity`` columns.

    Both scores compare row ``i`` of ``hyp_lemmas`` with row ``i`` of
    ``src_lemmas``:

      * ``cosine_similarity``  - cosine between TF-IDF vectors. The vectorizer
        is fitted on hyp AND src text so that words occurring only in ``src``
        are part of the vocabulary instead of being silently ignored.
      * ``semantic_similarity`` - cosine between sentence-transformer embeddings.

    Only the N paired scores are computed; no N x N similarity matrix is built.

    Args:
        df (pandas.DataFrame): Frame with list-valued ``hyp_lemmas`` and
            ``src_lemmas`` columns.
        model_name (str): Sentence-transformers model to use. The default is
            the model this notebook has always used.

    Returns:
        pandas.DataFrame: The same ``df`` object; the two columns are added in place.
    """
    # Join lemmas back into text format
    hyp_lemmas_text = df['hyp_lemmas'].apply(_join_lemmas)
    src_lemmas_text = df['src_lemmas'].apply(_join_lemmas)

    # Cosine Similarity using TF-IDF
    # norm='l2' (also the default) makes every non-empty row unit length, so
    # the row-wise dot product below is exactly the cosine similarity.
    vectorizer = TfidfVectorizer(norm='l2')
    vectorizer.fit(pd.concat([hyp_lemmas_text, src_lemmas_text], ignore_index=True))
    hyp_tfidf = vectorizer.transform(hyp_lemmas_text)
    src_tfidf = vectorizer.transform(src_lemmas_text)
    paired_dot = hyp_tfidf.multiply(src_tfidf).sum(axis=1)
    df['cosine_similarity'] = np.asarray(paired_dot).ravel()

    # Semantic Similarity using Sentence Transformers
    model = _get_sentence_model(model_name)
    src_embeddings = model.encode(src_lemmas_text.tolist(), convert_to_tensor=True)
    hyp_embeddings = model.encode(hyp_lemmas_text.tolist(), convert_to_tensor=True)
    # pairwise_cos_sim scores (src[i], hyp[i]) pairs only.
    df['semantic_similarity'] = util.pairwise_cos_sim(src_embeddings, hyp_embeddings).tolist()

    return df

# Apply similarity calculations to each dataset
# Splits are scored in train, test, validation order and rebound to the same names.
train_df, test_df, val_df = [
    calculate_similarity(split) for split in (train_df, test_df, val_df)
]

In [28]:
# Display results for each dataset
print("Train Data:")
print(train_df.head())
print("\nTest Data:")
print(test_df.head())
print("\nValidation Data:")
print(val_df.head())

# Keep rows whose sentence-embedding similarity is below the threshold for
# further analysis. The filter uses `semantic_similarity`, not the TF-IDF
# `cosine_similarity` column, and the printed labels below now say so.
SEMANTIC_SIMILARITY_THRESHOLD = 0.9
filtered_train_non_one_similarity = train_df[train_df['semantic_similarity'] < SEMANTIC_SIMILARITY_THRESHOLD]
filtered_test_non_one_similarity = test_df[test_df['semantic_similarity'] < SEMANTIC_SIMILARITY_THRESHOLD]
filtered_val_non_one_similarity = val_df[val_df['semantic_similarity'] < SEMANTIC_SIMILARITY_THRESHOLD]

# Display filtered rows
print(f"\nTrain Data (Semantic Similarity < {SEMANTIC_SIMILARITY_THRESHOLD}):")
print(filtered_train_non_one_similarity.head())
print(f"\nTest Data (Semantic Similarity < {SEMANTIC_SIMILARITY_THRESHOLD}):")
print(filtered_test_non_one_similarity.head())
print(f"\nValidation Data (Semantic Similarity < {SEMANTIC_SIMILARITY_THRESHOLD}):")
print(filtered_val_non_one_similarity.head())

Train Data:
                                                 hyp  \
0                        You're not alone, claire- -   
1        Who told you to throw acid at Vargas, hmmm?   
2  ♪ Where the pure angel merges with the antic s...   
3    Where is it written what is it I'm meant to be?   
4     We'll find the skipper and then we'll go home.   

                                                 src  \
0                          You're not alone, Claire.   
1         Who told you to throw acid at Vargas, hmm?   
2  Where the pure angel merges with the antic Sphinx   
3     Where is it written what is it I'm meant to be   
4     We'll find the skipper and then we'll go home.   

                                      hyp_normalized  \
0                          you are not alone claire    
1          who told you to throw acid at vargas hmmm   
2   where the pure angel merges with the antic sp...   
3    where is it written what is it i am meant to be   
4  we will find the skipper and th

In [30]:
# Persist the enriched frames. The output directory is created first so the
# export also works on a fresh checkout where `generated_files/` is missing.
OUTPUT_DIR = Path('generated_files')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): list-valued columns (sentences, tokens, lemmas) are written as
# their string representation in CSV; they must be parsed again after reloading.
train_df.to_csv(OUTPUT_DIR / 'train_df.csv')
test_df.to_csv(OUTPUT_DIR / 'test_df.csv')
val_df.to_csv(OUTPUT_DIR / 'val_df.csv')
# filtered_train_non_one_similarity.to_csv('generated_files/filtered_train_non_one_similarity.csv')
# filtered_test_non_one_similarity.to_csv('generated_files/filtered_test_non_one_similarity.csv')
# filtered_val_non_one_similarity.to_csv('generated_files/filtered_val_non_one_similarity.csv')