In [1]:
# Required Imports
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
import stanza
import re
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

# Load Dataset
train_df = pd.read_json('train.model-agnostic.json')

# Select rows where the 'task' column is either 'MT' or 'PG'
filtered_df = train_df[train_df['task'].isin(['PG'])]

# Reset the index 
filtered_df = filtered_df.reset_index(drop=True)

In [2]:
# Load Stanza NLP models
stanza.download('en')
nlp_en = stanza.Pipeline('en', processors='tokenize,lemma')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-07 16:56:02 INFO: Downloaded file to C:\Users\Admin\stanza_resources\resources.json
2024-11-07 16:56:02 INFO: Downloading default packages for language: en (English) ...
2024-11-07 16:56:03 INFO: File exists: C:\Users\Admin\stanza_resources\en\default.zip
2024-11-07 16:56:06 INFO: Finished downloading models and saved to C:\Users\Admin\stanza_resources
2024-11-07 16:56:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-07 16:56:06 INFO: Downloaded file to C:\Users\Admin\stanza_resources\resources.json
2024-11-07 16:56:06 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2024-11-07 16:56:06 INFO: Using device: cpu
2024-11-07 16:56:06 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-07 16:56:07 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-07 16:56:07 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-07 16:56:07 INFO: Done loading processors!


In [6]:

# Text Segmentation and Normalization
def normalize_text(text):
    # Expand contractions (e.g., "you're" to "you are")
    expanded_text = contractions.fix(text)
    # Lowercase the expanded text
    expanded_text = expanded_text.lower()  
    # Remove punctuation
    expanded_text = re.sub(r'[^\w\s]', '', expanded_text)
    return expanded_text

# Apply normalization
filtered_df['hyp_normalized'] = filtered_df['hyp'].apply(normalize_text)
filtered_df['src_normalized'] = filtered_df['src'].apply(normalize_text)

# Sentence Segmentation
filtered_df['hyp_sentences'] = filtered_df['hyp_normalized'].apply(sent_tokenize)
filtered_df['src_sentences'] = filtered_df['src_normalized'].apply(sent_tokenize)

# Tokenization
filtered_df['hyp_tokens'] = filtered_df['hyp_sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
filtered_df['src_tokens'] = filtered_df['src_sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])

# Lemmatization
def preprocess_text(text):
    doc = nlp_en(text)  
    lemmas = [word.lemma for sentence in doc.sentences for word in sentence.words]
    return lemmas

# Apply lemmatization 
filtered_df['hyp_lemmas'] = filtered_df['hyp_normalized'].apply(preprocess_text)
filtered_df['src_lemmas'] = filtered_df['src_normalized'].apply(preprocess_text)

### Columns to Keep
1. **hyp** and **src**: Original texts, important as the base for comparison.
2. **hyp_normalized** and **src_normalized**: Lowercased and punctuation-removed text, useful for a quick comparison without worrying about case sensitivity or punctuation.
3. **hyp_lemmas** and **src_lemmas**: Lemmatized versions of the text, useful for semantic comparisons and to see if any new concepts are introduced in `hyp` that weren’t in `src`.

### Optional Columns
1. **hyp_sentences** and **src_sentences**: If the data typically consists of multiple sentences per row, these could be helpful for sentence-level comparison. However, if the text is mostly single-sentence, they might be redundant.
2. **hyp_tokens** and **src_tokens**: These columns are useful if you plan to do token-level analysis, such as checking for specific word overlaps. If your focus is mainly on lemmas (which are conceptually higher-level), you may not need tokens.


In [7]:
# Join lemmas back into text format for each row only within the similarity analysis functions
hyp_lemmas_text = filtered_df['hyp_lemmas'].apply(lambda x: ' '.join(x))
src_lemmas_text = filtered_df['src_lemmas'].apply(lambda x: ' '.join(x))

# Similarity Analysis
# Cosine Similarity using TF-IDF
vectorizer = TfidfVectorizer()
hyp_tfidf = vectorizer.fit_transform(hyp_lemmas_text)
src_tfidf = vectorizer.transform(src_lemmas_text)
filtered_df['cosine_similarity'] = cosine_similarity(hyp_tfidf, src_tfidf).diagonal()

# Semantic Similarity using Sentence Transformers
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
src_embeddings = model.encode(src_lemmas_text.tolist(), convert_to_tensor=True)
hyp_embeddings = model.encode(hyp_lemmas_text.tolist(), convert_to_tensor=True)
filtered_df['semantic_similarity'] = [sim.item() for sim in util.pytorch_cos_sim(src_embeddings, hyp_embeddings).diag()]

# Display results
filtered_df.head()

Unnamed: 0,hyp,tgt,src,ref,task,model,hyp_normalized,src_normalized,hyp_sentences,src_sentences,hyp_tokens,src_tokens,hyp_lemmas,src_lemmas,cosine_similarity,semantic_similarity
0,"You're not alone, claire- -",,"You're not alone, Claire.",src,PG,,you are not alone claire,you are not alone claire,[you are not alone claire],[you are not alone claire],"[[you, are, not, alone, claire]]","[[you, are, not, alone, claire]]","[you, be, not, alone, claire]","[you, be, not, alone, claire]",1.0,1.0
1,"Who told you to throw acid at Vargas, hmmm?",,"Who told you to throw acid at Vargas, hmm?",src,PG,,who told you to throw acid at vargas hmmm,who told you to throw acid at vargas hmm,[who told you to throw acid at vargas hmmm],[who told you to throw acid at vargas hmm],"[[who, told, you, to, throw, acid, at, vargas,...","[[who, told, you, to, throw, acid, at, vargas,...","[who, tell, you, to, throw, acid, at, vargas, ...","[who, tell, you, to, throw, acid, at, vargas, ...",1.0,1.0
2,♪ Where the pure angel merges with the antic s...,,Where the pure angel merges with the antic Sphinx,src,PG,,where the pure angel merges with the antic sp...,where the pure angel merges with the antic sphinx,[ where the pure angel merges with the antic s...,[where the pure angel merges with the antic sp...,"[[where, the, pure, angel, merges, with, the, ...","[[where, the, pure, angel, merges, with, the, ...","[where, the, pure, angel, merge, with, the, an...","[where, the, pure, angel, merge, with, the, an...",1.0,1.0
3,Where is it written what is it I'm meant to be?,,Where is it written what is it I'm meant to be,src,PG,,where is it written what is it i am meant to be,where is it written what is it i am meant to be,[where is it written what is it i am meant to be],[where is it written what is it i am meant to be],"[[where, is, it, written, what, is, it, i, am,...","[[where, is, it, written, what, is, it, i, am,...","[where, be, it, write, what, be, it, I, be, me...","[where, be, it, write, what, be, it, I, be, me...",1.0,1.0
4,We'll find the skipper and then we'll go home.,,We'll find the skipper and then we'll go home.,src,PG,,we will find the skipper and then we will go home,we will find the skipper and then we will go home,[we will find the skipper and then we will go ...,[we will find the skipper and then we will go ...,"[[we, will, find, the, skipper, and, then, we,...","[[we, will, find, the, skipper, and, then, we,...","[we, will, find, the, skipper, and, then, we, ...","[we, will, find, the, skipper, and, then, we, ...",1.0,1.0


In [8]:
# Filter rows where cosine_similarity is not 1.0
filtered_non_one_similarity = filtered_df[filtered_df['cosine_similarity'] < 1]

# Display the filtered rows
filtered_non_one_similarity

Unnamed: 0,hyp,tgt,src,ref,task,model,hyp_normalized,src_normalized,hyp_sentences,src_sentences,hyp_tokens,src_tokens,hyp_lemmas,src_lemmas,cosine_similarity,semantic_similarity
5,Seymour's Darling is the third... and little A...,,Seymour's Darling is third... and little Arnie...,src,PG,,seymours darling is the third and little arnie...,seymours darling is third and little arnie mov...,[seymours darling is the third and little arni...,[seymours darling is third and little arnie mo...,"[[seymours, darling, is, the, third, and, litt...","[[seymours, darling, is, third, and, little, a...","[seymour, darling, be, the, third, and, little...","[seymour, darling, be, third, and, little, arn...",0.798683,0.896277
6,"- Scud, do you read me, please?",,"Scud, do you read me?",src,PG,,scud do you read me please,scud do you read me,[ scud do you read me please],[scud do you read me],"[[scud, do, you, read, me, please]]","[[scud, do, you, read, me]]","[scud, do, you, read, I, please]","[scud, do, you, read, I]",0.900309,0.902591
9,¿Mabel's a slave?,,Is Mabel a slave?,src,PG,,mabels a slave,is mabel a slave,[mabels a slave],[is mabel a slave],"[[mabels, a, slave]]","[[is, mabel, a, slave]]","[mabel, a, slave]","[be, mabel, a, slave]",0.989234,0.897224
12,Homicide investigators have told me that they ...,,Homicide investigators have told me that they ...,src,PG,,homicide investigators have told me that they ...,homicide investigators have told me that they ...,[homicide investigators have told me that they...,[homicide investigators have told me that they...,"[[homicide, investigators, have, told, me, tha...","[[homicide, investigators, have, told, me, tha...","[homicide, investigator, have, tell, I, that, ...","[homicide, investigator, have, tell, I, that, ...",0.991717,0.998738
13,"Oh, he's an In-Valid, sir.",,"He's an In-Valid, sir.",src,PG,,oh he is an invalid sir,he is an invalid sir,[oh he is an invalid sir],[he is an invalid sir],"[[oh, he, is, an, invalid, sir]]","[[he, is, an, invalid, sir]]","[oh, he, be, a, invalid, sir]","[he, be, a, invalid, sir]",0.886771,0.897194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,⁇ in?,,Are you coming in?,src,PG,,in,are you coming in,[ in],[are you coming in],[[in]],"[[are, you, coming, in]]",[in],"[be, you, come, in]",0.601023,0.602633
9991,- What'd he be seeing?,,What did he see?,src,PG,,what did he be seeing,what did he see,[ what did he be seeing],[what did he see],"[[what, did, he, be, seeing]]","[[what, did, he, see]]","[what, do, he, be, see]","[what, do, he, see]",0.972964,0.987963
9994,I-I don't know who that guy is.,,I don't know who that is.,src,PG,,ii do not know who that guy is,i do not know who that is,[ii do not know who that guy is],[i do not know who that is],"[[ii, do, not, know, who, that, guy, is]]","[[i, do, not, know, who, that, is]]","[ii, do, not, know, who, that, guy, be]","[I, do, not, know, who, that, be]",0.735315,0.863496
9996,Time?,,The time?,src,PG,,time,the time,[time],[the time],[[time]],"[[the, time]]",[time],"[the, time]",0.829122,0.962874


In [9]:
filtered_df.to_csv('filtered_df.csv')
filtered_non_one_similarity.to_csv('filtered_non_one_similarity.csv')