### Step 1: Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, log_loss
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline
from scipy.stats import spearmanr
import torch.nn.functional as F
import json
from collections import Counter
import nltk
import stanza
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from stanza.utils.conll import CoNLL

In [2]:
# Download required NLTK and Stanza models
nltk.download('punkt')
nltk.download('stopwords')
stanza.download('en')
stanza.download('de')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-03 15:42:45 INFO: Downloaded file to C:\Users\Admin\stanza_resources\resources.json
2024-11-03 15:42:45 INFO: Downloading default packages for language: en (English) ...
2024-11-03 15:42:46 INFO: File exists: C:\Users\Admin\stanza_resources\en\default.zip
2024-11-03 15:42:49 INFO: Finished downloading models and saved to C:\Users\Admin\stanza_resources


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-03 15:42:50 INFO: Downloaded file to C:\Users\Admin\stanza_resources\resources.json
2024-11-03 15:42:50 INFO: Downloading default packages for language: de (German) ...
2024-11-03 15:42:51 INFO: File exists: C:\Users\Admin\stanza_resources\de\default.zip
2024-11-03 15:42:55 INFO: Finished downloading models and saved to C:\Users\Admin\stanza_resources


### Step 2: Load Data

In [3]:
# Load the dataset
train_df = pd.read_json('train.model-agnostic.json')

# Display the first few rows of the dataset
train_df.head()

Unnamed: 0,hyp,tgt,src,ref,task,model
0,"Don't worry, it's only temporary.",Don't worry. It's only temporary.,Не волнуйся. Это только временно.,either,MT,
1,Tom is never where he should be.,Tom is never where he's supposed to be.,"Тома никогда нет там, где он должен быть.",either,MT,
2,It's hard for me to work with Tom.,I have trouble working with Tom.,Мне сложно работать с Томом.,either,MT,
3,"Water, please.",I'd like some water.,"Воду, пожалуйста.",either,MT,
4,I didn't expect Tom to betray me.,I didn't think that Tom would betray me.,"Я не ожидал, что Том предаст меня.",either,MT,


In [4]:
# Load the dataset
train_dfx = pd.read_json('train.model-aware.v2.json')

# Display the first few rows of the dataset
train_dfx.head()

Unnamed: 0,hyp,tgt,src,ref,task,model
0,Of or pertaining to the language of a particul...,"Of or pertaining to everyday language , as opp...","There are blacktips , silvertips , bronze whal...",tgt,DM,ltg/flan-t5-definition-en-base
1,Not coercive ; not involving coercion,Not coercive ; free of coercion,Mr. obama signed executive orders requiring al...,tgt,DM,ltg/flan-t5-definition-en-base
2,To express or express by words ; to express by...,To depict or portray .,Disloyal ? / the word is too good to paint out...,tgt,DM,ltg/flan-t5-definition-en-base
3,Having the power to authoritatively speak or w...,Having a commanding style .,"He instructed us in that booming , authoritati...",tgt,DM,ltg/flan-t5-definition-en-base
4,Without a scot .,"Without consequences or penalties , to go free...",To get off scot-free . What is the meaning of ...,tgt,DM,ltg/flan-t5-definition-en-base


### Step 3: Exploratory Data Analysis 

In [4]:
# Check for missing values
print(train_df.isnull().sum())

# Get unique values in categorical columns
print("Unique values in 'task' column:", train_df['task'].unique())
print("Unique values in 'ref' column:", train_df['ref'].unique())
print("Unique values in 'model' column:", train_df['model'].unique())

hyp      0
tgt      0
src      0
ref      0
task     0
model    0
dtype: int64
Unique values in 'task' column: ['MT' 'DM' 'PG']
Unique values in 'ref' column: ['either' 'tgt' 'src']
Unique values in 'model' column: ['']


In [5]:
# Check data distribution in the 'task' column
print("Task Distribution:\n", train_df['task'].value_counts())

# Look at text length distributions for hyp, tgt, and src columns
train_df['hyp_length'] = train_df['hyp'].apply(lambda x: len(x.split()))
train_df['tgt_length'] = train_df['tgt'].apply(lambda x: len(x.split()))
train_df['src_length'] = train_df['src'].apply(lambda x: len(x.split()))

# Summary statistics of lengths
print("Hypothesis Text Length Stats:\n", train_df['hyp_length'].describe())
print("Target Text Length Stats:\n", train_df['tgt_length'].describe())
print("Source Text Length Stats:\n", train_df['src_length'].describe())

Task Distribution:
 MT    10000
PG    10000
DM    10000
Name: task, dtype: int64
Hypothesis Text Length Stats:
 count    30000.000000
mean         5.570867
std          3.401798
min          1.000000
25%          4.000000
50%          5.000000
75%          7.000000
max        108.000000
Name: hyp_length, dtype: float64
Target Text Length Stats:
 count    30000.000000
mean         5.325933
std          6.035543
min          0.000000
25%          0.000000
50%          5.000000
75%          8.000000
max         89.000000
Name: tgt_length, dtype: float64
Source Text Length Stats:
 count    30000.000000
mean        14.673300
std         19.259339
min          1.000000
25%          4.000000
50%          5.000000
75%         21.000000
max        457.000000
Name: src_length, dtype: float64


In [8]:
# Group by model and list unique reference types for each model
model_reference_usage = train_df.groupby('task')['ref'].unique().reset_index()

# Display reference types used by each model
print("Reference types used by each task:")
print(model_reference_usage)

Reference types used by each model:
  task       ref
0   DM     [tgt]
1   MT  [either]
2   PG     [src]


Based on the results, we will apply preprocessing steps accordingly to the specific task requirements: for **Definition Modeling (DM)** tasks, we will focus on the target (`tgt`) as the main reference, for **Paraphrase Generation (PG)** tasks, we will use the source (`src`) as the reference, and for **Machine Translation (MT)** tasks, we can consider either the source or the target as references, but for our analysis we will continue with (`src`).

### Step 4: Data Preprocessing 

#### Tokenization and Text Normalization

In [6]:
# Tokenize the text and count word frequencies
words = [word.lower() for text in train_df['hyp'] for word in word_tokenize(text)]
word_counts = Counter(words)
print(word_counts.most_common(20))

[('.', 23923), ('a', 7651), ('(', 7035), (')', 7034), (',', 6650), ('to', 6608), ('of', 6476), ('the', 5871), ('you', 4853), ('i', 4679), ('?', 4486), ("'s", 3047), ('tom', 2370), ('or', 2217), ('in', 2161), ("n't", 2109), ('do', 1924), ('it', 1844), ('is', 1586), ('that', 1489)]


Word frequency analysis reveals that punctuation and common function words dominate the top results. These typically don't contribute much meaningful information for analysis in tasks like hallucination detection. So 
Remove Punctuatio and 
Remove Stopworh.


In [7]:
# Tokenization and Text Normalization
def tokenize_and_normalize(text):
    tokens = word_tokenize(text.lower())  # Lowercasing
    tokens = [t for t in tokens if t.isalpha()]  # Removing punctuation (fshije)
    tokens = [t for t in tokens if t not in stopwords.words('english')]  # Removing stopwords (fshije)
    return tokens

train_df['hyp_tokens'] = train_df['hyp'].apply(tokenize_and_normalize)
train_df['tgt_tokens'] = train_df['tgt'].apply(tokenize_and_normalize)
train_df['src_tokens'] = train_df['src'].apply(tokenize_and_normalize)

In [8]:
# Token frequency analysis
hyp_word_counts = Counter([word for tokens in train_df['hyp_tokens'] for word in tokens])
print(hyp_word_counts.most_common(10))

[('tom', 2370), ('transitive', 899), ('pertaining', 834), ('informal', 738), ('one', 670), ('form', 649), ('know', 601), ('alternative', 587), ('obsolete', 570), ('something', 550)]


In [9]:
# Token frequency analysis
tgr_word_counts = Counter([word for tokens in train_df['tgt_tokens'] for word in tokens])
print(tgr_word_counts.most_common(10))

[('tom', 2506), ('one', 802), ('rare', 510), ('slang', 445), ('obsolete', 421), ('like', 416), ('transitive', 412), ('something', 403), ('form', 390), ('mary', 371)]


In [10]:
# Token frequency analysis
src_word_counts = Counter([word for tokens in train_df['src_tokens'] for word in tokens])
print(src_word_counts.most_common(10))

[('define', 10008), ('не', 2228), ('я', 2027), ('том', 1682), ('что', 1182), ('в', 1139), ('one', 973), ('это', 923), ('ты', 817), ('на', 758)]


In [11]:
train_df.head()

Unnamed: 0,hyp,tgt,src,ref,task,model,hyp_length,tgt_length,src_length,hyp_tokens,tgt_tokens,src_tokens
0,"Don't worry, it's only temporary.",Don't worry. It's only temporary.,Не волнуйся. Это только временно.,either,MT,,5,5,5,"[worry, temporary]","[worry, temporary]","[не, волнуйся, это, только, временно]"
1,Tom is never where he should be.,Tom is never where he's supposed to be.,"Тома никогда нет там, где он должен быть.",either,MT,,7,8,8,"[tom, never]","[tom, never, supposed]","[тома, никогда, нет, там, где, он, должен, быть]"
2,It's hard for me to work with Tom.,I have trouble working with Tom.,Мне сложно работать с Томом.,either,MT,,8,6,5,"[hard, work, tom]","[trouble, working, tom]","[мне, сложно, работать, с, томом]"
3,"Water, please.",I'd like some water.,"Воду, пожалуйста.",either,MT,,2,4,2,"[water, please]","[like, water]","[воду, пожалуйста]"
4,I didn't expect Tom to betray me.,I didn't think that Tom would betray me.,"Я не ожидал, что Том предаст меня.",either,MT,,7,8,7,"[expect, tom, betray]","[think, tom, would, betray]","[я, не, ожидал, что, том, предаст, меня]"


#### Search for Specific Patterns

In [12]:
# Search for Specific Patterns
def search_text_column(pattern, data, column='hyp'):
    return [row[column] for idx, row in data.iterrows() if re.search(pattern, row[column])]

# Example: Find rows in 'hyp' containing numbers
hyp_with_numbers = search_text_column(r'\d+', train_df, column='hyp')

# Pattern matching using regular expressions
def search_pattern(pattern, column, data=train_df):
    matches = []
    for _, row in data.iterrows():
        match = re.search(pattern, row[column])
        if match:
            matches.append(row[column])
    return matches

# Example pattern search
matches = search_pattern(r'\b(seven|7)\b', 'hyp')
print("Examples with the number seven:", matches[:5])

Examples with the number seven: ['He was born at 7 a.m. on June 5, 1970.', 'The population of Hong Kong is more than seven million people.', 'In 1951 at the Palace of Soviet Pioneers, British international master Robert Wade held a session of simultaneous play with 30 local children under 14 years old. After seven hours of MI Wade game, he managed to make 10 draws, losing the remaining 20 matches.', 'My mother had seven sons and four daughters, and she had five sisters.', 'My mother had seven sons and four daughters, and she had five sisters.']


#### Length Analysis 

In [13]:
# Length Analysis (outlier detection)
train_df['hyp_length'] = train_df['hyp'].apply(lambda x: len(x.split()))
train_df['src_length'] = train_df['src'].apply(lambda x: len(x.split()))

#### Lemmatization

In [14]:
# Lemmatization using Stanza
nlp = stanza.Pipeline('en', processors='tokenize,lemma')

def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [word.lemma for sentence in doc.sentences for word in sentence.words]
    return lemmas

2024-11-03 15:45:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-03 15:45:38 INFO: Downloaded file to C:\Users\Admin\stanza_resources\resources.json
2024-11-03 15:45:38 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2024-11-03 15:45:38 INFO: Using device: cpu
2024-11-03 15:45:38 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-03 15:45:38 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-03 15:45:38 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-03 15:45:38 INFO: Done loading processors!


In [15]:
train_df['hyp_lemmas'] = train_df['hyp'].apply(lambda x: ' '.join(lemmatize_text(x)))

In [16]:
train_df['tgt_lemmas'] = train_df['tgt'].apply(lambda x: ' '.join(lemmatize_text(x)))

In [17]:
# Apply lemmatization to the 'src' column only if the task is not 'ML'
train_df['src_lemmas'] = train_df.apply(lambda row: ' '.join(lemmatize_text(row['src'])) if row['task'] != 'ML' else row['src'], axis=1)

In [18]:
train_df.head()

Unnamed: 0,hyp,tgt,src,ref,task,model,hyp_length,tgt_length,src_length,hyp_tokens,tgt_tokens,src_tokens,hyp_lemmas,tgt_lemmas,src_lemmas
0,"Don't worry, it's only temporary.",Don't worry. It's only temporary.,Не волнуйся. Это только временно.,either,MT,,5,5,5,"[worry, temporary]","[worry, temporary]","[не, волнуйся, это, только, временно]","do not worry , it 's only temporary .",do not worry . it 's only temporary .,Нi deлнуeсy . Это eeлькe deемеeнy .
1,Tom is never where he should be.,Tom is never where he's supposed to be.,"Тома никогда нет там, где он должен быть.",either,MT,,7,8,8,"[tom, never]","[tom, never, supposed]","[тома, никогда, нет, там, где, он, должен, быть]",Tom be never where he should be .,Tom be never where he 's suppose to be .,"Тома eик нет там , где оi eeлжеe быть ."
2,It's hard for me to work with Tom.,I have trouble working with Tom.,Мне сложно работать с Томом.,either,MT,,8,6,5,"[hard, work, tom]","[trouble, working, tom]","[мне, сложно, работать, с, томом]",it 's hard for I to work with Tom .,I have trouble work with Tom .,Мне eeожнe deботeтy с Томом .
3,"Water, please.",I'd like some water.,"Воду, пожалуйста.",either,MT,,2,4,2,"[water, please]","[like, water]","[воду, пожалуйста]","water , please .",I would like some water .,"Воду , dожаeуйсeа ."
4,I didn't expect Tom to betray me.,I didn't think that Tom would betray me.,"Я не ожидал, что Том предаст меня.",either,MT,,7,8,7,"[expect, tom, betray]","[think, tom, would, betray]","[я, не, ожидал, что, том, предаст, меня]",I do not expect Tom to betray I .,I do not think that Tom would betray I .,"Я нi eeидаe , что Том eре меня ."


Optional Columns:
- hyp_length, src_length 
- tgt_tokens, tgt_lemmas