In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# from transformers import AutoTokenizer, AutoModelForCausalLM
import torch  # Go to readme for installation instructions
import math

# Read both corpora from disk: the source intent data and its augmented counterpart
original = pd.read_csv("intent_dataset.csv")
augmented = pd.read_csv("random_augmented_balanced_dataset.csv")

# Sentences are taken from the 'text' column (rename it here if your CSV header differs);
# rows whose value in that column is missing are dropped before conversion to a list.
original_texts, augmented_texts = (
    frame['text'].dropna().tolist() for frame in (original, augmented)
)

# GPT-2 tokenizer and language-model head used for scoring
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Alternative causal LMs (requires the Auto* import that is commented out above):
# model_name = "EleutherAI/gpt-j-6B"  # or try "EleutherAI/gpt-neo-2.7B"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# Inference only: put the model in evaluation mode
model.eval()

# Move the model to the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    model = model.to('cuda')

def compute_perplexity(texts):
    """Return the mean per-text perplexity of ``texts`` under the loaded language model.

    Every text is scored independently: it is encoded with the module-level
    ``tokenizer``, run through the module-level ``model`` with the input ids
    doubling as labels, and ``exp(loss)`` is recorded as that text's
    perplexity. The result is the arithmetic mean of those per-text values
    (an unweighted average over texts, not a token-weighted corpus perplexity).

    Texts that encode to fewer than two tokens are skipped. A causal LM
    predicts each token from the ones before it, so such inputs have no
    prediction target; they would either crash the forward pass (zero
    tokens) or yield a NaN loss that turns the whole average into NaN.

    Args:
        texts: Iterable of strings to score.

    Returns:
        float: Mean perplexity over the texts that could be scored, or
        ``nan`` when no text could be scored (including empty ``texts``).
    """
    perplexities = []
    for text in texts:
        # truncation=True caps the sequence at the tokenizer's configured
        # maximum length so over-long texts do not overflow the model context.
        encodings = tokenizer(text, return_tensors='pt', truncation=True)
        input_ids = encodings.input_ids
        if input_ids.size(1) < 2:
            # Nothing to predict for empty / single-token inputs.
            continue

        # Follow the model to whichever device it was placed on.
        input_ids = input_ids.to(model.device)

        with torch.no_grad():
            # outputs.loss is already the mean negative log-likelihood per
            # predicted token, so its exponential is the perplexity.
            mean_nll = model(input_ids, labels=input_ids).loss.item()

        perplexities.append(math.exp(mean_nll))

    if not perplexities:
        return float('nan')
    return sum(perplexities) / len(perplexities)

# Score each corpus in turn, announcing the step first because the model pass is slow
print("Calculating perplexity for original dataset...")
ppl_original = compute_perplexity(original_texts)
print("Calculating perplexity for Original_augmented dataset...")
ppl_augmented = compute_perplexity(augmented_texts)

# Report both averages, one per line, rounded to two decimals
print(
    f"Average Perplexity - Original Dataset: {ppl_original:.2f}",
    f"Average Perplexity - Original_Augmented Dataset: {ppl_augmented:.2f}",
    sep="\n",
)




Calculating perplexity for original dataset...
Calculating perplexity for Original_augmented dataset...
Average Perplexity - Original Dataset: 538.78
Average Perplexity - Original_Augmented Dataset: 852.99


In [8]:
from collections import Counter
import pandas as pd
import math
import nltk
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource rather than
# the pickled 'punkt' models, so fetch it too; the call is a no-op once installed.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# Rebuild the sentence lists from the 'original' and 'augmented' DataFrames
# (both must already exist in the kernel); rows with a missing 'text' are dropped.
original_texts, augmented_texts = (
    frame['text'].dropna().tolist() for frame in (original, augmented)
)

# Helper: Extract n-grams (bigrams, trigrams, etc.)
def get_ngrams(tokens, n):
    """Return every contiguous window of ``n`` tokens as a list of tuples.

    Windows are produced left to right. When ``tokens`` holds fewer than
    ``n`` items the result is an empty list.
    """
    last_start = len(tokens) - n
    windows = []
    for start in range(last_start + 1):
        windows.append(tuple(tokens[start:start + n]))
    return windows

# Helper: Convert text list to normalized n-gram distribution
def get_ngram_distribution(texts, n=2):
    """Build a relative-frequency table of n-grams across ``texts``.

    Each text is lower-cased and split with ``word_tokenize``; its n-grams
    (via ``get_ngrams``) are tallied into one shared counter. Counts are then
    divided by the total number of n-grams seen.

    Args:
        texts: Iterable of strings.
        n: Size of the n-grams (defaults to 2).

    Returns:
        dict mapping each n-gram tuple to its share of all n-grams; an empty
        dict when no n-grams were produced.
    """
    counts = Counter()
    for sentence in texts:
        counts.update(get_ngrams(word_tokenize(sentence.lower()), n))

    # Every n-gram occurrence adds one to the counter, so the sum of the
    # counts is the total number of n-grams observed.
    grand_total = sum(counts.values())
    return {gram: freq / grand_total for gram, freq in counts.items()}

# KL Divergence: P (original) vs Q (augmented)
def kl_divergence(P, Q, epsilon=1e-10):
    """Compute the KL divergence D(P || Q) between two discrete distributions.

    Args:
        P: dict mapping outcome -> probability (the reference distribution).
        Q: dict mapping outcome -> probability (the approximating distribution).
        epsilon: Probability substituted for outcomes that are missing from
            ``Q`` or carry a non-positive probability there. Q is not
            renormalized after this substitution, so the result is an
            approximation when smoothing kicks in.

    Returns:
        float: Sum over the outcomes of ``P`` of ``p * log(p / q)`` using the
        natural logarithm. Outcomes with ``p == 0`` contribute nothing.
    """
    divergence = 0.0
    for ng, p_prob in P.items():
        if p_prob == 0:
            # By convention 0 * log(0 / q) = 0; math.log(0) would raise.
            continue
        q_prob = Q.get(ng, epsilon)  # smoothing for missing n-grams
        if q_prob <= 0:
            # An explicit zero in Q would divide by zero; smooth it the same way.
            q_prob = epsilon
        divergence += p_prob * math.log(p_prob / q_prob)
    return divergence

# Compare the two corpora at unigram, bigram and trigram granularity
for n in (1, 2, 3):
    P, Q = (
        get_ngram_distribution(corpus, n=n)
        for corpus in (original_texts, augmented_texts)
    )
    kl_score = kl_divergence(P, Q)
    print(f"{n}-gram KL Divergence (Original || Augmented): {kl_score:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\S.SHIKHAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


1-gram KL Divergence (Original || Augmented): 0.0269
2-gram KL Divergence (Original || Augmented): 0.1111
3-gram KL Divergence (Original || Augmented): 0.1884
