In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import math
from tqdm import tqdm

# Read the augmented dataset and collect the non-missing entries of its
# 'text' column as a plain Python list (this is what gets scored later).
augmented = pd.read_csv("random_augmented_balanced_dataset.csv")
texts = augmented['text'][augmented['text'].notna()].tolist()

# Pretrained GPT-2 tokenizer and language model, switched to inference mode
# (`eval()` returns the module itself, so it can be chained).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').eval()

# Run on the GPU when one is available; otherwise keep the model where it is.
model = model.to('cuda') if torch.cuda.is_available() else model

# Perplexity calculation function
def calculate_perplexity(text):
    """Return the perplexity of ``text`` under the module-level ``model``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    512 tokens) and passed to ``model`` with the input ids reused as labels.
    The perplexity is ``exp`` of the loss the model reports.

    Args:
        text: The string to score.

    Returns:
        float: ``exp(loss)``. ``float('inf')`` is returned when the text
        tokenizes to fewer than two tokens, or when the exponential
        overflows, so such texts never pass a ``<= threshold`` filter.
    """
    encodings = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Follow the model's own device instead of re-checking CUDA availability,
    # so inputs and weights can never end up on different devices.
    input_ids = encodings.input_ids.to(model.device)

    # With fewer than two tokens there is no next-token prediction to score;
    # report that explicitly rather than relying on a NaN loss or an error.
    if input_ids.size(1) < 2:
        return float('inf')

    with torch.no_grad():
        loss = model(input_ids, labels=input_ids).loss.item()

    # Multiplying the loss by the sequence length and dividing by it again
    # cancels out, so exponentiate the loss directly.
    try:
        return math.exp(loss)
    except OverflowError:
        # math.exp raises for very large arguments; treat as unbounded.
        return float('inf')

# Compute perplexity for each text
perplexities = []
failed_count = 0
last_error = None
print("Calculating perplexities...")
for text in tqdm(texts):
    try:
        ppl = calculate_perplexity(text)
    except Exception as e:
        # Best-effort scoring: a text that cannot be scored gets an infinite
        # perplexity (so it is filtered out below), but the failure is
        # counted and reported instead of being silently discarded.
        failed_count += 1
        last_error = e
        ppl = float('inf')
    perplexities.append(ppl)

if failed_count:
    print(f"Could not score {failed_count} texts (last error: {last_error!r}); "
          "they were assigned infinite perplexity.")

# Add perplexity column.
# `texts` holds only the non-missing entries of the 'text' column, so it can
# be shorter than `augmented`. Assign through a non-null mask to keep every
# score on its own row; rows without text keep an infinite perplexity.
has_text = augmented['text'].notna()
if int(has_text.sum()) != len(perplexities):
    raise ValueError(
        f"Got {len(perplexities)} perplexities for {int(has_text.sum())} "
        "rows with text; `texts` is out of sync with `augmented`."
    )
augmented['perplexity'] = float('inf')
augmented.loc[has_text, 'perplexity'] = perplexities

# Define threshold (you can change it)
threshold = 600

# Filter low-perplexity rows (NaN and inf never satisfy `<= threshold`)
filtered = augmented[augmented['perplexity'] <= threshold]

# Save filtered dataset
filtered.to_csv("augmented_filtered_by_perplexity.csv", index=False)

print(f"Filtered {len(augmented) - len(filtered)} rows. Kept {len(filtered)} samples.")


Calculating perplexities...


  0%|          | 0/15074 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
100%|██████████| 15074/15074 [01:42<00:00, 146.61it/s]

Filtered 1389 rows. Kept 13685 samples.



