In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from datasets import Dataset, load_dataset
import torch
import pandas as pd
from tqdm import tqdm

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing in .to().
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2-large"
# Language model and its matching tokenizer; both are read as globals by
# compute_perplexity_score.
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [None]:
# Both splits live in the same Hub repository, so the id is spelled out once.
DATASET_ID = "rajendrabaskota/hc3-wiki-intro-dataset"
train_dataset = load_dataset(DATASET_ID, split="train")
test_dataset = load_dataset(DATASET_ID, split="test")

In [None]:
# Bare tuple as the cell's last expression, so the notebook displays both splits.
train_dataset, test_dataset

---
## Function to Calculate Perplexity Score of Given Text
---

In [None]:
def compute_perplexity_score(text, max_length=64, stride=32):
    """Return the sliding-window perplexity of ``text`` under the global ``model``.

    The text is tokenized with the global ``tokenizer`` and scored in windows of
    at most ``max_length`` tokens whose start advances by ``stride`` tokens. In
    each window only the tokens that the previous window did not reach are used
    as targets; the earlier tokens serve as context and get the label ``-100``.
    The result is ``exp`` of the plain mean of the per-window losses.

    NOTE: the window losses are averaged without weighting by the number of
    target tokens, so a short final window counts as much as a full one. The
    threshold used by ``application`` was presumably tuned on scores computed
    this way, so the aggregation is deliberately left unchanged.

    Args:
        text: String to score.
        max_length: Maximum number of tokens per window.
        stride: Number of tokens between consecutive window starts. Must not
            exceed ``max_length``, otherwise tokens between windows are skipped.

    Returns:
        The perplexity as a Python float, or ``nan`` when ``text`` is falsy
        (empty or ``None``) or tokenizes to zero tokens.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive, or if
            ``stride`` is larger than ``max_length``.
    """
    if max_length <= 0 or stride <= 0:
        raise ValueError("max_length and stride must be positive")
    if stride > max_length:
        raise ValueError("stride must not exceed max_length")

    # Nothing to score: return NaN instead of failing later in torch.stack([]).
    if not text:
        return float("nan")

    # obtaining the encodings from text
    encodings = tokenizer(text, return_tensors="pt")
    seq_len = encodings.input_ids.size(1)
    if seq_len == 0:
        return float("nan")

    # list for storing negative log likelihood of each window
    nlls = []
    prev_end_loc = 0

    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Everything except the last trg_len tokens is context only.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        nlls.append(outputs.loss)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    # taking mean and performing exponentiation; .item() yields a Python float
    return torch.exp(torch.stack(nlls).mean()).item()

---
## Calculate Mean and Third-Quartile (Q3) Text Length in Words
---

In [None]:
# Length of every training text, measured in whitespace-separated words.
texts = train_dataset['text']
text_length_list = [len(sample.split()) for sample in tqdm(texts)]

text_count = len(text_length_list)
length = sum(text_length_list)

# Q3 is read off as the element 75% of the way through the sorted lengths.
q3_position = int(0.75 * text_count)
q3 = sorted(text_length_list)[q3_position]
mean_length = length / text_count

print("Q3:", q3)
print("mean_length:", mean_length)

---
## Calculate Perplexity
---

In [None]:
# Only the tail of the training split is scored here: examples min_index .. end.
# NOTE(review): 240_000 looks like the start of one chunk of a run that was
# split into several pieces — confirm before changing it.
min_index = 240_000

# Fetch the text column once and reuse it for both the length and the loop,
# instead of materialising it a second time just to call len().
train_dataset_texts = train_dataset['text']
max_index = len(train_dataset_texts)

total = max_index - min_index

perplexity_score_list = []

# Explicit append loop (not a comprehension) so the scores gathered so far
# stay available in perplexity_score_list if this long run is interrupted.
for index in tqdm(range(min_index, max_index)):
    perplexity_score_list.append(compute_perplexity_score(train_dataset_texts[index]))

In [None]:
# Sanity check: number of scores collected for the training slice.
len(perplexity_score_list)

In [None]:
# Score every text of the test split, in dataset order.
test_dataset_texts = test_dataset['text']
test_perplexity_score_list = []

for sample_text in tqdm(test_dataset_texts):
    test_perplexity_score_list.append(compute_perplexity_score(sample_text))

In [None]:
# Sanity check: number of scores collected for the test split.
len(test_perplexity_score_list)

In [None]:
# Adding perplexity score to test dataset.
# This cell rebinds test_dataset, and add_column refuses a column name that is
# already present, so a stale "perplexity" column is dropped first. That keeps
# the cell safe to re-run without reloading the dataset.
if "perplexity" in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns("perplexity")
test_dataset = test_dataset.add_column("perplexity", test_perplexity_score_list)
print(test_dataset)

In [None]:
# !pip install -U datasets huggingface-hub
# !huggingface-cli login --token <your_token>

# # Pushing test dataset to hub
# test_dataset.push_to_hub("hc3-and-gpt-wiki-intro-with-perplexity", split="test")

---
## Save Perplexity Scores and Get Download Link
---

In [None]:
# Single-column frame with the training-slice scores; use the commented line
# below instead to export the test-split scores.
perplexity_score_dataframe = pd.DataFrame({"perplexity": perplexity_score_list})
# perplexity_score_dataframe = pd.DataFrame({"perplexity": test_perplexity_score_list})

In [None]:
# Preview the first rows of the score frame before writing it to disk.
perplexity_score_dataframe.head()

In [None]:
# The archive name records which index range of the training split was scored.
output_name = f"perplexity_score_{min_index}-{max_index}.csv.gz"
perplexity_score_dataframe.to_csv(output_name, index=False, compression="gzip")
# perplexity_score_dataframe.to_csv(f"test_perplexity_score.csv.gz", compression="gzip", index=False)

In [None]:
from IPython.display import FileLink
# Link to the gzip CSV of scores; the file name must stay identical to the one
# passed to to_csv, since both are built from min_index and max_index.
FileLink(f"perplexity_score_{min_index}-{max_index}.csv.gz")
# FileLink(f"test_perplexity_score.csv.gz")

In [None]:
def application(threshold=12.80):
    """Interactively classify one text as human-written or AI-generated.

    Prompts for a text on standard input, scores it with
    ``compute_perplexity_score`` and prints the score, the threshold and the
    verdict. A score at or above ``threshold`` is reported as human-written, a
    lower score as AI-generated. Blank input and unscorable (NaN) results are
    reported as such instead of being classified.

    Args:
        threshold: Perplexity cut-off separating the two verdicts.
    """
    import math  # local import: only needed for the NaN check below

    text = input("\n\nEnter text to check:\n\n")

    # Blank input cannot be scored; skip the model call entirely.
    if not text.strip():
        print("\n" + "-" * 50)
        print(">   No text entered; nothing to classify.")
        print("-" * 50)
        print("\n\n\n\n\n")
        return

    ppl_score = compute_perplexity_score(text)

    print("\n" + "-" * 50)
    print(">   Perplexity Score:", ppl_score)
    print(">   Threshold: ", threshold)

    if math.isnan(ppl_score):
        # NaN compares False against any threshold, which would otherwise be
        # reported as "AI generated".
        print(">   The text is too short to score; no verdict.")
    elif ppl_score >= threshold:
        print(">   The given text is HUMAN written.")
    else:
        print(">   The given text is AI generated.")

    print("-" * 50)
    print("\n\n\n\n\n")
    
    
    

---
## Perplexity Score Method: Interactive Demo
---

In [None]:
# Launch the interactive check; the cell waits for a text to be typed at the prompt.
application()