In [None]:
from lexicalrichness import LexicalRichness
import matplotlib.pyplot as plt
from scipy.stats import entropy
from collections import Counter
import pandas as pd
import numpy as np
import tiktoken
import spacy
import re

# GPT-4o tokenizer used by the token-level statistics in this notebook.
gpt_tokenizer = tiktoken.encoding_for_model("gpt-4o")

# Question table; the `Question` column is pulled out as its own Series
# because most cells only need the question text.
question_df = pd.read_csv("../data/OAI/Questions/all_questions.csv")
all_questions = question_df["Question"]

# Question Length

In [None]:
# Number of GPT-4o tokens in each question.
toks = [len(gpt_tokenizer.encode(ques)) for ques in all_questions]
print(f"Average question tokens: {np.mean(toks)}")

# Histogram of token counts; each bar is coloured by its height and
# annotated with the number of questions that fall into it.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(toks)
norm = plt.Normalize(min(n), max(n))
for count, patch in zip(n, patches):
    patch.set_facecolor(plt.cm.viridis(norm(count)))
    bar_centre = patch.get_x() + patch.get_width() / 2
    ax.text(bar_centre, patch.get_height(), f"{int(count)}", ha='center', va='bottom')

ax.set_xlabel('Tokens')
ax.set_ylabel('Number of Samples')
ax.set_xticks(bins)  # put the ticks exactly on the bin edges
fig.savefig("../figures/for paper/q_len_dist.pdf")
plt.show()

# Type-Token Ratio

In [None]:
# Type-token ratio of every question, followed by the corpus mean.
ttrs = [richness.ttr for richness in map(LexicalRichness, all_questions)]
print(f"Average TTR: {np.mean(ttrs)}")

# Entropy

In [None]:
def shannon_entropy(token_list, base=2):
    """Return the Shannon entropy of the empirical distribution of `token_list`.

    Args:
        token_list: Sequence of items (e.g. token ids); each distinct value is
            treated as one outcome.
        base: Logarithm base of the result; the default of 2 gives bits.

    Returns:
        The entropy of the relative frequencies of the distinct values.
    """
    # Only the occurrence counts matter; the distinct values are discarded.
    _, occurrences = np.unique(token_list, return_counts=True)
    distribution = occurrences / occurrences.sum()
    return entropy(distribution, base=base)

# Token-level Shannon entropy (in bits) of each question, averaged over all questions.
# The Series is converted to a plain list, which is the input type the batch encoder documents.
batch_encoded_questions = gpt_tokenizer.encode_ordinary_batch(list(all_questions))
avg_entropy = np.round(np.mean([shannon_entropy(tok_list) for tok_list in batch_encoded_questions]), 2)
print(f"Average Entropy: {avg_entropy}")

# Upper bound on the per-question entropy: a sequence of n tokens has entropy
# at most log2(n), reached when every token is different. The bound for the
# whole set is therefore given by the longest question. (Counting the number of
# *distinct lengths* is unrelated to this bound and would, for example, give 0
# if every question had the same length.)
longest_question_tokens = max(len(tok_list) for tok_list in batch_encoded_questions)
max_entropy = np.log2(longest_question_tokens)
print(f"Maximum possible entropy for the questions: {np.round(max_entropy.item(),2)}")

# Dependency Parse Tree

In [None]:
# Run on Colab since it involves a transformer:
# "en_core_web_trf" is loaded once here and reused for every question parsed below.

nlp = spacy.load("en_core_web_trf")

def dependency_tree_depth(root):
    """Return the number of nodes on the longest path from `root` down to a leaf.

    Args:
        root: A node exposing an iterable `children` attribute (e.g. a spaCy
            token); the same is expected of each child.

    Returns:
        1 for a node without children, otherwise 1 plus the depth of its
        deepest child subtree.
    """
    subtree_depths = [dependency_tree_depth(child) for child in root.children]
    # `default=0` covers the leaf case, where there are no child depths to compare.
    return 1 + max(subtree_depths, default=0)

# `tqdm` is only used for the progress bar and is not imported at the top of
# the notebook, so bring it into scope here; without it this cell raises
# NameError on a fresh kernel. If tqdm is not installed, iterate without a bar.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        """Fallback used when tqdm is unavailable: return the iterable unchanged."""
        return iterable

# Depth of the dependency parse tree of every question.
depths = []
for question in tqdm(all_questions):
    doc = nlp(question)
    # The root is the token that is its own head. Only the first such token is
    # used, so for a multi-sentence question only the first root's tree is measured.
    root = [token for token in doc if token.head == token][0]
    depths.append(dependency_tree_depth(root))

print(f"Average dependency tree depth: {np.round(np.mean(depths), 2)}") #10.86

# Question Type

In [None]:
which, what, identify = [], [], []

# Ordered (keyword, bucket) pairs: a question goes into the bucket of the
# first keyword it contains (case-insensitive, matched anywhere in the text),
# so "which" takes precedence over "what", and "what" over "identify".
keyword_buckets = (("which", which), ("what", what), ("identify", identify))
for question in all_questions:
    for keyword, bucket in keyword_buckets:
        if re.search(keyword, question, re.IGNORECASE):
            bucket.append(question)
            break

print(f"Number of 'Which' questions: {len(which)}\nNumber of 'What' questions: {len(what)}\nNumber of 'Identify' questions: {len(identify)}")

In [None]:
# Checking for entity leak: count the questions that mention at least one of
# their own entities verbatim (case-insensitive, as a whole term).
c = 0
for row in question_df.itertuples():
    # `Entities` is hyphen-separated. Splitting at most three times keeps at
    # most four fields and leaves any further hyphens inside the last field.
    all_ents = row.Entities.split("-", 3)
    for ent in all_ents:
        if not ent:
            # An empty field would produce a pattern that matches almost any text.
            continue
        # Escape the entity so characters such as "(", "+" or "[" are matched
        # literally instead of being interpreted as regex syntax. Lookarounds
        # replace \b so that entities starting or ending with a non-word
        # character are still matched as whole terms.
        pattern = rf"(?<!\w){re.escape(ent)}(?!\w)"
        if re.search(pattern, row.Question, re.IGNORECASE):
            c += 1
            break
c

In [None]:
# Checking for SMILES leak: print every question that contains an "=" character.
for question_text in question_df["Question"]:
    if "=" not in question_text:
        continue
    print(question_text)