In [32]:
import pandas as pd
import re
import numpy as np
import spacy
#from scipy.interpolate import make_interp_spline
#from collections import Counter
from lexicalrichness import LexicalRichness

# spaCy pipeline shared by every cell that needs lemmas, tags, sentences or parses.
nlp = spacy.load("en_core_web_sm")

# Essay corpus: a semicolon-separated CSV read as UTF-8.
essays = pd.read_csv(
    "data/essays-without-markers.csv",
    sep=";",
    encoding="UTF-8",
)
#essays

In [33]:
# Strip the "Title: ..." line from the ChatGPT-4 essays.
# NOTE(review): the pattern requires the title to be followed by "\n\r\n" --
# confirm this is the line-ending sequence actually present in the CSV.
essays['ChatGPT-4'] = essays['ChatGPT-4'].str.replace(r'Title:.*\n\r\n', '', regex=True)

# Parse each essay with spaCy once, then keep a space-joined string of its
# lemmas; both columns are reused by later cells.
for prefix, source_column in (("STUD", "Student"), ("GPT3", "ChatGPT-3"), ("GPT4", "ChatGPT-4")):
    essays[prefix + "_spacy"] = essays[source_column].apply(lambda raw_text: nlp(raw_text))
    essays[prefix + "_lemma"] = essays[prefix + "_spacy"].apply(
        lambda doc: " ".join(token.lemma_ for token in doc)
    )

In [34]:
# Counts the number of sentences per essay

def num_of_sent(text):
    """Return how many sentences `text` contains.

    `text` must expose an iterable `.sents` attribute (the callers below pass
    the stored spaCy parses). An object with no sentences yields 0.
    """
    return sum(1 for _ in text.sents)

# One sentence-count column per author, computed from the stored spaCy parses.
for prefix in ("STUD", "GPT3", "GPT4"):
    essays[prefix + "_sent_count"] = essays[prefix + "_spacy"].apply(num_of_sent)

In [35]:
# Counts the number of words per essay

def num_of_words(text):
    """Return the number of whitespace-separated chunks in the string `text`."""
    words = text.split()
    return len(words)

# One word-count column per author, computed from the raw essay text.
for prefix, source_column in (("STUD", "Student"), ("GPT3", "ChatGPT-3"), ("GPT4", "ChatGPT-4")):
    essays[prefix + "_word_count"] = essays[source_column].apply(num_of_words)

### Sentence complexity

We have two scores for sentence complexity.

1. One based on the number of particular dependency labels found in each sentence (clausal modifier of noun; conjunct; adverbial clause modifier; clausal complement; clausal subject; discourse; parataxis).
2. A second one based on the depth of the dependency tree.

The output values are per-essay means: the mean number of these labels per sentence and the mean dependency-tree depth per sentence.

In [36]:
# Calculates the mean number of the specified dependency labels per sentence
def calculate_dep_score(text):
    """Return the mean per-sentence count of the tracked dependency labels.

    The raw string `text` is parsed with the notebook-level `nlp` pipeline,
    every sentence is scored with `sent_complexity_structure`, and the scores
    are averaged with `np.mean`.
    """
    per_sentence_counts = [
        sent_complexity_structure(sentence) for sentence in nlp(text).sents
    ]
    return np.mean(per_sentence_counts)

# Return the number of specified dependency labels found
def sent_complexity_structure(doc):
    """Count the tokens in `doc` that carry one of the tracked dependency labels.

    Tracked labels: acl, conj, advcl, ccomp, csubj, discourse, parataxis.
    `doc` is any iterable of objects exposing a `dep_` attribute.
    """
    tracked_labels = ("acl", "conj", "advcl", "ccomp", "csubj", "discourse", "parataxis")
    return sum(1 for token in doc if token.dep_ in tracked_labels)

# Calculates the dependency depth 
def calculate_dep_length(text):
    """Return the mean dependency-tree depth over the sentences of `text`.

    The raw string `text` is parsed with the notebook-level `nlp` pipeline;
    each sentence's depth is measured by `walk_tree` starting from the sentence
    root at depth 0, and the depths are averaged with `np.mean`.
    """
    sentence_depths = [walk_tree(sentence.root, 0) for sentence in nlp(text).sents]
    return np.mean(sentence_depths)

# Walks the dependency tree and returns the depth
def walk_tree(node, depth):
    """Return the depth of the deepest descendant of `node`.

    `depth` is the depth of `node` itself. A node with no left or right
    dependents (`n_lefts + n_rights == 0`) is a leaf and returns `depth`;
    otherwise the maximum over its children, each one level deeper, is returned.
    """
    if node.n_lefts + node.n_rights <= 0:
        return depth
    child_depth = depth + 1
    return max(walk_tree(child, child_depth) for child in node.children)


# Both sentence-complexity scores for each author, computed from the raw text
# (the scoring functions parse the text themselves).
for prefix, source_column in (("STUD", "Student"), ("GPT3", "ChatGPT-3"), ("GPT4", "ChatGPT-4")):
    essays[prefix + "_sent_complex_tags"] = essays[source_column].apply(calculate_dep_score)
    essays[prefix + "_sent_complex_depth"] = essays[source_column].apply(calculate_dep_length)

### Lexical diversity

Calculating the lexical diversity score using the MTLD measure.

In [37]:
# calculates MTLD score for the whole essay

def calculate_lex_richness_MTLD2(text):
    """Return the MTLD score of `text`, i.e. `LexicalRichness(text).mtld()`."""
    return LexicalRichness(text).mtld()

# MTLD lexical-diversity score for each author, computed from the raw text.
for prefix, source_column in (("STUD", "Student"), ("GPT3", "ChatGPT-3"), ("GPT4", "ChatGPT-4")):
    essays[prefix + "_LD"] = essays[source_column].apply(calculate_lex_richness_MTLD2)

### Discourse markers

Calculating the number of discourse markers from the Penn Discourse Treebank (PDTB) list per essay. Some discourse markers (about, as, by, both, for, from, given, in, like, on, once, only, still, when, with, without, yet, and) were excluded from the list because they are often used in functions other than as discourse markers.

In [38]:
# Counts the number of discourse markers using PDTB list

# Read the PDTB connective list. The file is split on the single-quote
# character and only the resulting columns 1 and 3 are kept
# (presumably marker text and marker type -- verify against the file).
discourse = pd.read_csv(
    "markers/connectives_discourse_markers_PDTB.txt",
    sep="\'",
    encoding="UTF-8",
    header=None,
    usecols=[1, 3],
)

# Drop the "t_conn_" prefix from column 3.
discourse[3] = discourse[3].apply(lambda label: label.replace("t_conn_", ""))
# Surround every marker with spaces so it only matches as a space-delimited unit.
discourse[1] = discourse[1].apply(lambda marker: " " + marker + " ")
# Order the rows by column 3, descending.
discourse = discourse.sort_values(3, ascending=False)

# Counts the total number of discourse markers per essay
def count_discourse_markers(text):
    """Return the total number of discourse-marker occurrences in `text`.

    Each space-padded marker in the notebook-level `discourse` table is counted
    with `str.count` (non-overlapping occurrences) and the counts are summed.

    NOTE(review): every marker is counted independently, so a marker that is
    contained in a longer marker would be counted for both, and a marker at the
    very start or end of `text` has no surrounding space to match -- confirm
    this is acceptable for the analysis.
    """
    return sum(text.count(marker[1]) for marker in discourse.itertuples())

# Discourse-marker totals per essay, matched against the lemmatised text.
for prefix in ("STUD", "GPT3", "GPT4"):
    essays[prefix + "_discourse"] = essays[prefix + "_lemma"].apply(count_discourse_markers)

### Modals

Counting the number of modals using POS tag "MD" and the modals.csv.

In [39]:
# Counts the number of modals from the list of modals

# Read the headerless modal list and turn the underscores in its first column
# into spaces.
modals = pd.read_csv(
    "markers/modals.csv",
    sep=",",
    encoding="UTF-8",
    header=None,
)
modals[0] = modals[0].apply(lambda entry: entry.replace('_', ' '))

# Counts the number of modals per essay
def count_total_modals(text):
    """Return the total number of occurrences in `text` of the listed modals.

    Each entry in the first column of the notebook-level `modals` table is
    counted with `str.count` (plain, non-overlapping substring matching) and
    the counts are summed.

    NOTE(review): the entries are not padded with spaces here, so an entry
    would also match inside a longer word -- confirm against modals.csv.
    """
    return sum(text.count(modal[1]) for modal in modals.itertuples())

# List-based modal totals per essay, matched against the lemmatised text.
for prefix in ("STUD", "GPT3", "GPT4"):
    essays[prefix + "_modals1"] = essays[prefix + "_lemma"].apply(count_total_modals)

In [40]:
# Counts the number of modals using POS tagging

# Space-joined fine-grained tag string (`token.tag_`) for every essay.
for prefix in ("STUD", "GPT3", "GPT4"):
    essays[prefix + "_pos"] = essays[prefix + "_spacy"].apply(
        lambda doc: " ".join(token.tag_ for token in doc)
    )

# Number of "MD" occurrences in each tag string.
for prefix in ("STUD", "GPT3", "GPT4"):
    essays[prefix + "_modals2"] = essays[prefix + "_pos"].str.count(r'MD')

In [41]:
# Calculates total number of modals per essay

# Total modals per essay: tag-based count plus list-based count.
for prefix in ("STUD", "GPT3", "GPT4"):
    essays[prefix + "_modals_all"] = essays[prefix + "_modals2"] + essays[prefix + "_modals1"]

### Epistemic markers
 
Getting the number of epistemic markers.

In [42]:
# Counts the total number of epistemic markers per essay

def find_epistemic_markers(text):
    """Return the number of epistemic-marker matches found in `text`.

    Every pattern below is applied to the raw string `text` with `re.findall`
    and the numbers of (non-overlapping) matches are added up. The patterns are
    independent of each other, so a phrase that satisfies two different
    patterns contributes to both.

    Args:
        text: The essay text as a plain string.

    Returns:
        The total match count as an int (0 when nothing matches).
    """
    patterns = (
        r"(?:I|We|we|One|one)(?:\s\w+)?(?:\s\w+)?\s(?:believes?|thinks?|means?|worry|worries|know|guesse?s?|assumes?)\s(?:that)?",
        r"(?:It|it)\sis\s(?:believed|known|assumed|thought)\s(?:that)?",
        r"(?:I|We|we)\s(?:am|are)\s(?:thinking|guessing)\s(?:that)?",
        r"(?:I|We|we|One|one)(?:\s\w+)?\s(?:do|does)\snot\s(?:believe?|think|know)\s(?:that)?",
        r"(?:I|We|we|One|one)\swould(?:\s\w+)?(?:\snot)?\ssay\s(?:that)?",
        r"I\sam\s(?:afraid|sure|confident)\s(?:that)?",
        r"(?:My|my|Our|our)\s(?:experience|opinion|belief|knowledge|worry|worries|concerns?|guesse?s?)\s(?:is|are)\s(?:that)?",
        # Was "[In]n", a character class matching "I" or "n", which missed
        # lowercase "in my opinion".
        r"[Ii]n\s(?:my|our)(?:\s\w+)?\sopinion",
        r"As\sfar\sas\s(?:I|We|we)\s(?:am|are)\sconcerned",
        r"(?:I|We|we|One|one)\s(?:can|could|may|might)(?:\s\w+)?\sconclude\s(?:that)?",
        r"I\s(?:am\swilling\sto|must)\ssay\s(?:that)?",
        # A single case-insensitive-initial pattern; the former extra
        # "One ... say" pattern counted capitalised matches twice.
        r"[Oo]ne\s(?:can|could|may|might)\ssay\s(?:that)?",
        r"[Ii]t\sis\s(?:obvious|(?:un)?clear)",
        r"[Ii]t\s(?:seems|feels|looks)",
    )
    return sum(len(re.findall(pattern, text)) for pattern in patterns)

# Epistemic-marker totals per essay, matched against the raw text.
for prefix, source_column in (("STUD", "Student"), ("GPT3", "ChatGPT-3"), ("GPT4", "ChatGPT-4")):
    essays[prefix + "_EpMarkers"] = essays[source_column].apply(find_epistemic_markers)

### Nominalisations

Counting the number of nominalisations per essay. 

In [43]:
# Counts the total number of nominalisations per essay

def nominalisation_counter(text):
    """Count the nouns in `text` whose surface form matches the suffix pattern.

    `text` is an iterable of tokens exposing `pos_` and `text`. A token is
    counted when its `pos_` is 'NOUN' and `re.match` finds the nominalisation
    suffix pattern at the start of its text.
    """
    suffixes_n = r'\b[A-Z]*\w+(?:tion|ment|ance|ence|ion|it(?:y|ies)|ness|ship)(?:s|es)?\b'
    return sum(
        1
        for token in text
        if token.pos_ == 'NOUN' and re.match(suffixes_n, token.text)
    )
    
# Nominalisation totals per essay, computed from the stored spaCy parses.
for prefix in ("STUD", "GPT3", "GPT4"):
    essays[prefix + "_nominalisation"] = essays[prefix + "_spacy"].apply(nominalisation_counter)

In [44]:
# Counts the average number of features (discourse markers, modals, epistemic markers, nominalisations) per sentence for each essay

def average_per_sentence(feature, sent):
    """Return the average number of feature occurrences per sentence.

    Args:
        feature: Total count of the feature in one essay.
        sent: Number of sentences in that essay.

    Returns:
        `feature / sent`, or NaN when `sent` is 0 (the average is undefined
        for an essay without sentences; returning NaN avoids a
        ZeroDivisionError for plain ints and a misleading inf for numpy ints).
    """
    if sent == 0:
        return float("nan")
    return feature / sent

# (short name used in the new column, suffix of the per-essay count column)
per_sentence_features = (
    ("dm", "discourse"),
    ("mod", "modals_all"),
    ("ep", "EpMarkers"),
    ("nom", "nominalisation"),
)

# Feature-major order keeps the columns in the order STUD, GPT3, GPT4 per feature.
for short_name, count_suffix in per_sentence_features:
    for prefix in ("STUD", "GPT3", "GPT4"):
        essays[prefix + "_" + short_name + "_per_sent"] = essays.apply(
            lambda row: average_per_sentence(
                row[prefix + "_" + count_suffix], row[prefix + "_sent_count"]
            ),
            axis=1,
        )

In [45]:
# (label, column prefix) pairs in print order. The MTLD section uses its own
# order and label spacing, which is reproduced as-is.
default_rows = (("Student:", "STUD"), ("GPT3: ", "GPT3"), ("GPT4: ", "GPT4"))
mtld_rows = (("GPT4:", "GPT4"), ("GPT3: ", "GPT3"), ("Student: ", "STUD"))

# (heading, column suffix, rows, whether the last row is followed by "\n")
summary_sections = (
    ("Sentence complexity based on a number of certain dependency tags", "_sent_complex_tags", default_rows, True),
    ("Sentence complexity based on the tree depth", "_sent_complex_depth", default_rows, True),
    ("MTLD lexical diversity score", "_LD", mtld_rows, True),
    ("Average number of discourse markers per essay", "_discourse", default_rows, True),
    ("Average number of modals (from the list) per essay", "_modals1", default_rows, True),
    ("Average number of modals (POS-tags) per essay", "_modals2", default_rows, True),
    ("Average number of epistemic markers per essay", "_EpMarkers", default_rows, True),
    ("Average number of nominalisations per essay", "_nominalisation", default_rows, False),
)

for heading, column_suffix, rows, blank_after in summary_sections:
    print(heading)
    for position, (label, prefix) in enumerate(rows):
        column_mean = np.mean(essays[prefix + column_suffix])
        is_last_row = position == len(rows) - 1
        if blank_after and is_last_row:
            print(label, column_mean, "\n")
        else:
            print(label, column_mean)

Sentence complexity based on a number of certain dependency tags
Student: 1.8079994478858226
GPT3:  2.305861562185092
GPT4:  2.079634224707754 

Sentence complexity based on the tree depth
Student: 5.722130126604919
GPT3:  6.181905400336772
GPT4:  5.937846240542318 

MTLD lexical diversity score
GPT4: 108.90580635929804
GPT3:  75.67987401417784
Student:  95.72135147300236 

Average number of discourse markers per essay
Student: 9.666666666666666
GPT3:  6.277777777777778
GPT4:  4.722222222222222 

Average number of modals (from the list) per essay
Student: 2.1555555555555554
GPT3:  1.4555555555555555
GPT4:  1.6 

Average number of modals (POS-tags) per essay
Student: 8.688888888888888
GPT3:  7.511111111111111
GPT4:  4.522222222222222 

Average number of epistemic markers per essay
Student: 1.0222222222222221
GPT3:  0.2
GPT4:  0.0 

Average number of nominalisations per essay
Student: 16.91111111111111
GPT3:  18.955555555555556
GPT4:  22.355555555555554


In [46]:
# Write the essays together with every column added above; the row index is omitted.
# NOTE(review): the *_spacy columns hold the parsed spaCy objects and are
# exported as well -- confirm they are wanted in the CSV.
essays.to_csv(path_or_buf="data/essays-with-linguistic-markers.csv", index=False)