In [1]:
from src.estimation import estimate_text_distribution
from src.MLE import MLE
import pandas as pd
# silence warnings: the "ignore" filter suppresses every warning category for the
# rest of the session, so deprecation/runtime warnings will not be shown below
import warnings
warnings.filterwarnings("ignore")

# OJS Education

In [2]:
# Dataset identifier; interpolated into the data/ and distribution/ paths used by the cells below.
name = "ojs_ed"

Training data: OJS 2021 monolingual English Education research article abstracts (n = 27,010)<br><br>
Validation data: OJS Jan - Nov 2022 English Education abstracts<br>
    Monolingual: 26,094 abstracts, 196,928 sentences<br>
    Translated: 17,385 abstracts, 119,340 sentences

In [3]:
# Load the 2021 human and AI training frames and report the row count of each
# (rows are reported as sentences).
human_path_21 = f"data/training_data/{name}/human_data_21.parquet"
human_data_21 = pd.read_parquet(human_path_21)
print(f"human_data_21: {len(human_data_21)} sentences")

ai_path_21 = f"data/training_data/{name}/ai_data_21.parquet"
ai_data_21 = pd.read_parquet(ai_path_21)
print(f"ai_data_21: {len(ai_data_21)} sentences")

human_data_21: 212521 sentences
ai_data_21: 451240 sentences


In [4]:
# Estimate the human and AI content distributions from the 2021 training data.
# The third argument is the distribution file that MLE loads right after
# (presumably written by estimate_text_distribution — confirm in src.estimation).
estimate_text_distribution(
    f"data/training_data/{name}/human_data_21.parquet",
    f"data/training_data/{name}/ai_data_21.parquet",
    f"distribution/{name}_21.parquet",
)
# Load the word occurrence frequencies into the MLE framework.
model = MLE(f"distribution/{name}_21.parquet")

# Validate the method on mixed corpora with a known ground-truth alpha.
alphas = [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
evaluation_sets = [
    # guaranteed human-written evaluation set
    ("HUMAN EVALUATION SET", f"data/validation_data/{name}"),
    # possibly Google-translated evaluation set; the directory is derived from
    # `name` so it always belongs to the same dataset the model was fit on
    ("HUMAN + GOOGLE TRANSLATE (?) EVALUATION SET", f"data/validation_data/{name}/translated"),
]
for title, validation_dir in evaluation_sets:
    print(title)
    for alpha in alphas:
        # inference returns the estimated alpha and its confidence-interval value
        estimated, ci = model.inference(f"{validation_dir}/ground_truth_alpha_{alpha}.parquet")
        error = abs(estimated - alpha)
        # the header row is printed before every result row, as in the recorded output
        print(f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}")
        print(f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}")
    print("+---------------------------------+")

HUMAN EVALUATION SET
Ground Truth,Prediction,        CI,     Error
     0.000,     0.059,     0.001,     0.059
Ground Truth,Prediction,        CI,     Error
     0.025,     0.086,     0.001,     0.061
Ground Truth,Prediction,        CI,     Error
     0.050,     0.109,     0.002,     0.059
Ground Truth,Prediction,        CI,     Error
     0.075,     0.133,     0.002,     0.058
Ground Truth,Prediction,        CI,     Error
     0.100,     0.155,     0.002,     0.055
Ground Truth,Prediction,        CI,     Error
     0.125,     0.177,     0.002,     0.052
Ground Truth,Prediction,        CI,     Error
     0.150,     0.199,     0.002,     0.049
Ground Truth,Prediction,        CI,     Error
     0.175,     0.220,     0.002,     0.045
Ground Truth,Prediction,        CI,     Error
     0.200,     0.242,     0.002,     0.042
Ground Truth,Prediction,        CI,     Error
     0.225,     0.263,     0.002,     0.038
Ground Truth,Prediction,        CI,     Error
     0.250,     0.284,     0.003,

Training data: OJS 2021 monolingual English Education research article abstracts + possibly Google-translated abstracts (n = 48,391)<br><br>
Validation data: OJS Jan - Nov 2022 English Education abstracts<br>
    Monolingual: 26,094 abstracts, 196,928 sentences<br>
    Translated: 17,385 abstracts, 119,340 sentences

In [5]:
# Read the 2021 monolingual and translated training frames for both the human
# and the AI side.
human_data_21 = pd.read_parquet(
    f"data/training_data/{name}/human_data_21.parquet"
)
human_data_21_translated = pd.read_parquet(
    f"data/training_data/{name}/human_data_translated_21.parquet"
)
ai_data_21 = pd.read_parquet(
    f"data/training_data/{name}/ai_data_21.parquet"
)
ai_data_21_translated = pd.read_parquet(
    f"data/training_data/{name}/ai_data_translated_21.parquet"
)

# Stack monolingual + translated rows per side and report the combined row counts.
human_data = pd.concat([human_data_21, human_data_21_translated])
print(f"human_data: {len(human_data)} sentences")
ai_data = pd.concat([ai_data_21, ai_data_21_translated])
print(f"ai_data: {len(ai_data)} sentences")

# Persist each combined frame to its own parquet file (human first, then AI).
for combined, out_path in (
    (human_data, f"data/training_data/{name}/human_data_plus_translated_21.parquet"),
    (ai_data, f"data/training_data/{name}/ai_data_plus_translated_21.parquet"),
):
    combined.to_parquet(out_path)

human_data: 354945 sentences
ai_data: 697274 sentences


In [7]:
# Estimate the human and AI content distributions from the 2021 monolingual +
# translated training data. The third argument is the distribution file that MLE
# loads right after (presumably written by estimate_text_distribution — confirm
# in src.estimation).
estimate_text_distribution(
    f"data/training_data/{name}/human_data_plus_translated_21.parquet",
    f"data/training_data/{name}/ai_data_plus_translated_21.parquet",
    f"distribution/{name}_plus_translated_21.parquet",
)
# Load the word occurrence frequencies into the MLE framework.
model = MLE(f"distribution/{name}_plus_translated_21.parquet")

# Validate the method on mixed corpora with a known ground-truth alpha.
alphas = [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
evaluation_sets = [
    # guaranteed human-written evaluation set
    ("HUMAN EVALUATION SET", f"data/validation_data/{name}"),
    # possibly Google-translated evaluation set; the directory is derived from
    # `name` so it always belongs to the same dataset the model was fit on
    ("HUMAN + GOOGLE TRANSLATE (?) EVALUATION SET", f"data/validation_data/{name}/translated"),
]
for title, validation_dir in evaluation_sets:
    print(title)
    for alpha in alphas:
        # inference returns the estimated alpha and its confidence-interval value
        estimated, ci = model.inference(f"{validation_dir}/ground_truth_alpha_{alpha}.parquet")
        error = abs(estimated - alpha)
        # the header row is printed before every result row, as in the recorded output
        print(f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}")
        print(f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}")
    print("+---------------------------------+")

HUMAN EVALUATION SET
Ground Truth,Prediction,        CI,     Error
     0.000,     0.080,     0.002,     0.080
Ground Truth,Prediction,        CI,     Error
     0.025,     0.110,     0.002,     0.085
Ground Truth,Prediction,        CI,     Error
     0.050,     0.138,     0.002,     0.088
Ground Truth,Prediction,        CI,     Error
     0.075,     0.164,     0.002,     0.089
Ground Truth,Prediction,        CI,     Error
     0.100,     0.188,     0.002,     0.088
Ground Truth,Prediction,        CI,     Error
     0.125,     0.213,     0.002,     0.088
Ground Truth,Prediction,        CI,     Error
     0.150,     0.238,     0.002,     0.088
Ground Truth,Prediction,        CI,     Error
     0.175,     0.260,     0.002,     0.085
Ground Truth,Prediction,        CI,     Error
     0.200,     0.285,     0.002,     0.085
Ground Truth,Prediction,        CI,     Error
     0.225,     0.308,     0.003,     0.083
Ground Truth,Prediction,        CI,     Error
     0.250,     0.331,     0.003,

Training data: OJS 2021 + 2020 monolingual English Education research article abstracts (n = 46,128)<br><br>
Validation data: OJS Jan - Nov 2022 English Education abstracts<br>
    Monolingual: 26,094 abstracts, 196,928 sentences<br>
    Translated: 17,385 abstracts, 119,340 sentences

In [8]:
# Read the 2021 and 2020 training frames for both the human and the AI side.
human_data_21 = pd.read_parquet(
    f"data/training_data/{name}/human_data_21.parquet"
)
human_data_20 = pd.read_parquet(
    f"data/training_data/{name}/human_data_20.parquet"
)
ai_data_21 = pd.read_parquet(
    f"data/training_data/{name}/ai_data_21.parquet"
)
ai_data_20 = pd.read_parquet(
    f"data/training_data/{name}/ai_data_20.parquet"
)

# Stack the two years per side (2021 rows first) and report the combined row counts.
human_data = pd.concat([human_data_21, human_data_20])
print(f"human_data: {len(human_data)} sentences")
ai_data = pd.concat([ai_data_21, ai_data_20])
print(f"ai_data: {len(ai_data)} sentences")

# Persist each combined frame to its own parquet file (human first, then AI).
for combined, out_path in (
    (human_data, f"data/training_data/{name}/human_data_21_20.parquet"),
    (ai_data, f"data/training_data/{name}/ai_data_21_20.parquet"),
):
    combined.to_parquet(out_path)

human_data: 365359 sentences
ai_data: 771580 sentences


In [9]:
# Estimate the human and AI content distributions from the 2021 + 2020 training
# data. The third argument is the distribution file that MLE loads right after
# (presumably written by estimate_text_distribution — confirm in src.estimation).
estimate_text_distribution(
    f"data/training_data/{name}/human_data_21_20.parquet",
    f"data/training_data/{name}/ai_data_21_20.parquet",
    f"distribution/{name}_21_20.parquet",
)
# Load the word occurrence frequencies into the MLE framework.
model = MLE(f"distribution/{name}_21_20.parquet")

# Validate the method on mixed corpora with a known ground-truth alpha.
alphas = [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
evaluation_sets = [
    # guaranteed human-written evaluation set
    ("HUMAN EVALUATION SET", f"data/validation_data/{name}"),
    # possibly Google-translated evaluation set; the directory is derived from
    # `name` so it always belongs to the same dataset the model was fit on
    ("HUMAN + GOOGLE TRANSLATE (?) EVALUATION SET", f"data/validation_data/{name}/translated"),
]
for title, validation_dir in evaluation_sets:
    print(title)
    for alpha in alphas:
        # inference returns the estimated alpha and its confidence-interval value
        estimated, ci = model.inference(f"{validation_dir}/ground_truth_alpha_{alpha}.parquet")
        error = abs(estimated - alpha)
        # the header row is printed before every result row, as in the recorded output
        print(f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}")
        print(f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}")
    print("+---------------------------------+")

HUMAN EVALUATION SET
Ground Truth,Prediction,        CI,     Error
     0.000,     0.059,     0.001,     0.059
Ground Truth,Prediction,        CI,     Error
     0.025,     0.085,     0.001,     0.060
Ground Truth,Prediction,        CI,     Error
     0.050,     0.109,     0.002,     0.059
Ground Truth,Prediction,        CI,     Error
     0.075,     0.133,     0.002,     0.058
Ground Truth,Prediction,        CI,     Error
     0.100,     0.155,     0.002,     0.055
Ground Truth,Prediction,        CI,     Error
     0.125,     0.178,     0.002,     0.053
Ground Truth,Prediction,        CI,     Error
     0.150,     0.200,     0.002,     0.050
Ground Truth,Prediction,        CI,     Error
     0.175,     0.221,     0.002,     0.046
Ground Truth,Prediction,        CI,     Error
     0.200,     0.244,     0.002,     0.044


Training data: OJS 2021 + 2020 monolingual English Education research article abstracts + possibly Google-translated abstracts (n = 83,992)<br><br>
Validation data: OJS Jan - Nov 2022 English Education abstracts<br>
    Monolingual: 26,094 abstracts, 196,928 sentences<br>
    Translated: 17,385 abstracts, 119,340 sentences

In [None]:
# Load every training split that goes into the combined 2021 + 2020
# (monolingual + translated) corpus.
human_data_21 = pd.read_parquet(f"data/training_data/{name}/human_data_21.parquet")
human_data_translated_21 = pd.read_parquet(f"data/training_data/{name}/human_data_translated_21.parquet")
human_data_20 = pd.read_parquet(f"data/training_data/{name}/human_data_20.parquet")
human_data_translated_20 = pd.read_parquet(f"data/training_data/{name}/human_data_translated_20.parquet")
ai_data_21 = pd.read_parquet(f"data/training_data/{name}/ai_data_21.parquet")
# The translated AI data lives in the per-dataset directory like every other
# training file, so the path must include {name}.
ai_data_translated_21 = pd.read_parquet(f"data/training_data/{name}/ai_data_translated_21.parquet")
ai_data_20 = pd.read_parquet(f"data/training_data/{name}/ai_data_20.parquet")
# NOTE(review): the human side stacks human_data_translated_20 but no
# ai_data_translated_20 is loaded — confirm whether that file exists and
# should be included in ai_data as well.

# stack the data
human_data = pd.concat([human_data_21, human_data_translated_21, human_data_20, human_data_translated_20])
print(f"human_data: {human_data.shape[0]} sentences")
ai_data = pd.concat([ai_data_21, ai_data_translated_21, ai_data_20])
print(f"ai_data: {ai_data.shape[0]} sentences")

# save the combined frames to new parquet files
human_data.to_parquet(f"data/training_data/{name}/human_data_translated_21_20.parquet")
ai_data.to_parquet(f"data/training_data/{name}/ai_data_translated_21_20.parquet")

In [None]:
# Estimate the human and AI content distributions from the 2021 + 2020
# monolingual + translated training data. The third argument is the distribution
# file that MLE loads right after (presumably written by
# estimate_text_distribution — confirm in src.estimation).
estimate_text_distribution(
    f"data/training_data/{name}/human_data_translated_21_20.parquet",
    f"data/training_data/{name}/ai_data_translated_21_20.parquet",
    f"distribution/{name}_translated_21_20.parquet",
)
# Load the word occurrence frequencies into the MLE framework.
model = MLE(f"distribution/{name}_translated_21_20.parquet")

# Validate the method on mixed corpora with a known ground-truth alpha.
alphas = [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
evaluation_sets = [
    # guaranteed human-written evaluation set
    ("HUMAN EVALUATION SET", f"data/validation_data/{name}"),
    # possibly Google-translated evaluation set; the directory is derived from
    # `name` so it always belongs to the same dataset the model was fit on
    ("HUMAN + GOOGLE TRANSLATE (?) EVALUATION SET", f"data/validation_data/{name}/translated"),
]
for title, validation_dir in evaluation_sets:
    print(title)
    for alpha in alphas:
        # inference returns the estimated alpha and its confidence-interval value
        estimated, ci = model.inference(f"{validation_dir}/ground_truth_alpha_{alpha}.parquet")
        error = abs(estimated - alpha)
        # the header row is printed before every result row
        print(f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}")
        print(f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}")
    print("+---------------------------------+")

Training data: OJS 2021 possibly Google-translated English Education abstracts (n = 21,381)<br><br>
Validation data: OJS Jan - Nov 2022 English Education abstracts<br>
    Monolingual: 26,094 abstracts, 196,928 sentences<br>
    Translated: 17,385 abstracts, 119,340 sentences

In [None]:
# Load the 2021 translated human and AI training frames and report the row
# count of each (rows are reported as sentences).
human_translated_path_21 = f"data/training_data/{name}/human_data_translated_21.parquet"
human_data_translated_21 = pd.read_parquet(human_translated_path_21)
print(f"human_data_translated_21: {len(human_data_translated_21)} sentences")

ai_translated_path_21 = f"data/training_data/{name}/ai_data_translated_21.parquet"
ai_data_translated_21 = pd.read_parquet(ai_translated_path_21)
print(f"ai_data_translated_21: {len(ai_data_translated_21)} sentences")

In [None]:
# Estimate the human and AI content distributions from the 2021 translated
# training data. The third argument is the distribution file that MLE loads right
# after (presumably written by estimate_text_distribution — confirm in
# src.estimation).
estimate_text_distribution(
    f"data/training_data/{name}/human_data_translated_21.parquet",
    f"data/training_data/{name}/ai_data_translated_21.parquet",
    f"distribution/{name}_translated_21.parquet",
)
# Load the word occurrence frequencies into the MLE framework.
model = MLE(f"distribution/{name}_translated_21.parquet")

# Validate the method on mixed corpora with a known ground-truth alpha.
alphas = [0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
evaluation_sets = [
    # guaranteed human-written evaluation set
    ("HUMAN EVALUATION SET", f"data/validation_data/{name}"),
    # possibly Google-translated evaluation set; the directory is derived from
    # `name` so it always belongs to the same dataset the model was fit on
    ("HUMAN + GOOGLE TRANSLATE (?) EVALUATION SET", f"data/validation_data/{name}/translated"),
]
for title, validation_dir in evaluation_sets:
    print(title)
    for alpha in alphas:
        # inference returns the estimated alpha and its confidence-interval value
        estimated, ci = model.inference(f"{validation_dir}/ground_truth_alpha_{alpha}.parquet")
        error = abs(estimated - alpha)
        # the header row is printed before every result row
        print(f"{'Ground Truth':>10},{'Prediction':>10},{'CI':>10},{'Error':>10}")
        print(f"{alpha:10.3f},{estimated:10.3f},{ci:10.3f},{error:10.3f}")
    print("+---------------------------------+")