# Import Libraries

In [1]:
from tqdm import tqdm
import pandas as pd
import nltk
from nltk import word_tokenize
import string

from utils import load_sessions, read_session
from main import generate_buffer
from events import generate_event_seq
from summary import stats

# Compute Summary Statistics

In [2]:
# Build one row of summary statistics per writing session.
sessions = load_sessions()
# For a quick smoke test, run on a subset instead: sessions = load_sessions()[:10]

# Parallel per-session lists; entry i of each list describes the same session.
file_name = []
text = []
sentence_metrics_list = []
api_metrics_list = []

# Human-readable messages for sessions whose buffer could not be generated.
err = []

for sess in tqdm(sessions):
    session_name = sess.split('/')[-1]
    events = read_session(sess, verbose=0)
    try:
        text_buffer = generate_buffer(events)
    except Exception as exc:
        # Catch Exception rather than everything: a bare `except:` would also
        # swallow KeyboardInterrupt, making this long loop impossible to stop.
        # Keep the failure reason so skipped sessions can be diagnosed later.
        err.append(
            f"{session_name} is throwing an error! ({type(exc).__name__}: {exc})"
        )
        continue

    # Compute everything for this session first, then record it, so the four
    # lists can never end up with different lengths.
    final_text = text_buffer[-1]
    event_seq_dict = generate_event_seq(buffer=text_buffer,
                                        events=events)
    sentence_metrics, api_metrics = stats(event_seq_dict)

    file_name.append(session_name)
    text.append(final_text)
    sentence_metrics_list.append(sentence_metrics)
    api_metrics_list.append(api_metrics)

for e in err:
    print(e)

df = pd.DataFrame({"file_name": file_name, "text": text})

# One column per metric. The metric names are taken from the first session's
# result (assumes every session yields the same keys -- TODO confirm in
# summary.stats). Skip a list entirely when no session succeeded instead of
# failing with IndexError on `[0]`.
for metrics_list in (sentence_metrics_list, api_metrics_list):
    if not metrics_list:
        continue
    for col in metrics_list[0]:
        df[str(col)] = [x[col] for x in metrics_list]

Successfully downloaded 1447 writing sessions in CoAuthor!


  if ("gpt3-call" not in seq) and ("prompt" not in seq) and ("user" in seq):
  if ("gpt3-call" in seq) and ("user" not in seq):
  if "prompt" in temp_dict["sequence"][idx]:
  if ("gpt3-call" not in seq) and ("prompt" not in seq) and ("user" in seq):
  if ("gpt3-call" in seq) and ("user" not in seq):
  if "prompt" in temp_dict["sequence"][idx]:
  if ("gpt3-call" not in seq) and ("prompt" not in seq) and ("user" in seq):
  if ("gpt3-call" in seq) and ("user" not in seq):
  if "prompt" in temp_dict["sequence"][idx]:
  if ("gpt3-call" not in seq) and ("prompt" not in seq) and ("user" in seq):
  if ("gpt3-call" in seq) and ("user" not in seq):
  if "prompt" in temp_dict["sequence"][idx]:
  if ("gpt3-call" not in seq) and ("prompt" not in seq) and ("user" in seq):
  if ("gpt3-call" in seq) and ("user" not in seq):
  if ("gpt3-call" not in seq) and ("prompt" not in seq) and ("user" in seq):
  if ("gpt3-call" in seq) and ("user" not in seq):
100%|█████████████████████████████| 1447/1447 [21:59

312e3263a9f24f3184364949a42a6dfc.jsonl is throwing an error!





# Ratios

In [3]:
def get_ratio(num1, num2):
    """Return ``num1`` divided by ``num2`` as a built-in ``float``.

    No zero check is performed: a zero ``num2`` fails (or not) exactly as the
    ``/`` operator does for the operand types passed in.
    """
    quotient = num1 / num2
    return float(quotient)


def add(num1, num2):
    """Return the result of ``num1 + num2``."""
    total = num1 + num2
    return total

In [4]:
# GPT-3 : Total Sentences
# Per session: sentences completely authored by GPT-3 / total number of sentences.

df["GPT-3 : Total Sentences"] = [
    get_ratio(gpt3_only, total)
    for gpt3_only, total in zip(
        df["Number of sentences completely authored by GPT-3"],
        df["Total number of sentences"],
    )
]

df["GPT-3 : Total Sentences"]

0       0.000000
1       0.000000
2       0.000000
3       0.031250
4       0.000000
          ...   
1441    0.000000
1442    0.000000
1443    0.371429
1444    0.000000
1445    0.117647
Name: GPT-3 : Total Sentences, Length: 1446, dtype: float64

In [5]:
# User : Total Sentences
# Per session: sentences completely authored by the user / total number of sentences.

df["User : Total Sentences"] = [
    get_ratio(user_only, total)
    for user_only, total in zip(
        df["Number of sentences completely authored by the user"],
        df["Total number of sentences"],
    )
]

df["User : Total Sentences"]

0       0.400000
1       0.500000
2       0.818182
3       0.562500
4       0.648649
          ...   
1441    0.534483
1442    0.687500
1443    0.085714
1444    0.666667
1445    0.411765
Name: User : Total Sentences, Length: 1446, dtype: float64

In [6]:
# Amount of usage of GPT-3 (SD+SE/SA)
# Per session: (sentences authored by GPT-3 and user + sentences completely
# authored by GPT-3) / total number of sentences.

df["Amount of GTP-3 Usage"] = [
    get_ratio(add(shared, gpt3_only), total)
    for shared, gpt3_only, total in zip(
        df["Number of sentences authored by GPT-3 and user"],
        df["Number of sentences completely authored by GPT-3"],
        df["Total number of sentences"],
    )
]

df["Amount of GTP-3 Usage"]

0       0.333333
1       0.200000
2       0.136364
3       0.375000
4       0.270270
          ...   
1441    0.396552
1442    0.062500
1443    0.742857
1444    0.166667
1445    0.235294
Name: Amount of GTP-3 Usage, Length: 1446, dtype: float64

In [7]:
# Type Token Ratio

def get_ttr(text):
    """Return the type-token ratio of ``text``.

    The text is tokenized with ``word_tokenize``; tokens that are exactly one
    character from ``string.punctuation`` are discarded. The ratio is the
    number of distinct remaining tokens divided by the number of remaining
    tokens. Tokens are compared as-is, so the comparison is case-sensitive.

    Returns 0.0 when no tokens remain (empty or punctuation-only text)
    instead of raising ``ZeroDivisionError``.
    """
    sentence_tokens = word_tokenize(text)
    # A set gives constant-time membership tests.
    # NOTE(review): only single-character punctuation tokens can match here;
    # any multi-character punctuation token the tokenizer emits is kept and
    # counted -- confirm that this is intended.
    punctuations = set(string.punctuation)
    sentence_tokens_clean = [word for word in sentence_tokens if word not in punctuations]
    if not sentence_tokens_clean:
        return 0.0
    ttr = len(set(sentence_tokens_clean)) / len(sentence_tokens_clean)
    return ttr


# Type-token ratio of each session's final text.
df["Type Token Ratio"] = df["text"].map(get_ttr)
df["Type Token Ratio"]

0       0.564706
1       0.613559
2       0.548476
3       0.469974
4       0.491135
          ...   
1441    0.362854
1442    0.629758
1443    0.457584
1444    0.538710
1445    0.483871
Name: Type Token Ratio, Length: 1446, dtype: float64

# Mean

In [8]:
import numpy as np

# Mean of every raw metric column, grouped under its heading. The metric
# names come from the first session's sentence / API metrics.
for heading, metrics_list in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    for col in metrics_list[0]:
        print("Mean of", col, ":", np.mean(df[col]))

# Mean of every derived ratio column: (printed label, DataFrame column).
print("\nRatios")
for label, ratio_col in (
    ("Mean of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Mean of User / Total Sentences : ", "User : Total Sentences"),
    ("Mean of Type Token Ratio : ", "Type Token Ratio"),
    ("Mean of Amount of GTP-3 Usage : ", "Amount of GTP-3 Usage"),
):
    print(label, np.mean(df[ratio_col]))

Sentence Metrics
Mean of Total number of sentences : 28.96265560165975
Mean of Number of sentences of initial prompt : 4.421161825726141
Mean of Number of sentences completely authored by the user : 16.24273858921162
Mean of Number of sentences completely authored by GPT-3 : 0.6853388658367912
Mean of Number of sentences authored by GPT-3 and user : 7.6134163208852

API Metrics
Mean of Total number of GPT-3 calls made : 12.531120331950207
Mean of Number of times GPT-3 suggestion is used : 8.857538035961273
Mean of Number of times user rejected GPT-3 suggestion : 3.673582295988935
Mean of Number of times GPT-3 suggestion is modified : 1.586445366528354
Mean of Number of times GPT-3 suggestion is used as is : 7.271092669432918

Ratios
Mean of GPT-3 / Total Sentences :  0.021900159513948934
Mean of User / Total Sentences :  0.5412613590031221
Mean of Type Token Ratio :  0.48183805678712555
Mean of Amount of GTP-3 Usage :  0.2848898237822877


# Median

In [9]:
import numpy as np

# Median of every raw metric column, grouped under its heading. The metric
# names come from the first session's sentence / API metrics.
for heading, metrics_list in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    for col in metrics_list[0]:
        print("Median of", col, ":", np.median(df[col]))

# Median of every derived ratio column: (printed label, DataFrame column).
print("\nRatios")
for label, ratio_col in (
    ("Median of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Median of User / Total Sentences : ", "User : Total Sentences"),
    ("Median of Type Token Ratio : ", "Type Token Ratio"),
    ("Median of Amount of GTP-3 Usage : ", "Amount of GTP-3 Usage"),
):
    print(label, np.median(df[ratio_col]))

Sentence Metrics
Median of Total number of sentences : 27.0
Median of Number of sentences of initial prompt : 4.0
Median of Number of sentences completely authored by the user : 15.0
Median of Number of sentences completely authored by GPT-3 : 0.0
Median of Number of sentences authored by GPT-3 and user : 6.0

API Metrics
Median of Total number of GPT-3 calls made : 10.0
Median of Number of times GPT-3 suggestion is used : 7.0
Median of Number of times user rejected GPT-3 suggestion : 3.0
Median of Number of times GPT-3 suggestion is modified : 1.0
Median of Number of times GPT-3 suggestion is used as is : 5.0

Ratios
Median of GPT-3 / Total Sentences :  0.0
Median of User / Total Sentences :  0.5641025641025641
Median of Type Token Ratio :  0.4821182605273514
Median of Amount of GTP-3 Usage :  0.25


# Standard Deviation

In [10]:
import numpy as np

# Standard deviation (via np.std) of every raw metric column, grouped under
# its heading. The metric names come from the first session's metrics.
for heading, metrics_list in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    for col in metrics_list[0]:
        print("Standard Deviation of", col, ":", np.std(df[col]))

# Standard deviation of every derived ratio column: (printed label, column).
print("\nRatios")
for label, ratio_col in (
    ("Standard Deviation of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Standard Deviation of User / Total Sentences : ", "User : Total Sentences"),
    ("Standard Deviation of Type Token Ratio : ", "Type Token Ratio"),
    ("Standard Deviation of Amount of GTP-3 Usage : ", "Amount of GTP-3 Usage"),
):
    print(label, np.std(df[ratio_col]))

Sentence Metrics
Standard Deviation of Total number of sentences : 10.388909909258523
Standard Deviation of Number of sentences of initial prompt : 2.3909859029112486
Standard Deviation of Number of sentences completely authored by the user : 9.535179050568926
Standard Deviation of Number of sentences completely authored by GPT-3 : 1.8864423445191325
Standard Deviation of Number of sentences authored by GPT-3 and user : 5.953072577616293

API Metrics
Standard Deviation of Total number of GPT-3 calls made : 9.204158194377401
Standard Deviation of Number of times GPT-3 suggestion is used : 7.424057788661343
Standard Deviation of Number of times user rejected GPT-3 suggestion : 3.530339833311101
Standard Deviation of Number of times GPT-3 suggestion is modified : 1.796857239727531
Standard Deviation of Number of times GPT-3 suggestion is used as is : 7.233591709071116

Ratios
Standard Deviation of GPT-3 / Total Sentences :  0.05414721819670053
Standard Deviation of User / Total Sentences 

# Minimum

In [11]:
import numpy as np

# Minimum of every raw metric column, grouped under its heading. The metric
# names come from the first session's sentence / API metrics.
for heading, metrics_list in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    for col in metrics_list[0]:
        print("Minimum of", col, ":", np.min(df[col]))

# Minimum of every derived ratio column: (printed label, DataFrame column).
print("\nRatios")
for label, ratio_col in (
    ("Minimum of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Minimum of User / Total Sentences : ", "User : Total Sentences"),
    ("Minimum of Type Token Ratio : ", "Type Token Ratio"),
    ("Minimum of Amount of GTP-3 Usage : ", "Amount of GTP-3 Usage"),
):
    print(label, np.min(df[ratio_col]))

Sentence Metrics
Minimum of Total number of sentences : 11
Minimum of Number of sentences of initial prompt : 0
Minimum of Number of sentences completely authored by the user : 0
Minimum of Number of sentences completely authored by GPT-3 : 0
Minimum of Number of sentences authored by GPT-3 and user : 0

API Metrics
Minimum of Total number of GPT-3 calls made : 0
Minimum of Number of times GPT-3 suggestion is used : 0
Minimum of Number of times user rejected GPT-3 suggestion : 0
Minimum of Number of times GPT-3 suggestion is modified : 0
Minimum of Number of times GPT-3 suggestion is used as is : 0

Ratios
Minimum of GPT-3 / Total Sentences :  0.0
Minimum of User / Total Sentences :  0.0
Minimum of Type Token Ratio :  0.28794178794178793
Minimum of Amount of GTP-3 Usage :  0.0


# Maximum

In [12]:
import numpy as np

# Maximum of every raw metric column, grouped under its heading. The metric
# names come from the first session's sentence / API metrics.
for heading, metrics_list in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    for col in metrics_list[0]:
        print("Maximum of", col, ":", np.max(df[col]))

# Maximum of every derived ratio column: (printed label, DataFrame column).
print("\nRatios")
for label, ratio_col in (
    ("Maximum of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Maximum of User / Total Sentences : ", "User : Total Sentences"),
    ("Maximum of Type Token Ratio : ", "Type Token Ratio"),
    ("Maximum of Amount of GTP-3 Usage : ", "Amount of GTP-3 Usage"),
):
    print(label, np.max(df[ratio_col]))

Sentence Metrics
Maximum of Total number of sentences : 78
Maximum of Number of sentences of initial prompt : 9
Maximum of Number of sentences completely authored by the user : 64
Maximum of Number of sentences completely authored by GPT-3 : 22
Maximum of Number of sentences authored by GPT-3 and user : 42

API Metrics
Maximum of Total number of GPT-3 calls made : 65
Maximum of Number of times GPT-3 suggestion is used : 47
Maximum of Number of times user rejected GPT-3 suggestion : 24
Maximum of Number of times GPT-3 suggestion is modified : 10
Maximum of Number of times GPT-3 suggestion is used as is : 47

Ratios
Maximum of GPT-3 / Total Sentences :  0.6111111111111112
Maximum of User / Total Sentences :  0.9629629629629629
Maximum of Type Token Ratio :  0.6879432624113475
Maximum of Amount of GTP-3 Usage :  0.8666666666666667


# Correlation

In [13]:
# Pairwise correlation of the numeric columns only. `df` also holds the string
# columns "file_name" and "text"; relying on corr() to drop them implicitly is
# deprecated and raises on newer pandas, so select the numeric columns
# explicitly (this form works on old and new pandas versions alike).
df.select_dtypes(include="number").corr()

  df.corr()


Unnamed: 0,Total number of sentences,Number of sentences of initial prompt,Number of sentences completely authored by the user,Number of sentences completely authored by GPT-3,Number of sentences authored by GPT-3 and user,Total number of GPT-3 calls made,Number of times GPT-3 suggestion is used,Number of times user rejected GPT-3 suggestion,Number of times GPT-3 suggestion is modified,Number of times GPT-3 suggestion is used as is,GPT-3 : Total Sentences,User : Total Sentences,Amount of GTP-3 Usage,Type Token Ratio
Total number of sentences,1.0,-0.129523,0.773865,0.218429,0.488421,0.3651,0.454037,-0.002934,0.05189,0.453102,0.090754,0.265854,0.025049,-0.527044
Number of sentences of initial prompt,-0.129523,1.0,-0.300117,-0.062613,-0.127129,-0.067734,-0.096746,0.026855,-0.056362,-0.085293,-0.060789,-0.377381,-0.11684,0.034719
Number of sentences completely authored by the user,0.773865,-0.300117,1.0,-0.083143,-0.104341,-0.218031,-0.167396,-0.21642,-0.100297,-0.14689,-0.181537,0.779995,-0.521486,-0.291106
Number of sentences completely authored by GPT-3,0.218429,-0.062613,-0.083143,1.0,0.222622,0.422258,0.451338,0.151763,0.199498,0.413665,0.945054,-0.281658,0.403351,-0.171809
Number of sentences authored by GPT-3 and user,0.488421,-0.127129,-0.104341,0.222622,1.0,0.879771,0.956312,0.282646,0.210622,0.929173,0.174091,-0.544562,0.798102,-0.412992
Total number of GPT-3 calls made,0.3651,-0.067734,-0.218031,0.422258,0.879771,1.0,0.93199,0.647249,0.192459,0.908722,0.395012,-0.610905,0.806284,-0.358701
Number of times GPT-3 suggestion is used,0.454037,-0.096746,-0.167396,0.451338,0.956312,0.93199,1.0,0.326916,0.225656,0.970277,0.407231,-0.609296,0.847576,-0.404455
Number of times user rejected GPT-3 suggestion,-0.002934,0.026855,-0.21642,0.151763,0.282646,0.647249,0.326916,1.0,0.027233,0.328759,0.17348,-0.311419,0.319716,-0.084651
Number of times GPT-3 suggestion is modified,0.05189,-0.056362,-0.100297,0.199498,0.210622,0.192459,0.225656,0.027233,1.0,-0.016807,0.184228,-0.200982,0.269933,-0.053334
Number of times GPT-3 suggestion is used as is,0.453102,-0.085293,-0.14689,0.413665,0.929173,0.908722,0.970277,0.328759,-0.016807,1.0,0.372191,-0.575415,0.802841,-0.401856


# View DataFrame

In [14]:
# Preview the first five sessions together with all computed columns.
df.head(n=5)

Unnamed: 0,file_name,text,Total number of sentences,Number of sentences of initial prompt,Number of sentences completely authored by the user,Number of sentences completely authored by GPT-3,Number of sentences authored by GPT-3 and user,Total number of GPT-3 calls made,Number of times GPT-3 suggestion is used,Number of times user rejected GPT-3 suggestion,Number of times GPT-3 suggestion is modified,Number of times GPT-3 suggestion is used as is,GPT-3 : Total Sentences,User : Total Sentences,Amount of GTP-3 Usage,Type Token Ratio
0,8c11358444974bf0b5224183acd8149d.jsonl,What Stereotypical Characters Make You Cringe?...,15,4,6,0,5,5,5,0,3,2,0.0,0.4,0.333333,0.564706
1,c7dc5563ed07478f9284190b6085f4d3.jsonl,How Worried Should We Be About Screen Time Dur...,20,6,10,0,4,6,4,2,3,1,0.0,0.5,0.2,0.613559
2,05a000131fc642f7bb20b62bb20a326e.jsonl,"All of the ""#1 Dad"" mugs in the world change t...",22,1,18,0,3,6,3,3,3,0,0.0,0.818182,0.136364,0.548476
3,7834dec912b34643afb92b7c3648a3fe.jsonl,"When you die, you appear in a cinema with a nu...",32,2,18,1,11,12,12,0,0,12,0.03125,0.5625,0.375,0.469974
4,105bf88bb4bc42688e06a54644e2989b.jsonl,"When you're 28, science discovers a drug that ...",37,3,24,0,10,13,10,3,0,10,0.0,0.648649,0.27027,0.491135


# Export to CSV

In [15]:
# Persist the per-session statistics to the working directory. The DataFrame
# index is written too (to_csv's default), so the file starts with an
# unnamed index column.
df.to_csv("writing_session_stats.csv")

# Write Sentences to Text Files

In [19]:
# Disabled one-off export: writes each session's final text to
# taaco-input-texts/<session id>.txt. Presumably these files were the input
# of the TAACO run whose results are loaded below -- TODO confirm.
# NOTE(review): if re-enabled, the loop variables `file_name` and `text`
# rebind the lists of the same name built in the summary-statistics cell,
# and the taaco-input-texts/ directory must already exist.
# for file_name, text in zip(df["file_name"], df["text"]):
#     file_name = file_name.split('.')[0] # Extract only the name and not the 'jsonl' part
#     file_name = "taaco-input-texts/" + file_name + ".txt"
#     with open(file_name, "w") as f:
#         f.write(text)

# Read TAACO Metrics CSV

In [45]:
# Load the TAACO metrics table and preview its first rows.
# NOTE(review): "results.csv" is read from the working directory and is not
# produced by this notebook -- presumably TAACO's output file; confirm its
# provenance.
taaco_df = pd.read_csv("results.csv")
taaco_df.head()

Unnamed: 0,Filename,lemma_ttr,lemma_mattr,lexical_density_tokens,lexical_density_types,content_ttr,verb_ttr,argument_ttr,bigram_lemma_ttr,trigram_lemma_ttr,...,negative_logical,all_temporal,positive_intentional,all_positive,all_negative,all_connective,pronoun_density,pronoun_noun_ratio,repeated_content_lemmas,repeated_content_and_pronoun_lemmas
0,007769c9000e457eae8485221041802d.txt,0.378713,0.683549,0.475248,0.751634,0.598958,0.5,0.31405,0.841191,0.965174,...,0.014851,0.014851,0.012376,0.061881,0.014851,0.066832,0.215347,2.023256,0.287129,0.502475
1,00bf170a815a42359f3aef35f5674ddc.txt,0.47185,0.803765,0.541555,0.75,0.653465,0.603448,0.610169,0.935484,0.991914,...,0.002681,0.008043,0.0,0.037534,0.008043,0.034853,0.0563,0.21,0.262735,0.313673
2,00d39011efcb4533ab12076801f74f42.txt,0.326816,0.638641,0.444134,0.717949,0.528302,0.587302,0.287037,0.789916,0.946629,...,0.011173,0.01676,0.002793,0.053073,0.011173,0.067039,0.148045,0.883333,0.301676,0.444134
3,0139e814be15409dbab46c2d2d9ca07f.txt,0.422131,0.753075,0.508197,0.762136,0.633065,0.571429,0.464286,0.86653,0.969136,...,0.004098,0.010246,0.010246,0.079918,0.010246,0.067623,0.131148,0.566372,0.272541,0.401639
4,01650a401e614c38a04a904165a5784f.txt,0.503571,0.715758,0.478571,0.70922,0.746269,0.666667,0.630952,0.874552,0.960432,...,0.007143,0.010714,0.0,0.053571,0.007143,0.046429,0.028571,0.126984,0.189286,0.196429


# Append TAACO Metrics to Original DataFrame 

In [46]:
# Derive the join key: the part of "Filename" before the first dot (this
# drops the ".txt" extension). The original "Filename" column is then
# removed in place.
taaco_df["file_name"] = taaco_df["Filename"].map(
    lambda filename: filename.partition(".")[0]
)
taaco_df.drop(columns=["Filename"], inplace=True)

In [47]:
# Build the merge input as a new frame. A plain `new_df = df` only creates a
# second name for the same object, so rewriting "file_name" through it would
# silently strip the extension from `df` as well. `assign` returns a new
# DataFrame and leaves `df` untouched.
new_df = df.assign(
    # Keep the part before the first dot (drops the ".jsonl" extension) so the
    # key has the same form as taaco_df["file_name"].
    file_name=df["file_name"].apply(lambda x: x.split(".")[0])
)

In [48]:
# Inner join on the session id: sessions without a matching TAACO row (and
# TAACO rows without a matching session) are dropped.
new_df = new_df.merge(taaco_df, how="inner", on="file_name")
new_df.head()

Unnamed: 0,file_name,text,Total number of sentences,Number of sentences of initial prompt,Number of sentences completely authored by the user,Number of sentences completely authored by GPT-3,Number of sentences authored by GPT-3 and user,Total number of GPT-3 calls made,Number of times GPT-3 suggestion is used,Number of times user rejected GPT-3 suggestion,...,negative_logical,all_temporal,positive_intentional,all_positive,all_negative,all_connective,pronoun_density,pronoun_noun_ratio,repeated_content_lemmas,repeated_content_and_pronoun_lemmas
0,8c11358444974bf0b5224183acd8149d,What Stereotypical Characters Make You Cringe?...,15,4,6,0,5,5,5,0,...,0.003937,0.011811,0.015748,0.07874,0.023622,0.094488,0.055118,0.245614,0.311024,0.358268
1,c7dc5563ed07478f9284190b6085f4d3,How Worried Should We Be About Screen Time Dur...,20,6,10,0,4,6,4,2,...,0.010204,0.017007,0.003401,0.07483,0.013605,0.088435,0.017007,0.058824,0.22449,0.22449
2,05a000131fc642f7bb20b62bb20a326e,"All of the ""#1 Dad"" mugs in the world change t...",22,1,18,0,3,6,3,3,...,0.002681,0.021448,0.002681,0.061662,0.010724,0.064343,0.045576,0.186813,0.22252,0.265416
3,7834dec912b34643afb92b7c3648a3fe,"When you die, you appear in a cinema with a nu...",32,2,18,1,11,12,12,0,...,0.007833,0.028721,0.005222,0.075718,0.007833,0.075718,0.065274,0.409836,0.263708,0.326371
4,105bf88bb4bc42688e06a54644e2989b,"When you're 28, science discovers a drug that ...",37,3,24,0,10,13,10,3,...,0.01773,0.019504,0.005319,0.072695,0.019504,0.072695,0.078014,0.478261,0.241135,0.31383


# Divide into High-usage and Low-usage Groups

In [50]:
# Report the median usage value; it is the threshold used to split sessions
# into the high-usage and low-usage groups.
print(
    "Median of Amount of GPT-3 Usage:",
    np.median(new_df.loc[:, "Amount of GTP-3 Usage"]),
)

Median of Amount of GPT-3 Usage: 0.25


In [52]:
# High-usage group: sessions whose usage is strictly above the median.
df_high = new_df.loc[
    new_df["Amount of GTP-3 Usage"].gt(np.median(new_df["Amount of GTP-3 Usage"]))
]

In [53]:
# Low-usage group: sessions whose usage is at or below the median.
df_low = new_df.loc[
    new_df["Amount of GTP-3 Usage"].le(np.median(new_df["Amount of GTP-3 Usage"]))
]

# Print High-usage Group Metrics

In [62]:
# Mean of each column that exists in the high-usage group but not in `df`,
# i.e. the columns added by the merge.
for col in df_high.columns:
    if col not in df.columns:
        print("Mean", col, ":", np.mean(df_high[col]))

Mean lemma_ttr : 0.39611582719532457
Mean lemma_mattr : 0.7182977377160918
Mean lexical_density_tokens : 0.4926949109058426
Mean lexical_density_types : 0.7296625080290305
Mean content_ttr : 0.586431661330968
Mean verb_ttr : 0.578616811517585
Mean argument_ttr : 0.42991633349094843
Mean bigram_lemma_ttr : 0.8239844002802577
Mean trigram_lemma_ttr : 0.937303613016993
Mean adjacent_overlap_all_sent : 0.2209816823547981
Mean adjacent_overlap_all_sent_div_seg : 3.093252830333542
Mean adjacent_overlap_binary_all_sent : 0.8559451530767535
Mean adjacent_overlap_cw_sent : 0.1177543106766741
Mean adjacent_overlap_cw_sent_div_seg : 0.9113462303415202
Mean adjacent_overlap_binary_cw_sent : 0.4937550454177287
Mean adjacent_overlap_verb_sent : 0.11226794017783301
Mean adjacent_overlap_verb_sent_div_seg : 0.27415818687174015
Mean adjacent_overlap_binary_verb_sent : 0.23139950351386926
Mean adjacent_overlap_argument_sent : 0.2216078188370723
Mean adjacent_overlap_argument_sent_div_seg : 0.93535281050

# Print Low-usage Group Metrics

In [63]:
# Mean of each column that exists in the low-usage group but not in `df`,
# i.e. the columns added by the merge.
for col in df_low.columns:
    if col not in df.columns:
        print("Mean", col, ":", np.mean(df_low[col]))

Mean lemma_ttr : 0.40644688188353156
Mean lemma_mattr : 0.7358031241203737
Mean lexical_density_tokens : 0.4985475465362733
Mean lexical_density_types : 0.7250907969085123
Mean content_ttr : 0.5914766065546153
Mean verb_ttr : 0.575548092035228
Mean argument_ttr : 0.4392265208983434
Mean bigram_lemma_ttr : 0.8510042890397267
Mean trigram_lemma_ttr : 0.955060558344478
Mean adjacent_overlap_all_sent : 0.20340912779507023
Mean adjacent_overlap_all_sent_div_seg : 2.791734198145305
Mean adjacent_overlap_binary_all_sent : 0.8361618813735054
Mean adjacent_overlap_cw_sent : 0.10753751800784793
Mean adjacent_overlap_cw_sent_div_seg : 0.8208818073126567
Mean adjacent_overlap_binary_cw_sent : 0.45866299916347214
Mean adjacent_overlap_verb_sent : 0.10261800591127875
Mean adjacent_overlap_verb_sent_div_seg : 0.24924108947483087
Mean adjacent_overlap_binary_verb_sent : 0.2143809277784411
Mean adjacent_overlap_argument_sent : 0.2053213175200705
Mean adjacent_overlap_argument_sent_div_seg : 0.842961819

# Collect Group Metrics into New DataFrame

In [73]:
# Start from an empty frame for the per-metric comparison of the two groups.
group_metrics = pd.DataFrame()

In [74]:
# Parallel lists: metric name, its low-usage-group mean, its high-usage-group mean.
metrics, low_group_val, high_group_val = [], [], []

In [75]:
# Rebuild the lists and the frame from scratch on every run. Appending to
# lists created in another cell made this cell non-idempotent: running it a
# second time duplicated every entry and then failed on a length mismatch.

# Columns present in the merged frame but not in `df`, i.e. the columns the
# merge added.
metrics = [col for col in new_df.columns if col not in df.columns]
low_group_val = [np.mean(df_low[col]) for col in metrics]
high_group_val = [np.mean(df_high[col]) for col in metrics]

group_metrics = pd.DataFrame({
    "Metric": metrics,
    "Low Group Value (Mean)": low_group_val,
    "High Group Value (Mean)": high_group_val,
})

# The DataFrame index is written as well (to_csv's default).
group_metrics.to_csv("Group Metrics.csv")

In [76]:
# Display the low-usage vs. high-usage comparison table.
group_metrics

Unnamed: 0,Metric,Low Group Value (Mean),High Group Value (Mean)
0,lemma_ttr,0.406447,0.396116
1,lemma_mattr,0.735803,0.718298
2,lexical_density_tokens,0.498548,0.492695
3,lexical_density_types,0.725091,0.729663
4,content_ttr,0.591477,0.586432
5,verb_ttr,0.575548,0.578617
6,argument_ttr,0.439227,0.429916
7,bigram_lemma_ttr,0.851004,0.823984
8,trigram_lemma_ttr,0.955061,0.937304
9,adjacent_overlap_all_sent,0.203409,0.220982
