# Import Libraries

In [None]:
from tqdm import tqdm
import pandas as pd
import nltk
from nltk import word_tokenize
import string

from utils import load_sessions, read_session
from main import generate_buffer
from events import generate_event_seq
from summary import stats

# Compute summary statistics

In [None]:
# Build one DataFrame row per writing session: the session file name, the
# text taken from the end of its buffer, and every sentence/API metric
# returned by `stats`.
sessions = load_sessions()
# sessions = load_sessions()[:10]  # uncomment to try the pipeline on a subset

file_name = []
text = []
sentence_metrics_list = []
api_metrics_list = []

# Human-readable messages for sessions whose buffer could not be generated.
err = []

for sess in tqdm(sessions):
    events = read_session(sess, verbose=0)
    session_name = sess.split('/')[-1]
    try:
        text_buffer = generate_buffer(events)
    except Exception as exc:
        # Catch `Exception` rather than using a bare `except:` so that
        # Ctrl-C (KeyboardInterrupt) still stops the loop, and keep the
        # cause so the skipped session can be diagnosed afterwards.
        err.append(
            str(session_name) + " is throwing an error! (" + repr(exc) + ")"
        )
        continue
    file_name.append(session_name)
    # Last buffer entry; presumably the final state of the text -- confirm
    # against generate_buffer.
    text.append(text_buffer[-1])
    event_seq_dict = generate_event_seq(buffer=text_buffer,
                                        events=events)
    sentence_metrics, api_metrics = stats(event_seq_dict)
    sentence_metrics_list.append(sentence_metrics)
    api_metrics_list.append(api_metrics)

for e in err:
    print(e)

df = pd.DataFrame({"file_name": file_name, "text": text})

# The first session's metrics supply the column names. Both lists are
# appended to together, so one emptiness check covers both; it avoids an
# IndexError when no session could be processed.
if sentence_metrics_list:
    for col in sentence_metrics_list[0]:
        df[str(col)] = [x[col] for x in sentence_metrics_list]

    for col in api_metrics_list[0]:
        df[str(col)] = [x[col] for x in api_metrics_list]

# Ratios

In [None]:
def get_ratio(num1, num2):
    """Return ``num1 / num2`` as a float.

    Args:
        num1: Numerator.
        num2: Denominator.

    Returns:
        The quotient as a ``float``, or NaN when ``num2`` is zero. The
        ratio is undefined in that case, and NaN is the value pandas
        treats as missing, so one empty row does not abort the whole
        column computation with ``ZeroDivisionError``.
    """
    if num2 == 0:
        return float("nan")
    return float(num1 / num2)

In [None]:
# GPT-3 : Total Sentences
# Row-wise ratio of the GPT-3-authored sentence count to the total count.
gpt3_sentence_counts = df["Number of sentences completely authored by GPT-3"]
all_sentence_counts = df["Total number of sentences"]

df["GPT-3 : Total Sentences"] = [
    get_ratio(authored, total)
    for authored, total in zip(gpt3_sentence_counts, all_sentence_counts)
]

# Trailing expression so the notebook displays the new column.
df["GPT-3 : Total Sentences"]

In [None]:
# User : Total Sentences
# Row-wise ratio of the user-authored sentence count to the total count.
user_sentence_counts = df["Number of sentences completely authored by the user"]
all_sentence_counts = df["Total number of sentences"]

df["User : Total Sentences"] = [
    get_ratio(authored, total)
    for authored, total in zip(user_sentence_counts, all_sentence_counts)
]

# Trailing expression so the notebook displays the new column.
df["User : Total Sentences"]

In [None]:
# Type Token Ratio

def get_ttr(text):
    """Return the type-token ratio of ``text``.

    The text is split with ``word_tokenize``; tokens that are exactly one
    character from ``string.punctuation`` are discarded. The ratio is the
    number of distinct remaining tokens divided by the number of remaining
    tokens. Tokens are compared as-is (no case folding).

    Args:
        text: The string to analyse.

    Returns:
        The ratio as a ``float``, or NaN when no tokens remain (empty or
        punctuation-only text), where the ratio is undefined and the
        division would otherwise raise ``ZeroDivisionError``.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    punctuation_marks = set(string.punctuation)
    words = [
        token for token in word_tokenize(text)
        if token not in punctuation_marks
    ]
    if not words:
        return float("nan")
    return len(set(words)) / len(words)


# Compute the type-token ratio of every session's text, one value per row.
session_texts = df["text"]
df["Type Token Ratio"] = session_texts.map(get_ttr)
# Trailing expression so the notebook displays the new column.
df["Type Token Ratio"]

# Mean

In [None]:
import numpy as np

# Print the mean of every metric column, one metric family at a time.
for heading, per_session_metrics in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    # The first session's entry supplies the column names of the family.
    for metric_name in per_session_metrics[0]:
        print("Mean of", metric_name, ":", np.mean(df[metric_name]))

print("\nRatios")
for label, ratio_column in (
    ("Mean of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Mean of User / Total Sentences : ", "User : Total Sentences"),
    ("Mean of Type Token Ratio : ", "Type Token Ratio"),
):
    print(label, np.mean(df[ratio_column]))

# Median

In [None]:
import numpy as np

# Print the median of every metric column, one metric family at a time.
for heading, per_session_metrics in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    # The first session's entry supplies the column names of the family.
    for metric_name in per_session_metrics[0]:
        print("Median of", metric_name, ":", np.median(df[metric_name]))

print("\nRatios")
for label, ratio_column in (
    ("Median of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Median of User / Total Sentences : ", "User : Total Sentences"),
    ("Median of Type Token Ratio : ", "Type Token Ratio"),
):
    print(label, np.median(df[ratio_column]))

# Standard Deviation

In [None]:
import numpy as np

# Print the standard deviation of every metric column, family by family.
for heading, per_session_metrics in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    # The first session's entry supplies the column names of the family.
    for metric_name in per_session_metrics[0]:
        print("Standard Deviation of", metric_name, ":", np.std(df[metric_name]))

print("\nRatios")
for label, ratio_column in (
    ("Standard Deviation of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Standard Deviation of User / Total Sentences : ", "User : Total Sentences"),
    ("Standard Deviation of Type Token Ratio : ", "Type Token Ratio"),
):
    print(label, np.std(df[ratio_column]))

# Minimum

In [None]:
import numpy as np

# Print the minimum of every metric column, one metric family at a time.
for heading, per_session_metrics in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    # The first session's entry supplies the column names of the family.
    for metric_name in per_session_metrics[0]:
        print("Minimum of", metric_name, ":", np.min(df[metric_name]))

print("\nRatios")
for label, ratio_column in (
    ("Minimum of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Minimum of User / Total Sentences : ", "User : Total Sentences"),
    ("Minimum of Type Token Ratio : ", "Type Token Ratio"),
):
    print(label, np.min(df[ratio_column]))

# Maximum

In [None]:
import numpy as np

# Print the maximum of every metric column, one metric family at a time.
for heading, per_session_metrics in (
    ("Sentence Metrics", sentence_metrics_list),
    ("\nAPI Metrics", api_metrics_list),
):
    print(heading)
    # The first session's entry supplies the column names of the family.
    for metric_name in per_session_metrics[0]:
        print("Maximum of", metric_name, ":", np.max(df[metric_name]))

print("\nRatios")
for label, ratio_column in (
    ("Maximum of GPT-3 / Total Sentences : ", "GPT-3 : Total Sentences"),
    ("Maximum of User / Total Sentences : ", "User : Total Sentences"),
    ("Maximum of Type Token Ratio : ", "Type Token Ratio"),
):
    print(label, np.max(df[ratio_column]))

# Correlation

In [None]:
# Correlate only the numeric columns. df also holds the string columns
# "file_name" and "text", which make DataFrame.corr() raise on pandas >= 2.0
# (numeric_only defaults to False there). Selecting the numeric columns
# explicitly behaves the same on every pandas version.
df.select_dtypes(include="number").corr()

# Export to CSV

In [None]:
# Persist the per-session statistics table as a CSV file in the working
# directory.
stats_csv_path = "writing_session_stats.csv"
df.to_csv(stats_csv_path)