This notebook helps with annotating the data: creating reference summaries.
It filters out entities with fewer than 4 reviews and picks random ones for annotation.

In [1]:
import pandas as pd
import numpy as np
import json
from nltk.tokenize import word_tokenize
import random
import os.path
import ipywidgets as widgets

In [2]:
# Input/output locations. Each base directory can be overridden through an
# environment variable so the notebook is not tied to a single machine; the
# defaults are the paths this notebook was originally written against.
data_location = os.environ.get(
    "CULPA_CATALOG_DATA_DIR",
    "/Users/soid/Dropbox/Code/2020/columbia-catalog-data",
)
cusum_location = os.environ.get(
    "CULPA_CUSUM_DIR",
    "/Users/soid/Dropbox/Code/2022/nlg-project/CuSum",
)

# source file: instructors together with their reviews
instructors_file = data_location + '/instructors/instructors.json'
# entities still to be summarised: pretty-printed JSON and one-object-per-line variants
dataset_file       = cusum_location + "/culpa.json"
dataset_file_jsonl = cusum_location + "/culpa.jsonl"
# entities that already have a reference summary, one JSON object per line
dataset_test_jsonl = cusum_location + "/culpa.test.jsonl"

In [3]:
# read instructors list from columbia-catalog-data
# `with` closes the file even if reading or parsing fails; the JSON text is
# decoded as UTF-8 instead of whatever the platform default encoding is.
with open(instructors_file, 'r', encoding='utf-8') as f:
    instructors = json.load(f)

In [4]:
# read created references: name -> row that includes summary
references = {}  # dict keyed by entity name; values are the full annotated rows
if os.path.exists(dataset_test_jsonl):
    # `with` closes the file even if one of the lines fails to parse
    with open(dataset_test_jsonl, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline left by manual editing)
                continue
            row = json.loads(line)
            references[row['name']] = row

In [5]:
# process columbia-catalog-data
# Builds `result`: one {'name': ..., 'reviews': [...]} dict per instructor that
# has reviews, where every review is lower-cased text followed by its workload.
min_reviews_threshold = 1
result = []
for instructor in instructors:
    instructor_name = instructor['name']
    culpa_reviews = instructor['culpa_reviews']
    if not culpa_reviews:
        # nothing to summarise for this instructor
        continue
    # review body, then ". Workload: " and the workload text when there is one;
    # word_tokenize-based re-joining of the text is not applied here
    review_texts = [
        (review['text'] + ". Workload: " + (review['workload'] or "")).lower()
        for review in culpa_reviews
    ]
    if len(review_texts) < min_reviews_threshold:
        continue
    result.append({'name': instructor_name, 'reviews': review_texts})


In [44]:
# filter out references from training data
def filter_referenced(result, references):
    """Return the rows of `result` whose name has no entry in `references`.

    result: iterable of dicts, each with a 'name' key.
    references: mapping whose keys are the names to exclude.

    A new list is returned; `result` itself is left unmodified.
    """
    remaining = []
    for entry in result:
        if entry['name'] in references.keys():
            # already has a reference: leave it out
            continue
        remaining.append(entry)
    return remaining
# drop entities that already have a reference; rebinds `result` to the filtered list
result = filter_referenced(result, references)

In [47]:
name = None
# uncomment to edit specific name
# name = "Patricia D Denison"
edit_summary = 0  # index of the summary to pre-fill; TODO support editing multiple summaries
min_reviews_for_annotation = 4  # random draws only consider entities with at least this many reviews

if name:
    if name in references:
        selected_row = references[name]
    else:
        # look the name up among the entities that have no reference yet
        selected_row = next((rv for rv in result if rv['name'] == name), None)
        if selected_row is None:
            # fail with a readable message instead of a bare StopIteration
            raise ValueError("No entity named %r found in references or result" % name)
else:
    # draw random professor for writing reference review
    selection = [rv for rv in result if len(rv['reviews']) >= min_reviews_for_annotation]
    if not selection:
        # random.choice on an empty list would raise an opaque IndexError
        raise ValueError(
            "No entity without a reference has at least %d reviews" % min_reviews_for_annotation
        )
    selected_row = random.choice(selection)
    name = selected_row['name']
print("name:", name)
print("URL:", "https://peqod.com/prof/" + name.replace(" ", "_"))

# pre-fill the text area with the existing summary when the entity already has one
ref = ""
if name in references:
    existing_summaries = references[name].get('summary') or []
    if len(existing_summaries) > 1:
        print("Name", name, "has more than one reference")
    # only index when edit_summary is valid; otherwise start from an empty text area
    if -len(existing_summaries) <= edit_summary < len(existing_summaries):
        ref = existing_summaries[edit_summary]

textarea = widgets.Textarea(
    value=ref,
    placeholder='Type reference summary for above professor %s' % name,
    description='Reference:',
    rows=5, layout={'width': '80%'},
    disabled=False
)
btn = widgets.Button(
    description='Submit',
    disabled=False
)
# area where the submit handler prints its confirmation
output = widgets.Output()

def on_button_clicked(b):
    """Store the text area content as a reference summary and persist all references.

    b: the argument supplied by the button click callback (unused).

    The summary at index `edit_summary` is replaced when it exists; otherwise the
    text is appended. Other summaries of the same entity are kept, so editing one
    summary no longer discards the rest. The whole `references` mapping is then
    rewritten to `dataset_test_jsonl`, one JSON object per line.
    """
    references[name] = selected_row
    # copy so that a failure before assignment leaves the stored list untouched
    summaries = list(selected_row.get('summary') or [])
    if -len(summaries) <= edit_summary < len(summaries):
        summaries[edit_summary] = textarea.value
    else:
        summaries.append(textarea.value)
    references[name]['summary'] = summaries

    # save json per line; `with` closes the file even if serialisation fails
    with open(dataset_test_jsonl, "w") as f:
        for row in references.values():
            f.write(json.dumps(row) + "\n")

    with output:
        print("Reference created")

# register the submit handler on the button
btn.on_click(on_button_clicked)
# last expression of the cell: a vertical container holding the text area,
# the submit button and the output area
widgets.VBox([textarea, btn, output])


name: Sabrina M Jhanwar
URL: https://peqod.com/prof/Sabrina_M_Jhanwar


VBox(children=(Textarea(value='', description='Reference:', layout=Layout(width='80%'), placeholder='Type refe…

In [48]:
# filter out referenced profs
# (entities annotated in the cell above must not remain in the training data)
result = filter_referenced(result, references)

# save pretty json; `with` closes the file even if serialisation fails
with open(dataset_file, "w") as f:
    json.dump(result, f, indent=2)

# save json per line
with open(dataset_file_jsonl, "w") as f:
    for obj in result:
        f.write(json.dumps(obj) + "\n")

In [49]:
# calculate summaries
# number of reviews per remaining entity, computed once and reused below
review_counts = [len(row['reviews']) for row in result]

print("Total entities for summarization:", len(result))
print("Total reference summaries:", len(references))
total_reviews = sum(review_counts)
print("Total reviews:", total_reviews)

# same figures restricted to entities with at least 4 reviews
total_sum4 = sum(1 for count in review_counts if count >= 4)
print("Total summaries with #reviews >= 4:", total_sum4)
total_reviews4 = sum(count for count in review_counts if count >= 4)
print("Total reviews for summaries with #reviews >= 4:", total_reviews4)

# token count over every review text, as produced by word_tokenize
total_tokens = sum(len(word_tokenize(rv)) for row in result for rv in row['reviews'])
if total_reviews:
    print("Avg tokens / review:", round(total_tokens / total_reviews, 1))
else:
    # no reviews left: avoid dividing by zero
    print("Avg tokens / review:", "n/a")


Total entities for summarization: 2142
Total reference summaries: 11
Total reviews: 17038
Total summaries with #reviews >= 4: 1151
Total reviews for summaries with #reviews >= 4: 15387
Avg tokens / review: 237.9
