In [1]:
from openpyxl import Workbook, load_workbook
from docx import Document
from string import punctuation
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt 

import os
import shutil
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Rater initials in preference order.
# NOTE(review): the only visible consumer is the commented-out copy loop, which walks the
# reversed list and overwrites same-named files — presumably so the first rater listed
# here wins when several raters annotated the same paper; confirm.
preference_order = [
    'MK', 'JM', 'AN',
    'AC', 'SN', 'JD',
    'MB', 'NC', 'BA',
]

In [3]:
# Folder locations, built with os.path.join so the separator is correct on every OS and
# no backslash escape sequences (such as the invalid '\A') appear in the string literals.
# source_folder is only read by the commented-out copy loop in the visible cells;
# destination_folder is the folder the extraction loop scans for annotated documents.
source_folder = os.path.join('input_files_v3', 'Annotated_Papers')
destination_folder = os.path.join('input_files_v3', 'new_annotations')

In [4]:
# Reversed copy of preference_order (a new list; the original list is left untouched).
preference_order_reverse = list(reversed(preference_order))

In [5]:
# for rater in preference_order_reverse:
#     for folder in os.listdir(source_folder):
#         if rater in folder:
#             rater_folder_path = os.path.join(source_folder, folder)
#             for file in os.listdir(rater_folder_path):
#                 file_path = os.path.join(rater_folder_path, file)
#                 save_path = os.path.join(destination_folder, file)
#                 doc = Document(file_path)
#                 doc.save(save_path)

### Extract Annotations

In [6]:
# Rater initials -> rater full name.
name_mapping = {"AC" : "Andrew Chapman",
                "AN" : "Ashlan Stewart",
                "BA" : "Beatriz Acosta-Tsvilin",
                "JM" : "Jasmine McCray",
                "JD" : "Jessica Dobbs",
                "MK" : "Maliha Kabir",
                "MB" : "Marina Banchetti",
                "NC" : "Nikki Chasteen",
                "SN" : "Sarah Nielsen"}

# Rater initials -> single letter 'C'..'K'.
# NOTE(review): presumably spreadsheet column letters; not used in the visible cells — confirm.
id_mapping = {"AC" : 'C',
               "AN" : 'D',
               "BA" : 'E',
               "JM" : 'F',
               "JD" : 'G',
               "MK" : 'H',
               "MB" : 'I',
               "NC" : 'J',
               "SN" : 'K'}

# Highlight colour name -> integer that the extraction loop compares against
# run.font.highlight_color.
# NOTE(review): presumably python-docx WD_COLOR_INDEX values — confirm.
color_2_num = {
    'YELLOW' : 7,
    'BRIGHT_GREEN' : 4,
    'DARK_YELLOW' : 14,
    'RED' : 6,
    'TURQUOISE' : 3,
    'GRAY' : 16
}

# Highlight colour name -> annotation label stored for words highlighted in that colour.
# Shares its keys with color_2_num.
color_2_label = {
    'YELLOW' : "Thesis",
    'BRIGHT_GREEN' : "Organizational Framework",
    'DARK_YELLOW' : "Rhetorical Structure - Focus",
    'RED' : "Evidence",
    'TURQUOISE' : "Reasoning",
    'GRAY' : "Rhetorical Structure - Progression"
}

# Punctuation / whitespace character set and a "not alphanumeric" regex pattern.
# Both are only referenced from commented-out code in the visible cells.
# The backslash is escaped explicitly ('\\{'): the previous '\{' was an invalid escape
# sequence that only worked by accident and triggers a SyntaxWarning on newer Pythons.
cust_punct = '!@#$%^&*()-=_+`~[]\\{}|;:,./<>?' + '"' + "'" + '“”„‟‘’‚‛' + '\n' + ' ' # + ' '
alpha_num = '[^a-zA-Z0-9]'

In [7]:
# Flatten every annotated .docx in destination_folder into [word, label, paragraph_number]
# triples and store them in new_data, keyed by file name.
new_data = dict()
para_number = 0  # running paragraph counter; never reset, so it keeps increasing across files

# Only real documents are read: Word leaves '~$name.docx' lock files next to open documents
# and the folder may hold other stray files, none of which are valid .docx packages.
# Sorting makes the processing order (and the row order of anything built from new_data)
# reproducible instead of depending on the order os.listdir happens to return.
docx_file_names = sorted(
    name for name in os.listdir(destination_folder)
    if name.lower().endswith('.docx') and not name.startswith('~$')
)

for file_name in docx_file_names:
    file_location = os.path.join(destination_folder, file_name)
    doc = Document(file_location)
    total_string = []
    # Label of the previous / current run. These are NOT reset per paragraph, so the label
    # of the last run of one paragraph is what the first run of the next is compared with.
    prev, curr = 'normal_text', 'normal_text'
    for paragraph in doc.paragraphs:
        runs = paragraph.runs
        last_run_index = len(runs) - 1
        temp_string = ""  # text accumulated for the current same-label stretch of runs
        for run_index, run in enumerate(runs):
            highlight = run.font.highlight_color
            if highlight:
                # Translate the highlight colour into its annotation label.
                for color_name, label in color_2_label.items():
                    if highlight == color_2_num[color_name]:
                        curr = label
                        break
                # NOTE(review): a highlight colour that is not in color_2_num leaves curr
                # unchanged, i.e. the run inherits the previous run's label. Confirm this is
                # intended rather than falling back to 'normal_text'.
            else:
                curr = 'normal_text'

            if curr == prev:
                temp_string += run.text
            else:
                # Label changed: emit the words gathered under the previous label and
                # start a new stretch with this run's text.
                total_string += [[word, prev, para_number] for word in temp_string.split()]
                temp_string = run.text

            if run_index == last_run_index:
                # End of paragraph: emit whatever is still buffered under the current label.
                total_string += [[word, curr, para_number] for word in temp_string.split()]

            prev = curr
        para_number += 1

    new_data[file_name] = total_string

In [9]:
from collections import defaultdict


def convert_dict_to_df(data_dict):
    """Collapse per-word annotations into one row per (label, paragraph) group.

    Parameters
    ----------
    data_dict : dict
        Maps a file name to a list of ``[word, label, paragraph_number]`` triples.

    Returns
    -------
    pandas.DataFrame
        Columns ``file_id``, ``text`` and ``label``. For each file there is one row per
        distinct ``(label, paragraph_number)`` pair, in order of first appearance; ``text``
        is the space-joined words of that pair. Words that share a label and paragraph are
        joined into the same row even when other labels occur between them. The paragraph
        number is used for grouping only and is not part of the output. An empty
        ``data_dict`` yields an empty frame that still has the three columns.
    """
    columns = ["file_id", "text", "label"]
    rows = []

    for filename, word_info_list in data_dict.items():
        # Dicts preserve insertion order, so groups come out in order of first appearance.
        grouped = defaultdict(list)
        for word, label, para_num in word_info_list:
            grouped[(label, para_num)].append(word)

        rows.extend(
            {"file_id": filename, "text": " ".join(words), "label": label}
            for (label, _para_num), words in grouped.items()
        )

    # Passing the columns explicitly keeps the schema stable when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [10]:
# n = 5
# print(convert_dict_to_df(new_data).iloc[n]['filename'])
# print(convert_dict_to_df(new_data).iloc[n]['label'])
# print(len(convert_dict_to_df(new_data).iloc[n]['sentence'].split()))

In [11]:
# Build the table with one row per (file, label, paragraph) group from the extracted words.
annotated_sentences = convert_dict_to_df(new_data)

In [12]:
# Preview the first two rows (file_id still carries the file extension at this point).
annotated_sentences.head(2)

Unnamed: 0,file_id,text,label
0,09474-FinalDraft-h2nl3.docx,Full Name,normal_text
1,09474-FinalDraft-h2nl3.docx,Instructor,normal_text


In [13]:
# Strip only a trailing '.docx' extension. Splitting on the first '.' would truncate any
# file name that contains an extra dot and could collapse different files onto one id.
# The pattern is anchored at the end, so re-running this cell is a no-op.
annotated_sentences['file_id'] = annotated_sentences['file_id'].str.replace(
    r'\.docx$', '', case=False, regex=True
)

In [14]:
# annotated_sentences.drop([annotated_sentences['sentence'] == 'Full Name'].index, inplace=True)
# annotated_sentences.drop([annotated_sentences['sentence'] == 'Instructor'].index, inplace=True)
# annotated_sentences.drop([annotated_sentences['sentence'] == 'ENC1101'].index, inplace=True)
# annotated_sentences.drop([annotated_sentences['sentence'] == 'Date'].index, inplace=True)

In [15]:
# annotated_sentences = annotated_sentences[annotated_sentences['text'] != 'Full Name']
# annotated_sentences = annotated_sentences[annotated_sentences['text'] != 'Instructor']
# annotated_sentences = annotated_sentences[annotated_sentences['text'] != 'ENC1101']
# annotated_sentences = annotated_sentences[annotated_sentences['text'] != 'Date']

In [16]:
# Preview the first rows after the extension has been removed from file_id.
annotated_sentences.head()

Unnamed: 0,file_id,text,label
0,09474-FinalDraft-h2nl3,Full Name,normal_text
1,09474-FinalDraft-h2nl3,Instructor,normal_text
2,09474-FinalDraft-h2nl3,ENC1101,normal_text
3,09474-FinalDraft-h2nl3,Date,normal_text
4,09474-FinalDraft-h2nl3,Overdosing on Stereotypes,normal_text


In [17]:
# Write the sentence table to Excel. The output folder is created first so the cell also
# works on a fresh checkout, and the path is joined portably instead of using '//'.
output_path = os.path.join('output_files', 'new_annotated_sentences.xlsx')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
annotated_sentences.to_excel(output_path, index=False)

In [18]:
# Number of rows labelled 'Thesis'. The vectorised Series.sum() replaces the Python-level
# builtin sum(), which iterates the boolean Series element by element; int() keeps the
# displayed value a plain integer.
int((annotated_sentences['label'] == 'Thesis').sum())

189

In [19]:
# Rows whose text is at most one character long (stray punctuation or single letters).
# .str.len() is vectorised and, unlike len() inside .apply, does not raise on missing values.
annotated_sentences[annotated_sentences['text'].str.len() <= 1]

Unnamed: 0,file_id,text,label
185,09481-FinalDraft-tzjwq,.,normal_text
407,09491-FinalDraft-3jtiv,T,normal_text
582,09498-FinalDraft-a7p4b,.,normal_text
739,09508-FinalDraft-wuqvz,.,normal_text
879,09515-FinalDraft-8ruph,.,normal_text
1024,09522-FinalDraft-qujp1,T,normal_text
1043,09523-FinalDraft-pxbk7,.,normal_text
1120,09528-FinalDraft-yp64d,.,normal_text
1149,09529-FinalDraft-axcia,T,normal_text
1220,09532-FinalDraft-ldvor,.,normal_text
