In [1]:
from openpyxl import Workbook, load_workbook
from docx import Document
from string import punctuation
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt 

import os
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Numeric highlight-colour codes; these are compared against
# `run.font.highlight_color` when the annotated documents are parsed.
color_2_num = {
    'YELLOW' : 7,
    'BRIGHT_GREEN' : 4,
    'DARK_YELLOW' : 14,
    'RED' : 6,
    'TURQUOISE' : 3,
    'GRAY' : 16
}

# Annotation label that each highlight colour stands for (same keys as color_2_num).
color_2_label = {
    'YELLOW' : "Thesis",
    'BRIGHT_GREEN' : "Organizational Framework",
    'DARK_YELLOW' : "Rhetorical Structure - Focus",
    'RED' : "Evidence",
    'TURQUOISE' : "Reasoning",
    'GRAY' : "Rhetorical Structure - Progression"
}

# Rater initials (as they appear in the grader folder names) -> full name.
name_mapping = {"AC" : "Andrew Chapman",
                "AN" : "Ashlan Stewart",
                "BA" : "Beatriz Acosta-Tsvilin",
                "JM" : "Jasmine McCray",
                "JD" : "Jessica Dobbs",
                "MK" : "Maliha Kabir",
                "MB" : "Marina Banchetti",
                "NC" : "Nikki Chasteen",
                "SN" : "Sarah Nielsen"}

# Rater initials -> single letter.
# NOTE(review): presumably spreadsheet column letters for each rater - confirm.
id_mapping = {"AC" : 'C',
               "AN" : 'D',
               "BA" : 'E',
               "JM" : 'F',
               "JD" : 'G',
               "MK" : 'H',
               "MB" : 'I',
               "NC" : 'J',
               "SN" : 'K'}

# Punctuation / whitespace characters (ASCII punctuation, straight and curly
# quotes, newline, space). The backslash is written as '\\' because '\{' is an
# invalid escape sequence (SyntaxWarning on Python 3.12+); the resulting string
# is unchanged.
cust_punct = '!@#$%^&*()-=_+`~[]\\{}|;:,./<>?' + '"' + "'" + '“”„‟‘’‚‛' + '\n' + ' '
# Regex character class matching any single non-alphanumeric ASCII character.
alpha_num = '[^a-zA-Z0-9]'

In [3]:
# Locations: annotated papers are read from `root_folder`; `root_save_folder`
# names the output directory.
root_folder = 'input_files_v3'
root_save_folder = 'output_files'

# Top-level entries of the input directory.
folder_names = os.listdir(f'{root_folder}/')

In [4]:
# Index every .docx by its file ID (the part of the file name before the first
# '-'). Each ID maps to the list of paths where a copy of that paper was found,
# one per grader folder that contains it.
id_locations = dict()
for folder in folder_names:
    folder_path = f'{root_folder}/{folder}'
    grader_folders = os.listdir(f'{folder_path}/')
    for grader_folder in grader_folders:
        grader_path = f'{folder_path}/{grader_folder}'
        file_names = os.listdir(f'{grader_path}/')
        for file in file_names:
            name, ext = os.path.splitext(file)
            if ext != '.docx':
                # Report anything that is not a Word document, then skip it.
                print(f'/{grader_folder}/{file}')
                continue
            file_id = file.split('-')[0]
            id_locations.setdefault(file_id, []).append(f'{grader_path}/{file}')

print(f'Total Files - {len(id_locations)}')

Total Files - 192


In [5]:
# Walk every copy of every paper and emit one record per whitespace-separated
# word: [file_id, rater, para_number, word + " ", label].
#
# Consecutive runs that carry the same label are concatenated before being split
# into words, so a word broken across several runs of one label stays whole.
# `para_number` is a running counter over all paragraphs of all documents (it is
# never reset), so it uniquely identifies a paragraph across the whole corpus.
total_string = []
para_number = 0

# (highlight colour code, label) pairs derived from the two colour tables.
highlight_labels = [(color_2_num[color], color_2_label[color]) for color in color_2_num]

for file_id in id_locations.keys():
    for file_location in id_locations[file_id]:
        # The third path component is the grader folder (e.g. "Annotated AC 7");
        # its second whitespace-separated token is the rater's initials.
        rater = file_location.split('/')[2].split()[1]
        doc = Document(file_location)
        # The label state is reset once per document and carried across paragraphs.
        prev, curr = 'normal_text', 'normal_text'
        for paragraph in doc.paragraphs:
            runs = paragraph.runs
            n_runs = len(runs)
            temp_string = ""
            for i, run in enumerate(runs):
                # Default to un-annotated text. A highlight colour that is not part
                # of the annotation scheme is treated as un-annotated as well,
                # instead of silently inheriting the previous run's label.
                curr = 'normal_text'
                highlight = run.font.highlight_color
                if highlight:
                    for color_value, label in highlight_labels:
                        if highlight == color_value:
                            curr = label
                            break

                if curr == prev:
                    # Same label as the previous run: keep accumulating text.
                    temp_string += run.text
                else:
                    # Label changed: flush the accumulated text under the old label.
                    total_string += [[file_id, rater, para_number, word + " ", prev]
                                     for word in temp_string.split()]
                    temp_string = run.text

                if i == n_runs - 1:
                    # Last run of the paragraph: flush whatever is pending.
                    total_string += [[file_id, rater, para_number, word + " ", curr]
                                     for word in temp_string.split()]

                prev = curr
            para_number += 1

In [6]:
# doc_loc = "input_files_v3/Annotated_Papers/Annotated AC 1/09474-FinalDraft-h2nl3.docx"
# doc1 = Document(doc_loc)

In [7]:
# total_string = []
# prev, curr = 'normal_text', 'normal_text'
# file_id = '09474'
# for paragraph in doc1.paragraphs:
#             runs = paragraph.runs
#             n_runs = len(runs)
#             temp_string = ""
#             for i in range(n_runs):
#                 if runs[i].font.highlight_color:
#                     # is_highlighted = is_highlighted or True   # if there is any run with a highlight, make is_highlighted = True
#                     if runs[i].font.highlight_color == color_2_num['YELLOW']:
#                         curr = color_2_label['YELLOW']
#                     elif runs[i].font.highlight_color == color_2_num['BRIGHT_GREEN']:
#                         curr = color_2_label['BRIGHT_GREEN']
#                     elif runs[i].font.highlight_color == color_2_num['DARK_YELLOW']:
#                         curr = color_2_label['DARK_YELLOW']
#                     elif runs[i].font.highlight_color == color_2_num['RED']:
#                         curr = color_2_label['RED']
#                     elif runs[i].font.highlight_color == color_2_num['TURQUOISE']:
#                         curr = color_2_label['TURQUOISE']
#                     elif runs[i].font.highlight_color == color_2_num['GRAY']:
#                         curr = color_2_label['GRAY']
#                 else:
#                     curr = 'normal_text'
                    
#                 if curr == prev:
#                     temp_string += runs[i].text #re.sub(alpha_num, ' ', runs[i].text)
#                 else:
#                     total_string += [[file_id, i + " ", prev] for i in temp_string.strip().split() if i.strip()]
#                     temp_string = runs[i].text #re.sub(alpha_num, ' ', runs[i].text)
                    
#                 if i == n_runs - 1:
#                     total_string += [[file_id, i + " ", curr] for i in temp_string.strip().split() if i.strip()]

#                 prev = curr
#         # total_words.append(len(total_string))
#         # all_words.append(total_string)
#         # rater_names.append(rater)

In [8]:
# One row per extracted word, with the paper, rater, paragraph counter and label.
doc_df = pd.DataFrame(
    data=total_string,
    columns=['file_id', 'rater', 'para_number', 'text', 'label'],
)

In [9]:
# doc_df.groupby(['file_id', 'label']).sum()
# df = pd.DataFrame({'a': [1, 1, -1, 1, -1, -1]})
# print(df)
# print()
# print(df['a'].ne(df['a'].shift()).cumsum())

In [10]:
def create_group_ids(labels):
    """Assign a run id to every element of ``labels``.

    Consecutive equal labels share an id; the id increases by one each time the
    label differs from the one immediately before it. Ids start at 0.

    Parameters
    ----------
    labels : iterable
        Sequence of labels (list, tuple, pandas Series, generator, ...). The
        elements are read positionally, so a pandas Series with a non-default
        index is handled correctly.

    Returns
    -------
    list of int
        One id per input element; an empty list for empty input.
    """
    values = list(labels)  # positional access, independent of any pandas index
    group_ids = []
    current_id = 0
    for position, value in enumerate(values):
        # A new group starts whenever the label is not equal to its predecessor.
        if position > 0 and not (value == values[position - 1]):
            current_id += 1
        group_ids.append(current_id)
    return group_ids

In [11]:
# Tag each word with the id of the run of consecutive identical labels it belongs to.
doc_df['group_id'] = create_group_ids(doc_df['label'])

# Concatenate the words of every (paper, rater, paragraph, label run) back into
# one string, then drop the helper columns that were only needed for grouping.
aggregated_df = (
    doc_df
    .groupby(['file_id', 'rater', 'para_number', 'group_id', 'label'], as_index=False)['text']
    .sum()
    .drop(columns=['para_number', 'group_id'])
)

# Split every span on '.' into one row per fragment.
temp = aggregated_df.assign(text=aggregated_df['text'].str.split('.')).explode('text', ignore_index=True)

# Discard fragments that are a single space. The result is assigned back
# explicitly: calling replace(..., inplace=True) on the column selection
# `temp['text']` does not update `temp` under pandas Copy-on-Write.
temp['text'] = temp['text'].replace(' ', np.nan)
temp = temp.dropna(subset=['text'])

temp.to_excel("output.xlsx", index=False)

In [26]:
# Reload the fragment table from the workbook written by the export step.
# NOTE(review): the cells that follow compare `file_id` against integers
# (e.g. 9487), whereas the keys of `id_locations` are zero-padded strings such
# as '09949' - the ids appear to come back from Excel as numbers; confirm.
temp = pd.read_excel("output.xlsx")
temp.head()

Unnamed: 0,file_id,rater,label,text
0,9474,AC,normal_text,Full Name
1,9474,AC,normal_text,Instructor
2,9474,AC,normal_text,ENC1101
3,9474,AC,normal_text,Date
4,9474,AC,normal_text,Overdosing on Stereotypes


In [19]:
# Number of rows whose text is missing.
temp['text'].isnull().sum()

1309

In [20]:
# Keep only the rows that actually have text.
temp = temp.dropna(subset=['text'])

In [21]:
# Size of one rater's table for one paper (paper 9487, rater AC).
is_paper = temp['file_id'] == 9487
is_rater = temp['rater'] == 'AC'
temp.loc[is_paper & is_rater].shape

(126, 4)

In [29]:
# List every path recorded for paper '09949' (keys of id_locations are the
# zero-padded ID strings taken from the file names).
id_locations['09949']

['input_files_v3/Annotated_Papers/Annotated AC 7/09949-FinalDraft-7n797.docx',
 'input_files_v3/Annotated_Papers/Annotated AN 7/09949-FinalDraft-7n797.docx',
 'input_files_v3/Annotated_Papers/Annotated MB 7/09949-FinalDraft-7n797.docx']

In [32]:
# Print, for each selected paper, the shape of every rater's fragment table.
for file_id in ['09949']:  # replace with id_locations.keys() to cover every paper
    int_file_id = int(file_id)  # `temp` is filtered with the numeric form of the ID
    print(int_file_id)
    same_paper = temp['file_id'] == int_file_id
    for file_location in id_locations[file_id]:
        grader_folder_name = file_location.split('/')[2]
        rater = grader_folder_name.split()[1]
        rater_rows = temp.loc[same_paper & (temp['rater'] == rater)]
        print(f"  {rater} - ", rater_rows.shape)

9949
  AC -  (194, 4)
  AN -  (173, 4)
  MB -  (188, 4)


In [30]:
# Pull out each rater's rows for one sample paper so they can be compared.
sample_id = 9949
sample_rows = temp.loc[temp['file_id'] == sample_id]
ac = sample_rows.loc[sample_rows['rater'] == 'AC']
an = sample_rows.loc[sample_rows['rater'] == 'AN']
# jd = sample_rows.loc[sample_rows['rater'] == 'JD']
mb = sample_rows.loc[sample_rows['rater'] == 'MB']

In [48]:
# Find the first row position at which raters MB and AC have different text
# (ignoring surrounding whitespace), then print that row and its neighbours.
# The scan is bounded by the shorter table so it cannot run past the end when
# every compared row matches.
n_common = min(len(mb), len(ac))
i = 0
while i < n_common and mb.iloc[i].text.strip() == ac.iloc[i].text.strip():
    i += 1

for j in [-1, 0, 1]:
    # Skip neighbours that fall outside the tables; a negative position would
    # otherwise wrap around to the last row.
    if not 0 <= i + j < n_common:
        continue
    print(i+j)
    print(mb.iloc[i+j].text)
    print(ac.iloc[i+j].text)
    print()

29
was "definitely and irreconcilably opposed to women's suffrage (Cobbs)
was "definitely and irreconcilably opposed to women's suffrage (Cobbs)

30
" This wasn't an 
" 

31
uncommon belief during the early 20th century, as women were basically denied any 
This wasn't an 



In [42]:
# Ad-hoc look at a position four rows before index `i`.
# NOTE(review): the result depends on whatever value `i` held when this cell
# was run; re-run after the mismatch scan to get a consistent value.
i-4

10