In [1]:
from openpyxl import Workbook, load_workbook
from docx import Document
from string import punctuation
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt 

import os
import re
import copy
import sys

import warnings
warnings.filterwarnings('ignore')

In [5]:
root_folder = 'input_files_v3/Annotated_Papers'
folder_names = os.listdir(root_folder + '/')

root_save_folder = 'output_files'

In [8]:
# creating a set of all file IDs
id_locations = dict()
# for folder in folder_names:
# grader_folders = os.listdir(root_folder + '/' + folder + '/')
for grader_folder in folder_names:
    file_names = os.listdir(root_folder + '/' + grader_folder + '/')
    for file in file_names:
        name, ext = os.path.splitext(file)
        if ext != '.docx':
            print('/' + grader_folder + '/' + file)
            continue
        file_id = file.split('-')[0]
        if file_id in id_locations:
            id_locations[file_id].append(root_folder + '/' + grader_folder + '/' + file)
        else:
            id_locations[file_id] = [root_folder + '/' + grader_folder + '/' + file]

print(f'Total Files - {len(id_locations)}')

Total Files - 192


In [9]:
name_mapping = {"AC" : "Andrew Chapman",
                "AN" : "Ashlan Stewart",
                "BA" : "Beatriz Acosta-Tsvilin",
                "JM" : "Jasmine McCray",
                "JD" : "Jessica Dobbs",
                "MK" : "Maliha Kabir",
                "MB" : "Marina Banchetti",
                "NC" : "Nikki Chasteen",
                "SN" : "Sarah Nielsen"}

id_mapping = {"AC" : 'C',
               "AN" : 'D',
               "BA" : 'E',
               "JM" : 'F',
               "JD" : 'G',
               "MK" : 'H',
               "MB" : 'I',
               "NC" : 'J',
               "SN" : 'K'}

color_2_num = {
    'YELLOW' : 7,
    'BRIGHT_GREEN' : 4,
    'DARK_YELLOW' : 14,
    'RED' : 6,
    'TURQUOISE' : 3,
    'GRAY' : 16
}

color_2_label = {
    'YELLOW' : "Thesis",
    'BRIGHT_GREEN' : "Organizational Framework",
    'DARK_YELLOW' : "Rhetorical Structure - Focus",
    'RED' : "Evidence",
    'TURQUOISE' : "Reasoning",
    'GRAY' : "Rhetorical Structure - Progression"
}

cust_punct = '!@#$%^&*()-=_+`~[]\{}|;:,./<>?' + '"' + "'" + '“”„‟‘’‚‛' + '\n' + ' ' # + ' '
alpha_num = '[^a-zA-Z0-9]'

In [10]:
new_data = dict()
para_number = 0
for file_id in id_locations.keys():    # ['09516']:
    total_words = []
    all_words = []
    rater_names = []
    for file_location in id_locations[file_id]:
        rater = file_location.split('/')[2].split()[1]
        # print(file_location)
        doc = Document(file_location)
        total_string = []
        prev, curr = 'normal_text', 'normal_text'
        for paragraph in doc.paragraphs:
            runs = paragraph.runs
            n_runs = len(runs)
            temp_string = ""
            for i in range(n_runs):
                if runs[i].font.highlight_color:
                    # is_highlighted = is_highlighted or True   # if there is any run with a highlight, make is_highlighted = True
                    if runs[i].font.highlight_color == color_2_num['YELLOW']:
                        curr = color_2_label['YELLOW']
                    elif runs[i].font.highlight_color == color_2_num['BRIGHT_GREEN']:
                        curr = color_2_label['BRIGHT_GREEN']
                    elif runs[i].font.highlight_color == color_2_num['DARK_YELLOW']:
                        curr = color_2_label['DARK_YELLOW']
                    elif runs[i].font.highlight_color == color_2_num['RED']:
                        curr = color_2_label['RED']
                    elif runs[i].font.highlight_color == color_2_num['TURQUOISE']:
                        curr = color_2_label['TURQUOISE']
                    elif runs[i].font.highlight_color == color_2_num['GRAY']:
                        curr = color_2_label['GRAY']
                else:
                    curr = 'normal_text'
                    
                if curr == prev:
                    temp_string += re.sub(alpha_num, ' ', runs[i].text)
                else:
                    total_string += [[i, prev, para_number] for i in temp_string.strip().split() if i.strip()]
                    temp_string = re.sub(alpha_num, ' ', runs[i].text)
                    
                if i == n_runs - 1:
                    total_string += [[i, curr, para_number] for i in temp_string.strip().split() if i.strip()]

                prev = curr
            para_number += 1
        total_words.append(len(total_string))
        all_words.append(total_string)
        rater_names.append(rater)
    
    new_data[file_id] = all_words

In [6]:
def is_same(l1, l2):
    ret = True
    n1, n2 = len(l1), len(l2)
    if n1 != n2:
        ret = False
    else:
        for i in range(n1):
            if l1[i][0] != l2[i][0]:
                ret = False
                break
    return ret

In [7]:
def make_same(l1, l2):
    temp1, temp2 = copy.deepcopy(l1), copy.deepcopy(l2)
    try:
        i = 0
        while not is_same(l1, l2):
            if l1[i][0] != l2[i][0]:
                if l1[i][0]+l1[i+1][0] == l2[i][0]:
                    l1[i][0] = l1[i][0]+l1[i+1][0]
                    l1[i][1] = l1[i+1][1]
                    l1.pop(i+1)
                elif l1[i][0] == l2[i][0]+l2[i+1][0]:
                    l2[i][0] = l2[i][0]+l2[i+1][0]
                    l2[i][1] = l2[i+1][1]
                    l2.pop(i+1)
            i = i + 1
        success = True
    except:
        l1, l2 = temp1, temp2

In [8]:
def max_frequency(numbers):
    from collections import Counter

    if not numbers:
        return -1
    
    # Handle the case where there is only one element in the list or all elements are the same
    if len(numbers) == 1 or len(set(numbers)) == 1:
        return numbers[0]
    
    # Count the frequency of each number in the list
    frequency = Counter(numbers)
    
    # Get the frequencies in a list
    frequency_values = list(frequency.values())
    
    # Check if all frequencies are the same
    if all(frequency_value == frequency_values[0] for frequency_value in frequency_values):
        return -1
    
    # Find the number with the maximum frequency
    max_frequency = max(frequency_values)
    
    # Get the number corresponding to the maximum frequency
    for number, freq in frequency.items():
        if freq == max_frequency:
            return number

In [9]:
max_frequency([1,1,1,])

1

In [10]:
for _ in range(3):
    all_same = 0
    not_same = 0
    diff_len = 0
    
    for file_id in new_data.keys():
        words_list = new_data[file_id]
        if len(words_list) > 1:
            n = len(words_list)
            for i in range(0, n-1):
                for j in range(i+1, n):
                    words_1 = words_list[i]
                    words_2 = words_list[j]
                    
                    if len(words_1) != len(words_2):
                        diff_len += 1
                        make_same(words_1, words_2)
                    else:
                        z = 0
                        while z < len(words_1) and (words_1[z][0] == words_2[z][0]):
                            z = z + 1
                        if z == len(words_1):
                            all_same += 1
                        else:
                            not_same += 1
                            make_same(words_1, words_2)
    
    print(all_same, not_same, diff_len)

319 16 268
521 8 74
521 8 74


In [17]:
final_data = dict()
for file_id in new_data.keys():
    len_count = []
    words_lists = new_data[file_id]
    for words_list in words_lists:
        len_count.append(len(words_list))

    common_len = max_frequency(len_count)
    common_len_words = []

    for words_list in words_lists:
        if len(words_list) == common_len:
            common_len_words.append(words_list)

    if len(common_len_words) == 1:
        final_data[file_id] = common_len_words[0]
    else:
        n = len(common_len_words)
        words = []
        for i in range(common_len):
            word = common_len_words[0][i][0]
            para_number = common_len_words[0][i][2]
            word_lables = []
            for j in range(n):
                # print(i,j)
                word_lables.append(common_len_words[j][i][1])
                freq_label = max_frequency(word_lables)
            if freq_label == -1:
                freq_label = word_lables[0]
            words.append([word, freq_label, para_number])
        final_data[file_id] = words

In [19]:
i = 0
temp = []
for file_id in final_data.keys():
    asas = set()
    for aa in final_data[file_id]:
        asas.add(aa[1])
    if len(asas) >= 2:
        for aa in final_data[file_id]:
            temp.append([file_id, aa[0] + " ", aa[1], aa[2]])

In [20]:
df = pd.DataFrame(temp, columns=['file_id', 'text', 'label', 'para_number'])

In [29]:
group_key = (df['label'] != df['label'].shift(1)).cumsum()
final_df = df.groupby(['file_id', 'para_number', 'label'], as_index=False)['text'].sum().drop(columns=['para_number'])
 #.groupby(group_key, as_index=False).agg({'file_id': 'first', 'para_number': 'first', 'text': 'sum', 'label': 'first'}).drop(columns=['para_number'])

In [30]:
final_df.head()

Unnamed: 0,file_id,label,text
0,9474,normal_text,Full Name
1,9474,normal_text,Instructor
2,9474,normal_text,ENC1101
3,9474,normal_text,Date
4,9474,normal_text,Overdosing on Stereotypes


In [31]:
final_df.to_excel(root_save_folder + '/' +'consistent_reannotation.xlsx', index=False)