# Cohesion Statistics 

## Libraries

In [14]:
import pandas as pd
from tabulate import tabulate

## Reading file
The `file_path` variable in the cell below is the only thing you need to change in this notebook. Make sure to export your annotations as JSON from Label Studio.

In [15]:
# Location of the JSON export to analyse.
file_path = 'project_vanja.json'
# Parse the JSON file into a pandas DataFrame.
anno_df = pd.read_json(path_or_buf=file_path)

## Calculations

In [3]:
# Counter table mirroring the annotation scheme: each cohesion type maps to
# its subtypes, and every subtype count starts at zero.
label_count_dic = {
    'Reference': dict.fromkeys(
        ('personal', 'demonstrative', 'comparative'), 0),
    'Conjunction': dict.fromkeys(
        ('implicit', 'explicit'), 0),
    'Substitution and Ellipsis': dict.fromkeys(
        ('nominal', 'verbal', 'clausal'), 0),
    'Lexical Organization': dict.fromkeys(
        ('repetition', 'synonymy', 'hyponymy', 'meronymy', 'collocation'), 0),
}

In [4]:
# counting
# Tallies every taxonomy label found in the first entry of the first row's
# 'annotations' list.
# NOTE(review): only anno_df['annotations'][0][0] is read; confirm the export
# holds a single task with a single annotation.
# The counts accumulate into label_count_dic, so re-run the cell that creates
# the dictionary before re-running this one.
for anno in anno_df['annotations'][0][0]['result']:
    # Result items without a taxonomy value (e.g. relations between regions)
    # carry no label path; skip them instead of failing with a KeyError.
    for label in anno.get('value', {}).get('taxonomy', []):
        # label is a path: label[0] is the cohesion type, label[1] the subtype.
        # An unknown type or subtype still raises KeyError on purpose.
        label_count_dic[label[0]][label[1]] += 1

In [5]:
# calculates the total number of each cohesion type
for cohesion_type, subtype_counts in label_count_dic.items():
    # Leave out a previously stored 'total count' so that re-running this cell
    # does not add the old total on top of the subtype counts.
    subtype_counts['total count'] = sum(
        count
        for subtype, count in subtype_counts.items()
        if subtype != 'total count'
    )

In [6]:
# Grand total of cohesive ties: add up the 'total count' entry of every cohesion type.
total = sum(counts['total count'] for counts in label_count_dic.values())

In [7]:
# creates a list of lists containing the cohesion types, their total number, and their percentage
cohesion_types_lst = []
for cohesion_type, subtype_counts in label_count_dic.items():
    type_total = subtype_counts['total count']
    # With no annotated ties, total is 0; report 0 % instead of raising
    # ZeroDivisionError (the subtype tables below guard the same case).
    percent = type_total / total * 100 if total else 0
    cohesion_types_lst.append([cohesion_type, type_total, percent])
cohesion_types_lst.append(['Total', total, 100 if total else 0])

# creates a table of the cohesion types based on the previous list
table = tabulate(
    cohesion_types_lst,
    headers=["Cohesion Type", "N", "%"]
)

# prints the table (print keeps the line breaks; a bare string would show its repr)
print(table)

Cohesion Type                N    %
-------------------------  ---  ---
Reference                    3   50
Conjunction                  3   50
Substitution and Ellipsis    0    0
Lexical Organization         0    0
Total                        6  100


The table below displays each Reference type as a percentage of all cohesive ties.

In [8]:
cohesion_type = 'Reference'
reference_counts = label_count_dic[cohesion_type]
reference_total = reference_counts['total count']

# defining multiplication factor: the share of Reference ties among all
# cohesive ties, in percent. Guarded so that an export without any ties
# yields 0 instead of raising ZeroDivisionError.
factor = reference_total / total * 100 if total else 0

# creates a list of lists containing the reference types, their total number,
# and their percentage of all cohesive ties
types_lst = []
for subtype, N in reference_counts.items():
    # N / reference_total is the share within Reference; scaling by factor
    # turns it into a share of all cohesive ties.
    percent = N / reference_total * factor if reference_total else 0
    types_lst.append([subtype, N, percent])

# creates a table of the reference types based on the previous list
table = tabulate(
    types_lst,
    headers=["Reference Type", "N", "%"]
)

# prints the table
print(table)

Reference Type      N    %
----------------  ---  ---
personal            3   50
demonstrative       0    0
comparative         0    0
total count         3   50


In [9]:
def table_types(cohesion_type, dic):
    '''
    prints a table of the subtypes of one cohesion type.

    cohesion_type: string naming the cohesion type; must be a key of dic.
    dic: dictionary of the cohesion counts, where dic[cohesion_type] maps each
         subtype (and 'total count') to its number of occurrences.

    every row holds the subtype, its count N, and N as a percentage of the
    'total count' of that cohesion type (0 when that total is 0).
    the table is printed; nothing is returned.
    raises KeyError if cohesion_type is not a key of dic.
    '''
    # read from the dictionary that was passed in, not from a notebook global,
    # so the function works with any counts dictionary
    counts = dic[cohesion_type]

    # creates a list of lists containing the types, their total number, and their percentage
    types_lst = []
    for subtype, N in counts.items():
        try:
            percent = N / counts['total count'] * 100
        except ZeroDivisionError:
            # no ties of this cohesion type at all
            percent = 0
        types_lst.append([subtype, N, percent])

    # creates a table of the types based on the previous list
    table = tabulate(
        types_lst,
        headers=[cohesion_type + " Type", "N", "%"]
    )

    # prints the table
    print(table)

In [10]:
# Subtype breakdown for the 'Conjunction' cohesion type.
table_types(cohesion_type='Conjunction', dic=label_count_dic)

Conjunction Type      N         %
------------------  ---  --------
implicit              1   33.3333
explicit              2   66.6667
total count           3  100


In [11]:
# Subtype breakdown for the 'Reference' cohesion type.
table_types(cohesion_type='Reference', dic=label_count_dic)

Reference Type      N    %
----------------  ---  ---
personal            3  100
demonstrative       0    0
comparative         0    0
total count         3  100


In [12]:
# Subtype breakdown for the 'Substitution and Ellipsis' cohesion type.
table_types(cohesion_type='Substitution and Ellipsis', dic=label_count_dic)

Substitution and Ellipsis Type      N    %
--------------------------------  ---  ---
nominal                             0    0
verbal                              0    0
clausal                             0    0
total count                         0    0


In [13]:
# Subtype breakdown for the 'Lexical Organization' cohesion type.
table_types(cohesion_type='Lexical Organization', dic=label_count_dic)

Lexical Organization Type      N    %
---------------------------  ---  ---
repetition                     0    0
synonymy                       0    0
hyponymy                       0    0
meronymy                       0    0
collocation                    0    0
total count                    0    0
