In [9]:
import pandas as pd
import numpy as np
import simpledorff
from collections import Counter
import os

# Part 1: Check Agreements in raw coding

In [1]:
# handcoded data
outpath = '../../data/handcoded'
filename1 = f'{outpath}/round1_coding_062223.csv'
filename2 = f'{outpath}/round2_coding_070123.csv'
filename3 = f'{outpath}/round3_coding_071023.csv'

filenames = [filename1, filename2, filename3]

settled_disagreement_file = f'{outpath}/settled_disagreements.txt'

In [2]:
def clean_raw(df):
    # set coder as index

    df.index = [coders[name] for name in df['RecipientLastName']]
    
        # drop columns we don't need
    skip = ['StartDate',
                'EndDate',
                'Status',
                'IPAddress',
                'Progress',
                'Duration (in seconds)',
                'Finished',
                'RecordedDate',
                'ResponseId',
                'RecipientLastName',
                'RecipientFirstName',
                'RecipientEmail',
                'ExternalReference',
                'LocationLatitude',
                'LocationLongitude',
                'DistributionChannel',
                'UserLanguage',
                'coder']

    keep = [col for col in df.columns if col not in skip]

    df = df[keep]
    
    # create sets of conversation_ids -- this will become the index
    on_topic_cols = set()

    for col in df.columns:
        if col not in skip:
            col_codes = col.split('_')

            if len(col_codes)==1:
                on_topic_cols.add(col)
       
    # create dictionary of pivoted data
    pivot_data = dict()

    for i, row in df.iterrows():

        for cid in on_topic_cols:
            pivot_data.setdefault(cid, dict())

            pivot_data[cid][f'{i}_on_topic'] = row[cid]
            pivot_data[cid][f'{i}_topic'] = row[f'{cid}_topic']
            
            try:
                pivot_data[cid][f'{i}_english'] = row[f'{cid}_english']
            except KeyError:
                pass
    
    final = pd.DataFrame.from_dict(pivot_data, orient='index')
    
    return final

In [3]:
def load_data(filename):
    df = pd.read_csv(filename)
    
    # cut extra header rows that qualtrics adds
    df = df.loc[2:4]
    
    df = clean_raw(df)
    
    return df

In [4]:
def clean_df(df):

    # move post id into a column
    df = df.reset_index(names='cid')
    
    keep = df.columns

    # drop duplicates, keeping the last entry
    df = df.drop_duplicates(subset='cid', keep='last')
    df = df.reset_index()
    print(f'{len(df)} total conversations coded')
    
    df = df.drop(np.where(df['Coder1_english']=='No')[0])
    df = df.reset_index()
    
    df = df[keep]
    
    return df

In [5]:
def get_krip(df, drop_ns=False):
    
    sub = df.melt(id_vars='cid', var_name='coder', value_name='code')
    
    if drop_ns:
        # drop not sure values and calc again
        sub = sub[sub['code']!='Not sure']

    # calculate
    krip = simpledorff.calculate_krippendorffs_alpha_for_df(sub,
                                                            experiment_col='cid',
                                                            annotator_col='coder',
                                                            class_col='code')
    
    if drop_ns:
        print(f'   Krippendorffs alpha (dropping not sure): {krip:.2%}')
    else:
        print(f'   Krippendorffs alpha: {krip:.2%}')

In [6]:
def get_agreement(df, drop_ns=False):
    
    agree = 0
    not_sures = 0
    
    # count agreements
    for row in df.itertuples():
        if row[-2] == row[-1]:
            agree += 1
        elif row[-2]=='Not sure' or row[-1]=='Not sure':
            not_sures +=1

    if drop_ns:
        denom = len(df) - not_sures
    else:
        denom = len(df)
    
    if drop_ns:
        print(f'   Percent agreement (dropping not sure): {agree/denom:.2%}')
    else:
        print(f'   Percent agreement: {agree/denom:.2%}')

In [7]:
def get_agreement_types(df):
    codes = list()

    # count agreements
    for row in sub.itertuples():
        code1 = row[-2]
        code2 = row[-1]

        try:
            code = '_'.join(sorted({code1, code2}))
            codes.append(code)
        except TypeError:
            pass

    # report agreements
    counts = Counter(codes)

    print('Agreements:')
    for item in ['Yes', 'No', 'Not sure']:
        count = counts[item]
        print(f'   {item}:\t{count} ({count/len(codes):.2%})')

    print('Disagreements:')
    for item in ['No_Not sure', 'Not sure_Yes', 'No_Yes']:
        count = counts[item]
        print(f'   {item}:\t{count} ({count/len(codes):.2%})')
        

In [10]:
# load initial coding waves
dfs = list()

# load all coding files
for i, filename in enumerate(filenames):
    df = load_data(filename)
    df['round'] = i + 1
    dfs.append(df)
    
# concatinate into single dataframe
df = pd.concat(dfs)

df = clean_df(df)

538 total conversations coded


In [12]:
# get agreements for inital waves
for val in range(4):
    sub = df[df['round']==val][['cid','Coder1_on_topic', 'Coder2_on_topic']]
    
    if len(sub)==0:
        sub=df[['cid','Coder1_on_topic', 'Coder2_on_topic']]
        print('For all data:')
    else:
        print(f'For round {val}:')
        
    get_krip(sub, drop_ns=False)
    get_agreement(sub, drop_ns=False)
    print()
    get_krip(sub, drop_ns=True)
    get_agreement(sub, drop_ns=True)
    print()

For all data:
   Krippendorffs alpha: 69.22%
   Percent agreement: 83.20%

   Krippendorffs alpha (dropping not sure): 78.81%
   Percent agreement (dropping not sure): 90.34%

For round 1:
   Krippendorffs alpha: 82.10%
   Percent agreement: 88.00%

   Krippendorffs alpha (dropping not sure): 100.00%
   Percent agreement (dropping not sure): 97.78%

For round 2:
   Krippendorffs alpha: 85.99%
   Percent agreement: 91.94%

   Krippendorffs alpha (dropping not sure): 100.00%
   Percent agreement (dropping not sure): 99.49%

For round 3:
   Krippendorffs alpha: 51.46%
   Percent agreement: 74.69%

   Krippendorffs alpha (dropping not sure): 56.07%
   Percent agreement (dropping not sure): 80.97%



In [13]:
get_agreement_types(df[['cid','Coder1_on_topic', 'Coder2_on_topic']])

Agreements:
   Yes:	52 (21.22%)
   No:	124 (50.61%)
   Not sure:	7 (2.86%)
Disagreements:
   No_Not sure:	7 (2.86%)
   Not sure_Yes:	12 (4.90%)
   No_Yes:	43 (17.55%)


## Part 1a: For On-Topic only, calculate topic agreement

In [12]:
on_topic = df[(df['Coder2_on_topic']=='Yes') & (df['Coder1_on_topic']=='Yes')][['cid','Coder1_topic', 'Coder2_topic']]

In [13]:
get_krip(on_topic, drop_ns=False)
get_agreement(on_topic, drop_ns=False)
print()
get_krip(on_topic, drop_ns=True)
get_agreement(on_topic, drop_ns=True)

   Krippendorffs alpha: 89.13%
   Percent agreement: 92.59%

   Krippendorffs alpha (dropping not sure): 91.11%
   Percent agreement (dropping not sure): 93.98%


# Part 2: Calculate final codes

In [14]:
final_on_topic = list()
final_topic = list()

for i, row in df.iterrows():
    
    # Final code -- is this on topic?
    if row['Coder2_on_topic'] == row['Coder1_on_topic']:
        final_on_topic.append(row['Coder2_on_topic'])
        
    else:
        final_on_topic.append(np.nan)
        
    # Final code -- what topic is this?
    if row['Coder2_topic'] == row['Coder1_topic']:
        final_topic.append(row['Coder2_topic'])
        
    else:
        final_topic.append(np.nan)
             
df['final_on_topic'] = final_on_topic
df['final_topic'] = final_topic

In [15]:
# outstanding on topic disagreements
dis = df[df['final_on_topic'].isna()]
print(f'{len(dis)} conversations have outstanding dissagreements as to whether they are on topic')

85 conversations have outstanding dissagreements as to whether they are on topic


In [16]:
# outstanding topic disagreements
dis_topic = df[(df['final_on_topic']=='Yes') & (df['final_topic'].isna())]
print(f'{len(dis_topic)} conversations have outstanding dissagreements as _what_ topic they are on')

10 conversations have outstanding dissagreements as _what_ topic they are on


# Part 2: Override settled disagreements

In [19]:
settled = pd.read_csv(settled_disagreement_file, sep='\t')



In [18]:
topic_dict = df[['cid', 'final_on_topic', 'final_topic']].set_index('cid').to_dict()

for i, row in settled.iterrows():
    cid = row['convo_id']
    
    topic_dict['final_on_topic'][cid] = row['final_on_topic']
    topic_dict['final_topic'][cid] = row['final_is_topic']

df['final_on_topic'] = df['cid'].apply(lambda x: topic_dict['final_on_topic'][x])
df['final_topic'] = df['cid'].apply(lambda x: topic_dict['final_topic'][x])

### Check on topic

In [19]:
# outstanding on topic disagreements
dis = df[df['final_on_topic'].isna()]
print(f'{len(dis)} conversations have outstanding dissagreements as to whether they are on topic')

2 conversations have outstanding dissagreements as to whether they are on topic


In [20]:
dis

Unnamed: 0,cid,Coder2_on_topic,Coder2_topic,Coder1_on_topic,Coder1_topic,round,Coder1_english,Coder2_english,final_on_topic,final_topic
323,1479861672483467265,Yes,Russia/Ukraine war,Not sure,Not sure,3,Not sure,No,,
351,1576911629324734470,No,Not sure,Yes,Childcare/parenting,3,Yes,Not sure,,


In [21]:
# overwrite final disagreements based on additional coding
df.loc[310, 'final_on_topic'] = 'Yes'
df.loc[310, 'final_topic'] = 'Russia/Ukraine war'

df.loc[385, 'final_on_topic'] = 'Yes'
df.loc[385, 'final_topic'] = 'Childcare/parenting'

In [22]:
# outstanding on topic disagreements
dis = df[df['final_on_topic'].isna()]
print(f'{len(dis)} conversations have outstanding dissagreements as to whether they are on topic')

2 conversations have outstanding dissagreements as to whether they are on topic


### Check identified topic

In [23]:
# outstanding topic disagreements
dis_topic = df[(df['final_on_topic']=='Yes') & (df['final_topic'].isna())]
print(f'{len(dis_topic)} conversations have outstanding dissagreements as _what_ topic they are on')

4 conversations have outstanding dissagreements as _what_ topic they are on


In [24]:
dis_topic

Unnamed: 0,cid,Coder2_on_topic,Coder2_topic,Coder1_on_topic,Coder1_topic,round,Coder1_english,Coder2_english,final_on_topic,final_topic
269,xvlf6r,Yes,"Russia/Ukraine war,Childcare/parenting",Yes,Russia/Ukraine war,3,Yes,Yes,Yes,
289,1584363437349797889,Yes,"US midterm elections,Childcare/parenting",Yes,Childcare/parenting,3,Yes,Yes,Yes,
299,1575543463218933765,Yes,"US midterm elections,Childcare/parenting",Yes,Childcare/parenting,3,Yes,Yes,Yes,
394,1575559800695685120,Yes,"US midterm elections,Childcare/parenting",Yes,Childcare/parenting,3,Yes,Yes,Yes,


In [25]:
# overwrite final disagreements based on additional coding
df.loc[316, 'final_topic'] = 'Russia/Ukraine war'
df.loc[287, 'final_topic'] = 'Childcare/parenting'
df.loc[309, 'final_topic'] = 'Childcare/parenting'
df.loc[335, 'final_topic'] = 'Childcare/parenting'

In [26]:
# outstanding topic disagreements
dis_topic = df[(df['final_on_topic']=='Yes') & (df['final_topic'].isna())]
print(f'{len(dis_topic)} conversations have outstanding dissagreements as _what_ topic they are on')

4 conversations have outstanding dissagreements as _what_ topic they are on


# Write final codes to file

In [27]:
df = df.rename(columns={'cid':'conversation_id'})
df = df[['conversation_id', 'final_on_topic', 'final_topic']]

df.to_csv(f'{outpath}/handcoded.txt', sep='\t', index=False)

In [28]:
df.head()

Unnamed: 0,conversation_id,final_on_topic,final_topic
0,xstvvm,Yes,Russia/Ukraine war
1,zz99ha,No,Not sure
2,y36718,No,
3,zt6v8o,No,
4,yhwnok,Not sure,
