# Analysis of LLM-assisted human validation

In [2]:
import pandas as pd
import numpy as np
import simpledorff
from scipy.stats import ttest_ind

### Read in annotation data

Half of the full validation set has been assisted and the other half has not been assisted. All 3 coders coded the unasisted set (df4), while coder 1 coded LLM-assited by LlaMA 7B, coder 2 coded LLM-assisted set by LlaMA 13B, and coder 3 coded LLM-assisted by both LlaMA 7B and Llama 13. Importantly these were all the same set of annotations.

In [3]:
df1 = pd.read_csv('../LLM_7B_assisted.csv')

In [496]:
df2 = pd.read_csv('../LLM_13B_assisted.csv')

In [497]:
df3 = pd.read_csv('../LLM_mixed_assisted.csv')

In [311]:
df4 = pd.read_csv('../LLM_not_assisted.csv')

### Calculate within annotator differences between the assisted versus the unassisted validation set 

### 1. Annotation speed

#### Annotator A



##### For LLM-assisted 7B

In [59]:
#get avarege time to complete tasks overall

# Assuming your DataFrame is named df1
mean_time = df1.loc[df1['variable'].isin(['topic_match', 'event_match']), 'seconds'].mean()
print("Mean time for completing topic-level and event-level matches:", mean_time)

Mean time for completing topic-level and event-level matches: 12.855041414141414


In [19]:
#get avarege time to complete tasks for topic-level match

# Assuming your DataFrame is named df1
topic_mean_time = df1.loc[df1['variable'] == 'topic_match', 'seconds'].mean()

print("Average time to complete topic-level matches:", topic_mean_time)


Average time to complete topic-level matches: 23.939715151515152


In [20]:
#get avarege time to complete tasks event-level match
# Assuming your DataFrame is named df1
event_mean_time = df1.loc[df1['variable'] == 'event_match', 'seconds'].mean()

print("Average time to complete event-level matches:", event_mean_time)


Average time to complete event-level matches: 1.770367676767677


In [None]:
#get avarege time to complete tasks event-level match
# Assuming your DataFrame is named df1
event_mean_time = df1.loc[df1['variable'] == 'event_match', 'seconds'].mean()

print("Average time to complete event-level matches:", event_mean_time)


In [64]:
#get avarege time to complete confidence overall

mean_time = df1.loc[df1['variable'].isin(['confidence_1', 'confidence_2']), 'seconds'].mean()
print("Mean time for completing confidence rating for topic-level and event-level matches:", mean_time)

Mean time for completing confidence rating for topic-level and event-level matches: 1.339130303030303


##### For not assisted 

In [65]:
#get only the codings of annotator 1

# Assuming your DataFrame is named df4
df1_1 = df4[df4['coder_id'] == 2608]

In [66]:
# Assuming your DataFrame is named df1
mean_time = df1_1.loc[df1_1['variable'].isin(['topic_match', 'event_match']), 'seconds'].mean()
print("Mean time for completing topic-level and event-level matches unassisted:", mean_time)

Mean time for completing topic-level and event-level matches unassisted: 6.448849494949494


In [28]:
#get avarege time to complete tasks for topic-level match

# Assuming your DataFrame is named df1
topic_mean_time = df1_1.loc[df1_1['variable'] == 'topic_match', 'seconds'].mean()

print("Average time to complete topic-level matches unassisted:", topic_mean_time)


Average time to complete topic-level matches unassited: 11.173923232323231


In [29]:
#get avarege time to complete tasks event-level match
# Assuming your DataFrame is named df1
event_mean_time = df1_1.loc[df1_1['variable'] == 'event_match', 'seconds'].mean()

print("Average time to complete event-level matches unassisted:", event_mean_time)


Average time to complete event-level matches: 1.7237757575757575


In [68]:
#get avarege time to complete confidence overall

mean_time = df1_1.loc[df1_1['variable'].isin(['confidence_1', 'confidence_2']), 'seconds'].mean()
print("Mean time for completing confidence rating for topic-level and event-level matches:", mean_time)

Mean time for completing confidence rating for topic-level and event-level matches: 1.5567313131313132


#### Annotator B

#### For LLM-assisted 13B

In [69]:
#get avarege time to complete tasks overall

# Assuming your DataFrame is named df1
mean_time = df2.loc[df2['variable'].isin(['topic_match', 'event_match']), 'seconds'].mean()
print("Mean time for completing topic-level and event-level matches:", mean_time)

Mean time for completing topic-level and event-level matches: 17.430112121212122


In [70]:
#get avarege time to complete tasks for topic-level match

# Assuming your DataFrame is named df1
topic_mean_time = df2.loc[df2['variable'] == 'topic_match', 'seconds'].mean()

print("Average time to complete topic-level matches:", topic_mean_time)


Average time to complete topic-level matches: 20.77467676767677


In [71]:
#get avarege time to complete tasks for event-level match

# Assuming your DataFrame is named df1
event_mean_time = df2.loc[df2['variable'] == 'event_match', 'seconds'].mean()

print("Average time to complete event-level matches:", event_mean_time)


Average time to complete event-level matches: 14.085547474747475


In [72]:
#get avarege time to complete confidence overall

mean_time = df2.loc[df2['variable'].isin(['confidence_1', 'confidence_2']), 'seconds'].mean()
print("Mean time for completing confidence rating for topic-level and event-level matches:", mean_time)

Mean time for completing confidence rating for topic-level and event-level matches: 1.8730373737373738


#### For not assisted 

In [73]:
#get only the codings of annotator 2

# Assuming your DataFrame is named df4
df2_1 = df4[df4['coder_id'] == 2606]

In [74]:
# Assuming your DataFrame is named df1
mean_time = df2_1.loc[df2_1['variable'].isin(['topic_match', 'event_match']), 'seconds'].mean()
print("Mean time for completing topic-level and event-level matches unassisted:", mean_time)

Mean time for completing topic-level and event-level matches unassisted: 14.81311111111111


In [75]:
#get avarege time to complete tasks for topic-level match

# Assuming your DataFrame is named df1
topic_mean_time = df2_1.loc[df2_1['variable'] == 'topic_match', 'seconds'].mean()

print("Average time to complete topic-level matches unassisted:", topic_mean_time)

Average time to complete topic-level matches unassisted: 20.280268686868688


In [44]:
#get avarege time to complete tasks event-level match
# Assuming your DataFrame is named df1
event_mean_time = df2_1.loc[df2_1['variable'] == 'event_match', 'seconds'].mean()

print("Average time to complete event-level matches unassisted:", event_mean_time)

Average time to complete event-level matches unassisted: 9.345953535353535


In [76]:
#get avarege time to complete confidence overall

mean_time = df2_1.loc[df2_1['variable'].isin(['confidence_1', 'confidence_2']), 'seconds'].mean()
print("Mean time for completing confidence rating for topic-level and event-level matches:", mean_time)

Mean time for completing confidence rating for topic-level and event-level matches: 2.872646464646465


#### Annotator C

#### For LLM_mixed partially assisted

In [77]:
#get avarege time to complete tasks overall

# Assuming your DataFrame is named df1
mean_time = df3.loc[df3['variable'].isin(['topic_match', 'event_match']), 'seconds'].mean()
print("Mean time for completing topic-level and event-level matches:", mean_time)

Mean time for completing topic-level and event-level matches: 5.112763636363637


In [78]:
#get avarege time to complete tasks for topic-level match

# Assuming your DataFrame is named df1
topic_mean_time = df3.loc[df3['variable'] == 'topic_match', 'seconds'].mean()

print("Average time to complete topic-level matches:", topic_mean_time)


Average time to complete topic-level matches: 8.849515151515153


In [79]:
#get avarege time to complete tasks for event-level match

# Assuming your DataFrame is named df1
event_mean_time = df3.loc[df3['variable'] == 'event_match', 'seconds'].mean()

print("Average time to complete event-level matches:", event_mean_time)


Average time to complete event-level matches: 1.3760121212121212


In [80]:
#get avarege time to complete confidence overall

mean_time = df3.loc[df3['variable'].isin(['confidence_1', 'confidence_2']), 'seconds'].mean()
print("Mean time for completing confidence rating for topic-level and event-level matches:", mean_time)

Mean time for completing confidence rating for topic-level and event-level matches: 1.2556757575757578


In [82]:
df4['coder_id'].unique()

array([2608, 2607, 2606])

#### For not assisted 

In [83]:
#get only the codings of annotator 2

# Assuming your DataFrame is named df4
df3_1 = df4[df4['coder_id'] == 2607]

In [84]:
# Assuming your DataFrame is named df1
mean_time = df3_1.loc[df3_1['variable'].isin(['topic_match', 'event_match']), 'seconds'].mean()
print("Mean time for completing topic-level and event-level matches unassisted:", mean_time)

Mean time for completing topic-level and event-level matches unassisted: 9.004171717171719


In [85]:
#get avarege time to complete tasks for topic-level match

# Assuming your DataFrame is named df1
topic_mean_time = df3_1.loc[df3_1['variable'] == 'topic_match', 'seconds'].mean()

print("Average time to complete topic-level matches unassisted:", topic_mean_time)

Average time to complete topic-level matches unassisted: 14.414521212121212


In [86]:
#get avarege time to complete tasks event-level match
# Assuming your DataFrame is named df1
event_mean_time = df3_1.loc[df3_1['variable'] == 'event_match', 'seconds'].mean()

print("Average time to complete event-level matches unassisted:", event_mean_time)

Average time to complete event-level matches unassisted: 3.5938222222222223


In [87]:
#get avarege time to complete confidence overall

mean_time = df3_1.loc[df3_1['variable'].isin(['confidence_1', 'confidence_2']), 'seconds'].mean()
print("Mean time for completing confidence rating for topic-level and event-level matches:", mean_time)

Mean time for completing confidence rating for topic-level and event-level matches: 1.5022444444444445


#### Within Annotators comparison (annotation speed)

In [101]:
#function to calculate independent samples t-test within annotators

def perform_independent_t_test(df1, df2, test_type='overall'):
    # Exclude rows where 'variable' is 'confidence'
    filtered_df1 = df1[~((df1['variable'] == 'confidence_1') | (df1['variable'] == 'confidence_2'))]
    filtered_df2 = df2[~((df2['variable'] == 'confidence_1') | (df2['variable'] == 'confidence_2'))]

    if test_type == 'overall':
        variable_name = 'Overall'
    elif test_type == 'topic_match':
        filtered_df1 = filtered_df1[filtered_df1['variable'] == 'topic_match']
        filtered_df2 = filtered_df2[filtered_df2['variable'] == 'topic_match']
        variable_name = 'Topic Match'
    elif test_type == 'event_match':
        filtered_df1 = filtered_df1[filtered_df1['variable'] == 'event_match']
        filtered_df2 = filtered_df2[filtered_df2['variable'] == 'event_match']
        variable_name = 'Event Match'
    else:
        raise ValueError("Invalid test_type. Use 'overall', 'topic_match', or 'event_match'.")
    
    # Extract 'seconds' column
    seconds_df1 = filtered_df1['seconds']
    seconds_df2 = filtered_df2['seconds']

    # Perform t-test
    t_stat, p_value = ttest_ind(seconds_df1, seconds_df2)

    # Calculate means and standard deviations
    mean_df1 = seconds_df1.mean()
    mean_df2 = seconds_df2.mean()
    std_df1 = seconds_df1.std()
    std_df2 = seconds_df2.std()

    # Print the statistics
    print(f"Mean of df1 '{variable_name}' seconds: {mean_df1}")
    print(f"Standard deviation of df1 '{variable_name}' seconds: {std_df1}")
    print()
    print(f"Mean of df2 '{variable_name}' seconds: {mean_df2}")
    print(f"Standard deviation of df2 '{variable_name}' seconds: {std_df2}")
    print()
    print(f"T-statistic for '{variable_name}': {t_stat}")
    print(f"P-value for '{variable_name}': {p_value}")

    # Check p-value
    if p_value < 0.05:
        print(f"The difference in '{variable_name}' seconds is statistically significant.")
    else:
        print(f"There is no statistically significant difference in '{variable_name}' seconds.")


#### Annotator A

In [97]:
perform_independent_t_test(df1, df1_1, test_type='overall')
perform_independent_t_test(df1, df1_1, test_type='topic_match')
perform_independent_t_test(df1, df1_1, test_type='event_match')

Mean of df1 'Overall' seconds: 12.855041414141414
Standard deviation of df1 'Overall' seconds: 130.2492764574991

Mean of df2 'Overall' seconds: 6.448849494949494
Standard deviation of df2 'Overall' seconds: 19.538858852926786

T-statistic for 'Overall': 1.5304172318196783
P-value for 'Overall': 0.1260733890837042
There is no statistically significant difference in 'Overall' seconds.
Mean of df1 'Topic Match' seconds: 23.939715151515152
Standard deviation of df1 'Topic Match' seconds: 183.58216538084721

Mean of df2 'Topic Match' seconds: 11.173923232323231
Standard deviation of df2 'Topic Match' seconds: 26.673838939139973

T-statistic for 'Topic Match': 1.5310288770978204
P-value for 'Topic Match': 0.12608236321371927
There is no statistically significant difference in 'Topic Match' seconds.
Mean of df1 'Event Match' seconds: 1.770367676767677
Standard deviation of df1 'Event Match' seconds: 3.9295046295647627

Mean of df2 'Event Match' seconds: 1.7237757575757575
Standard deviation 

#### Annotator B

In [98]:
perform_independent_t_test(df2, df2_1, test_type='overall')
perform_independent_t_test(df2, df2_1, test_type='topic_match')
perform_independent_t_test(df2, df2_1, test_type='event_match')

Mean of df1 'Overall' seconds: 17.430112121212122
Standard deviation of df1 'Overall' seconds: 78.71832765881116

Mean of df2 'Overall' seconds: 14.81311111111111
Standard deviation of df2 'Overall' seconds: 85.22320782694844

T-statistic for 'Overall': 0.7097507998217069
P-value for 'Overall': 0.47794234154057214
There is no statistically significant difference in 'Overall' seconds.
Mean of df1 'Topic Match' seconds: 20.77467676767677
Standard deviation of df1 'Topic Match' seconds: 73.00063209276952

Mean of df2 'Topic Match' seconds: 20.280268686868688
Standard deviation of df2 'Topic Match' seconds: 68.70245099855636

T-statistic for 'Topic Match': 0.10972969298119084
P-value for 'Topic Match': 0.9126460340869996
There is no statistically significant difference in 'Topic Match' seconds.
Mean of df1 'Event Match' seconds: 14.085547474747475
Standard deviation of df1 'Event Match' seconds: 83.98919033966345

Mean of df2 'Event Match' seconds: 9.345953535353535
Standard deviation of d

#### Annotator C

In [99]:
perform_independent_t_test(df3, df3_1, test_type='overall')
perform_independent_t_test(df3, df3_1, test_type='topic_match')
perform_independent_t_test(df3, df3_1, test_type='event_match')

Mean of df1 'Overall' seconds: 5.112763636363637
Standard deviation of df1 'Overall' seconds: 40.26040445706177

Mean of df2 'Overall' seconds: 9.004171717171719
Standard deviation of df2 'Overall' seconds: 68.73251134613238

T-statistic for 'Overall': -1.5371157262553772
P-value for 'Overall': 0.12442491121593799
There is no statistically significant difference in 'Overall' seconds.
Mean of df1 'Topic Match' seconds: 8.849515151515153
Standard deviation of df1 'Topic Match' seconds: 56.69286271113746

Mean of df2 'Topic Match' seconds: 14.414521212121212
Standard deviation of df2 'Topic Match' seconds: 96.69664705324132

T-statistic for 'Topic Match': -1.1045838980622236
P-value for 'Topic Match': 0.2696088598239705
There is no statistically significant difference in 'Topic Match' seconds.
Mean of df1 'Event Match' seconds: 1.3760121212121212
Standard deviation of df1 'Event Match' seconds: 1.7371444202181017

Mean of df2 'Event Match' seconds: 3.5938222222222223
Standard deviation of

### 2. Annotation confidence

In [105]:
#get rows where only confidence metrics present

desired_variables = ['confidence_1', 'confidence_2']
filtered_df1 = df1[df1['variable'].isin(desired_variables)]


In [182]:
def preprocess_and_calculate_mean(df):
    # Assuming your DataFrame is named df
    desired_variables = ['confidence_1', 'confidence_2']
    filtered_df = df[df['variable'].isin(desired_variables)]

    # Recode values
    recode_map = {'--': 0, '-': 1, '+/-': 2, '+': 3, '++': 4}
    filtered_df['value'] = filtered_df['value'].replace(recode_map)

    # Pivot the DataFrame
    pivoted_df = filtered_df.pivot_table(index='unit_id',
                                         columns='variable',
                                         values='value',
                                         aggfunc='first').reset_index()

    # If you want to fill NaN values with something, you can use the fillna method
    pivoted_df.fillna(0, inplace=True)

    # Convert columns to numeric
    pivoted_df['confidence_1'] = pd.to_numeric(pivoted_df['confidence_1'])
    pivoted_df['confidence_2'] = pd.to_numeric(pivoted_df['confidence_2'])

    # Set 'unit_id' as the index
    pivoted_df.set_index('unit_id', inplace=True)

    # Calculate the mean of 'confidence_1' and 'confidence_2'
    mean_confidence_1 = pivoted_df['confidence_1'].mean()
    mean_confidence_2 = pivoted_df['confidence_2'].mean()

    # Calculate the overall mean
    overall_mean = pivoted_df[['confidence_1', 'confidence_2']].values.flatten().mean()

    return pivoted_df, mean_confidence_1, mean_confidence_2, overall_mean

#### Annotator A - assisted

In [180]:
processed_df, mean_confidence_1, mean_confidence_2, overall_mean = preprocess_and_calculate_mean(df1)

print(f"\nMean Confidence_1: {mean_confidence_1}")
print(f"Mean Confidence_2: {mean_confidence_2}")
print(f"Overall Mean Confidence: {overall_mean}")


Mean Confidence_1: 4.0
Mean Confidence_2: 4.0
Overall Mean Confidence: 4.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['value'] = filtered_df['value'].replace(recode_map)


#### Annotator A - not assisted

In [185]:
# df1 = ...  # your original DataFrame
processed_df, mean_confidence_1, mean_confidence_2, overall_mean = preprocess_and_calculate_mean(df1_1)

print(f"\nMean Confidence_topic: {mean_confidence_1}")
print(f"Mean Confidence_event: {mean_confidence_2}")
print(f"Overall Mean Confidence: {overall_mean}")


Mean Confidence_topic: 3.95959595959596
Mean Confidence_event: 4.0
Overall Mean Confidence: 3.9797979797979797


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['value'] = filtered_df['value'].replace(recode_map)


#### Annotator B - assisted

In [186]:
processed_df, mean_confidence_1, mean_confidence_2, overall_mean = preprocess_and_calculate_mean(df2)
print(f"\nMean Confidence_topic: {mean_confidence_1}")
print(f"Mean Confidence_event: {mean_confidence_2}")
print(f"Overall Mean Confidence: {overall_mean}")


Mean Confidence_topic: 3.9515151515151516
Mean Confidence_event: 3.9353535353535354
Overall Mean Confidence: 3.9434343434343435


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['value'] = filtered_df['value'].replace(recode_map)


#### Annotator B - unassisted

In [187]:
processed_df, mean_confidence_1, mean_confidence_2, overall_mean = preprocess_and_calculate_mean(df2_1)

print(f"\nMean Confidence_topic: {mean_confidence_1}")
print(f"Mean Confidence_event: {mean_confidence_2}")
print(f"Overall Mean Confidence: {overall_mean}")


Mean Confidence_topic: 3.95959595959596
Mean Confidence_event: 3.9575757575757575
Overall Mean Confidence: 3.9585858585858587


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['value'] = filtered_df['value'].replace(recode_map)


#### Annotator C - assisted

In [188]:
processed_df, mean_confidence_1, mean_confidence_2, overall_mean = preprocess_and_calculate_mean(df3)
print(f"\nMean Confidence_topic: {mean_confidence_1}")
print(f"Mean Confidence_event: {mean_confidence_2}")
print(f"Overall Mean Confidence: {overall_mean}")


Mean Confidence_topic: 3.9575757575757575
Mean Confidence_event: 3.98989898989899
Overall Mean Confidence: 3.973737373737374


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['value'] = filtered_df['value'].replace(recode_map)


#### Annotator C - unassisted

In [189]:
processed_df, mean_confidence_1, mean_confidence_2, overall_mean = preprocess_and_calculate_mean(df3_1)

print(f"\nMean Confidence_topic: {mean_confidence_1}")
print(f"Mean Confidence_event: {mean_confidence_2}")
print(f"Overall Mean Confidence: {overall_mean}")


Mean Confidence_topic: 3.9515151515151516
Mean Confidence_event: 3.9474747474747476
Overall Mean Confidence: 3.9494949494949494


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['value'] = filtered_df['value'].replace(recode_map)


### 3. Annotation quality

#### ICR for unassisted (Krippendorff's alpha)

In [343]:
# Pivot the DataFrame to calculate Krippendorff's alpha
unassisted_df = df4.pivot(index=['coder_id', 'coder', 'jobset', 'unit_id', 'unit_status'],
                     columns='variable', values='value').reset_index()

unassisted_df = unassisted_df.rename_axis(None, axis=1)

# If there are NaN values in the pivoted DataFrame, you can replace them with a default value like this
unassisted_df = unassisted_df.fillna('N/A')

In [492]:
unassisted_df.to_csv('../unassisted_pivoted.csv')

In [344]:
#calculate alpha before any removal of flagged cases
alpha_topic = simpledorff.calculate_krippendorffs_alpha_for_df(unassisted_df,
                                                        experiment_col='unit_id',
                                                        annotator_col='coder_id',
                                                        class_col='topic_match')
alpha_topic

0.8327855329679441

In [345]:
#calculate alpha before any removal of flagged cases
alpha_event = simpledorff.calculate_krippendorffs_alpha_for_df(unassisted_df,
                                                        experiment_col='unit_id',
                                                        annotator_col='coder_id',
                                                        class_col='event_match')
alpha_event

0.5984078839063223

#### ICR for assisted (Krippendorff's alpha)

In [498]:
# Assuming df1, df2, and df3 are your DataFrames
df5 = pd.concat([df1, df2, df3])
df5.to_csv('../assisted.csv')

In [340]:
# Pivot the DataFrame to calculate Krippendorff's alpha
assisted_df = df5.pivot(index=['coder_id', 'coder', 'jobset', 'unit_id', 'unit_status'],
                     columns='variable', values='value').reset_index()

assisted_df = assisted_df.rename_axis(None, axis=1)

# If there are NaN values in the pivoted DataFrame, you can replace them with a default value like this
assisted_df = assisted_df.fillna('N/A')

In [493]:
assisted_df.to_csv('../assisted_pivoted.csv')

In [341]:
#calculate alpha before any removal of flagged cases
alpha_topic = simpledorff.calculate_krippendorffs_alpha_for_df(assisted_df,
                                                        experiment_col='unit_id',
                                                        annotator_col='coder_id',
                                                        class_col='topic_match')
alpha_topic

0.7991604967061156

In [342]:
#calculate alpha before any removal of flagged cases
alpha_event = simpledorff.calculate_krippendorffs_alpha_for_df(assisted_df,
                                                        experiment_col='unit_id',
                                                        annotator_col='coder_id',
                                                        class_col='event_match')
alpha_event

0.6374721189591078

It seems that ICR is better for topic match when unassisted, but worse for event match when unassisted compared to the assisted version which scores  sligthly below the ICR of topic match and slightly above event match.

What can we tell based on this? coders seem to perform similarly regardless of assistance it appears that the difference is more pronounced for the event-match level where assistance seems to help annotators. 

shall I do some kind of regression to see the type of model's influence on the results?

### 4. Alignment between human annotation and LLM decision making

In [335]:
#read in LLM-annotated files

d1 = pd.read_csv('../7b_assisted.csv')
d2 = pd.read_csv('../13b_assisted.csv')

In [337]:
d1['unit_id'] = d1['ID1'].astype(str) + '_' + d1['ID2'].astype(str)
# Rename the 'ID1' column to 'new_column_name'
d1.rename(columns={'Topic_match': 'topic_match', 'Event_match' : 'event_match'}, inplace=True)
d1['coder_id'] = '7B-chat'
d1['coder_id'] = d1['coder_id'].astype(str)

In [338]:
# Merge ID1 and ID2 into a new column unit_id
d2['unit_id'] = d2['ID1'].astype(str) + '_' + d2['ID2'].astype(str)
# Rename the 'ID1' column to 'new_column_name'
d2.rename(columns={'Topic_match': 'topic_match', 'Event_match' : 'event_match'}, inplace=True)

d2['coder_id'] = '13B-chat'
d2['coder_id'] = d2['coder_id'].astype(str)

In [333]:
# Assuming your data is stored in a DataFrame called 'df'
assisted_df['event_match'] = assisted_df['event_match'].map({'Yes': 1, 'No': 0})
assisted_df['topic_match'] = assisted_df['topic_match'].map({'Yes': 1, 'No': 0})

In [334]:
assisted_df

Unnamed: 0,coder_id,coder,jobset,unit_id,unit_status,confidence_1,confidence_2,event_match,topic_match
0,2606,s_ching@annotator.com,All,3285217_3285337,DONE,++,++,,
1,2606,s_ching@annotator.com,All,3285217_3286175,DONE,++,++,,
2,2606,s_ching@annotator.com,All,3285217_3287465,DONE,++,++,,
3,2606,s_ching@annotator.com,All,3285217_6290567,DONE,++,++,,
4,2606,s_ching@annotator.com,All,3285217_6290581,DONE,++,++,,
...,...,...,...,...,...,...,...,...,...
1480,2608,floris_v@annotator.com,All,6290557_6290579,DONE,++,++,,
1481,2608,floris_v@annotator.com,All,6290567_6290556,DONE,++,++,,
1482,2608,floris_v@annotator.com,All,6290568_3287414,DONE,++,++,,
1483,2608,floris_v@annotator.com,All,6290568_3287485,DONE,++,++,,


In [319]:
grouped_dfs = dict(tuple(assisted_df.groupby('coder_id')))

# Accessing individual DataFrames based on coder_id
coder_id_2606 = grouped_dfs[2606]
coder_id_2607 = grouped_dfs[2607]
coder_id_2608 = grouped_dfs[2608]

#### For annotator A and LlaMA 7B-chat

In [320]:
#check agreement between coder_id_2608 and Llama 7B
coder_id_2608['coder_id'] = coder_id_2608['coder_id'].astype(str)

In [321]:
# Merge ID1 and ID2 into a new column unit_id
d1['unit_id'] = d1['ID1'].astype(str) + '_' + d1['ID2'].astype(str)
# Rename the 'ID1' column to 'new_column_name'
d1.rename(columns={'Topic_match': 'topic_match', 'Event_match' : 'event_match'}, inplace=True)
d1
d1['coder_id'] = '7B-chat'
d1['coder_id'] = d1['coder_id'].astype(str)

In [322]:
# Concatenate the two dataframes vertically
combined_df = pd.concat([d1, coder_id_2608], ignore_index=True)

# Calculate Krippendorff's alpha for the combined dataframe
alpha_topic = simpledorff.calculate_krippendorffs_alpha_for_df(combined_df,
                                                               experiment_col='unit_id',
                                                               annotator_col='coder_id',
                                                               class_col='topic_match')

In [323]:
alpha_topic

0.290184906177236

In [None]:
# Concatenate the two dataframes vertically
combined_df = pd.concat([d1, coder_id_2608], ignore_index=True)

# Calculate Krippendorff's alpha for the combined dataframe
alpha_event = simpledorff.calculate_krippendorffs_alpha_for_df(combined_df,
                                                               experiment_col='unit_id',
                                                               annotator_col='coder_id',
                                                               class_col='event_match')
alpha_event

#### For annotator B and LlaMA 13B-chat

In [272]:
coder_id_2606 

Unnamed: 0,coder_id,coder,jobset,unit_id,unit_status,confidence_1,confidence_2,event_match,topic_match
0,2606,s_ching@annotator.com,All,3285217_3285337,DONE,++,++,0,1
1,2606,s_ching@annotator.com,All,3285217_3286175,DONE,++,++,1,1
2,2606,s_ching@annotator.com,All,3285217_3287465,DONE,++,++,1,1
3,2606,s_ching@annotator.com,All,3285217_6290567,DONE,++,++,0,1
4,2606,s_ching@annotator.com,All,3285217_6290581,DONE,++,++,1,1
...,...,...,...,...,...,...,...,...,...
490,2606,s_ching@annotator.com,All,6290557_6290579,DONE,++,++,0,1
491,2606,s_ching@annotator.com,All,6290567_6290556,DONE,++,++,0,1
492,2606,s_ching@annotator.com,All,6290568_3287414,DONE,++,++,0,1
493,2606,s_ching@annotator.com,All,6290568_3287485,DONE,++,++,1,1


In [273]:
# Merge ID1 and ID2 into a new column unit_id
d2['unit_id'] = d2['ID1'].astype(str) + '_' + d2['ID2'].astype(str)
# Rename the 'ID1' column to 'new_column_name'
d2.rename(columns={'Topic_match': 'topic_match', 'Event_match' : 'event_match'}, inplace=True)

d2['coder_id'] = '13B-chat'
d2['coder_id'] = d2['coder_id'].astype(str)

In [274]:
#check agreement between coder_id_2608 and Llama 7B
coder_id_2606['coder_id'] = coder_id_2606['coder_id'].astype(str)


In [276]:
# Concatenate the two dataframes vertically
combined_df = pd.concat([d2, coder_id_2606], ignore_index=True)

# Calculate Krippendorff's alpha for the combined dataframe
alpha_topic = simpledorff.calculate_krippendorffs_alpha_for_df(combined_df,
                                                               experiment_col='unit_id',
                                                               annotator_col='coder_id',
                                                               class_col='topic_match')
alpha_topic

0.2160486616380749

In [277]:
# Concatenate the two dataframes vertically
combined_df = pd.concat([d2, coder_id_2606], ignore_index=True)

# Calculate Krippendorff's alpha for the combined dataframe
alpha_event = simpledorff.calculate_krippendorffs_alpha_for_df(combined_df,
                                                               experiment_col='unit_id',
                                                               annotator_col='coder_id',
                                                               class_col='event_match')
alpha_event

0.09721311776706343

#### For LlaMA 7B-chat and LlaMA 13B-chat

In [339]:
# Concatenate the two dataframes vertically
combined_df = pd.concat([d1, d2], ignore_index=True)

# Calculate Krippendorff's alpha for the combined dataframe
alpha_topic = simpledorff.calculate_krippendorffs_alpha_for_df(combined_df,
                                                               experiment_col='unit_id',
                                                               annotator_col='coder_id',
                                                               class_col='topic_match')
alpha_topic

0.09455450983264357

In [285]:
# Concatenate the two dataframes vertically
combined_df = pd.concat([d1, d2], ignore_index=True)

# Calculate Krippendorff's alpha for the combined dataframe
alpha_event = simpledorff.calculate_krippendorffs_alpha_for_df(combined_df,
                                                               experiment_col='unit_id',
                                                               annotator_col='coder_id',
                                                               class_col='event_match')
alpha_event

0.3453632876712329

### 5. Expert annotator makes final decision on mismatched cases

In [286]:
#take dataset with assisted and unassisted tasks 
#find all mismatched cases for both sets and make final decision on them - note down final code

In [353]:
disagreements_assisted = pd.crosstab(index=[assisted_df['unit_id'], assisted_df['coder_id']], columns=assisted_df['topic_match'], margins=True)
disagreements_assisted.to_excel('../dis_assisted.xlsx')

In [393]:
disagreements_assisted 

Unnamed: 0_level_0,topic_match,No,Yes,All
unit_id,coder_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3285217_3285337,2606,0,1,1
3285217_3285337,2607,0,1,1
3285217_3285337,2608,0,1,1
3285217_3286175,2606,0,1,1
3285217_3286175,2607,0,1,1
...,...,...,...,...
6290568_3287485,2608,0,1,1
6290579_6290505,2606,0,1,1
6290579_6290505,2607,0,1,1
6290579_6290505,2608,0,1,1


In [354]:
disagreements_unassisted = pd.crosstab(index=[unassisted_df['unit_id'], unassisted_df['coder_id']], columns=unassisted_df['topic_match'], margins=True)
disagreements_unassisted.to_excel('../dis_unassisted.xlsx')

In [351]:
disagreements_unassisted

Unnamed: 0_level_0,topic_match,No,Yes,All
unit_id,coder_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3285217_3285335,2606,1,0,1
3285217_3285335,2607,1,0,1
3285217_3285335,2608,1,0,1
3285217_3287448,2606,0,1,1
3285217_3287448,2607,0,1,1
...,...,...,...,...
6290574_6290556,2608,0,1,1
6290579_3287398,2606,0,1,1
6290579_3287398,2607,0,1,1
6290579_3287398,2608,0,1,1


In [430]:
def find_disagreements(unassisted_df):
    # Group by unit_id and count unique values for event_match and topic_match
    counts = unassisted_df.groupby('unit_id').agg({'event_match': 'nunique', 'topic_match': 'nunique'})

    # Filter disagreements based on event_match and topic_match
    disagreements_ua_event = list(counts[counts['event_match'] > 1].index)
    disagreements_ua_topic = list(counts[counts['topic_match'] > 1].index)

    # Return the results
    return disagreements_ua_event, disagreements_ua_topic


event_disagreements_ua, topic_disagreements_ua = find_disagreements(unassisted_df)

print("Unit IDs where annotators disagreed on event_match:", event_disagreements_ua)
print("Number of unit IDs where annotators disagreed on event_match:", len(event_disagreements_ua))
print("Unit IDs where annotators disagreed on topic_match:", topic_disagreements_ua)
print("Number of unit IDs where annotators disagreed on topic_match:", len(topic_disagreements_ua))


Unit IDs where annotators disagreed on event_match: ['3285322_3287397', '3285325_3287414', '3285326_3285227', '3285625_3287399', '3285634_3285375', '3286377_3287414', '3287459_3287398', '3287459_6290507', '3287467_3285217', '3287468_3285227', '3287471_3285217', '3287471_3285627', '3287471_3285661', '3287472_3286375', '3287472_3287471', '3287472_3290571', '3287472_3290653', '3287473_3290604', '3287474_3285226', '3287475_3287471', '3287477_3287397', '3287477_3287414', '3287477_6290581', '3287479_3287467', '3287479_3287470', '3287480_3287470', '3287500_3285635', '3287516_3287477', '3287521_3287479', '3287521_3290622', '3287527_3287472', '3287536_3287500', '3287536_3288493', '3287578_3285375', '3287578_3286171', '3290580_3287448', '3290593_3290566', '3290593_3290653', '3290593_6290574', '3290596_3285325', '3290606_3290566', '3290621_6290555', '3290622_3285325', '3290622_6290568', '3290653_6290507', '3290705_3286175', '3290705_3290593', '6290579_3287398']
Number of unit IDs where annotators

In [432]:
event_disagreements_a, topic_disagreements_a = find_disagreements(assisted_df)
print("Unit IDs where annotators disagreed on event_match:", event_disagreements_a)
print("Number of unit IDs where annotators disagreed on event_match:", len(event_disagreements_a))
print("Unit IDs where annotators disagreed on topic_match:", topic_disagreements_a)
print("Number of unit IDs where annotators disagreed on topic_match:", len(topic_disagreements_a))


Unit IDs where annotators disagreed on event_match: ['3285217_6290581', '3285322_3286175', '3285323_6290568', '3285323_6290579', '3285326_3285375', '3285326_3285636', '3285634_6290523', '3286373_6290505', '3287470_3286188', '3287471_3290656', '3287471_6290505', '3287472_3286174', '3287473_3287446', '3287473_6290574', '3287474_3287399', '3287474_3287459', '3287474_3290571', '3287475_3287470', '3287476_3287471', '3287477_3286373', '3287479_3287477', '3287516_3285226', '3287516_3287397', '3287516_3290656', '3287521_3287465', '3287527_3286375', '3287527_3287475', '3287527_3290593', '3287532_3285335', '3287577_3290580', '3287578_3287397', '3287580_3286171', '3288493_3287500', '3290580_3290622', '3290596_3286174', '3290596_3290621', '3290606_3287398', '3290606_3290653', '3290621_3290566', '3290621_3290622', '3290621_6290579', '3290622_3287446', '3290622_3287458', '3290697_3287522', '3290705_6290581', '6290568_3287485']
Number of unit IDs where annotators disagreed on event_match: 46
Unit IDs

### Merge unassisted set with original data and then ammend disagreements

In [462]:
#read in dataframe 
import os
# Function to navigate up 'n' levels
def navigate_up(current_directory, levels):
    for _ in range(levels):
        current_directory = os.path.dirname(current_directory)
    return current_directory

# Get the current working directory
current_directory = os.getcwd()

# Specify the number of levels to navigate up (4 levels in this case)
levels_to_navigate = 4

# Navigate up 'levels_to_navigate' folders
parent_directory = navigate_up(current_directory, levels_to_navigate)

In [463]:
# Define the path to the data file
file_path = os.path.join(parent_directory, 'newspaper_data', 'final_1.csv')
du=pd.read_csv(file_path)

In [464]:
# Merge ID1 and ID2 into a new column unit_id
du['unit_id'] = du['ID1'].astype(str) + '_' + du['ID2'].astype(str)
# Rename the 'ID1' column to 'new_column_name'

In [465]:
#using the mismatched unit_ids filter the dataset 
topic_du_mis = du[du['unit_id'].isin(topic_disagreements_ua)]
event_du_mis = du[du['unit_id'].isin(event_disagreements_ua)]

In [473]:
topic_du_mis.to_csv('../topic_ua_mis.csv')
event_du_mis.to_csv('../event_ua_mis.csv')

In [467]:
#merge unasisted with unassited coded

# Merge df1 with df2 on 'unit_id'
#This contains also agreed and disagreed cases
merged_ua = pd.merge(du, unassisted_df, on='unit_id')

In [583]:
# Drop multiple columns
columns_to_drop = ['ID1', 'ID2','coder','jobset','unit_status', 'confidence_1', 'confidence_2']
merged_ua = merged_ua.drop(columns=columns_to_drop)
merged_ua.to_csv('../final_unassisted_merged.csv')

### For assisted

In [568]:
# Define the path to the data file
file_path = os.path.join(parent_directory, 'newspaper_data', 'final_2.csv')
da=pd.read_csv(file_path)

In [569]:
# Merge ID1 and ID2 into a new column unit_id
da['unit_id'] = da['ID1'].astype(str) + '_' + da['ID2'].astype(str)
# Rename the 'ID1' column to 'new_column_name'

In [570]:
#using the mismatched unit_ids filter the dataset 
topic_da_mis = da[da['unit_id'].isin(topic_disagreements_a)]
event_da_mis = da[da['unit_id'].isin(event_disagreements_a)]

topic_da_mis.to_csv('../topic_a_mis.csv')
event_da_mis.to_csv('../event_a_mis.csv')

In [571]:
#merge unasisted with unassited coded

# Merge df1 with df2 on 'unit_id'
#This contains also agreed and disagreed cases
merged_a = pd.merge(da, assisted_df, on='unit_id')


In [572]:
# Drop multiple columns
columns_to_drop = ['ID1', 'ID2','coder','jobset','unit_status', 'confidence_1', 'confidence_2']
merged_a = merged_a.drop(columns=columns_to_drop)
merged_a.to_csv('../final_assisted_merged.csv')

### Create final validated dataset

In [596]:
def merge_with_expert(df1_path, df2_path, df3_path):
    # Read in the DataFrames
    df1 = pd.read_csv(df1_path)
    df2 = pd.read_csv(df2_path)
    df3 = pd.read_csv(df3_path)

    # Pivot df2
    df2_piv = df2.pivot_table(index='unit_id',
                              columns='variable',
                              values='value',
                              aggfunc='first').reset_index()

    df2_piv.fillna(0, inplace=True)

    # Pivot df3
    df3_piv = df3.pivot_table(index='unit_id',
                              columns='variable',
                              values='value',
                              aggfunc='first').reset_index()

    df3_piv.fillna(0, inplace=True)

    # Merge the three DataFrames on 'unit_id'
    merged_df = pd.merge(df1, df2_piv[['unit_id', 'topic_match']], how='left', on='unit_id', suffixes=('_df1', '_df2'))
    merged_df = pd.merge(merged_df, df3_piv[['unit_id', 'event_match']], how='left', on='unit_id', suffixes=('_merged', '_df3'))

    # Replace 'topic_match' in df1 with the values from df2 where they exist
    merged_df['topic_match'] = merged_df['topic_match_df2'].combine_first(merged_df['topic_match_df1'])

    # Replace 'event_match' in the merged_df with the values from df3 where they exist
    merged_df['event_match'] = merged_df['event_match_df3'].combine_first(merged_df['event_match_merged'])

    # Drop the temporary columns used for merging
    merged_df = merged_df.drop(['topic_match_df1','topic_match_df2', 'event_match_df3', 'event_match_merged'], axis=1)

    #add function to retain only the codings of coder 2606 any coder would do as now the annotations are aligned
    # Add function to retain only the codings of coder 2606
    one_coder = merged_df[merged_df['coder_id'] == 2606].copy()

    return one_coder

# Example usage
df1_path = '../final_assisted_merged.csv'
df2_path = '../annotations_185_expert_topic_a.csv.csv'
df3_path = '../annotations_186_expert_event_a.csv.csv'

assisted_final = merge_with_expert(df1_path, df2_path, df3_path)

In [597]:
df1_path = '../final_unassisted_merged.csv'
df2_path = '../annotations_184_expert_topic_ua.csv.csv'
df3_path = '../annotations_187_expert_event_ua.csv.csv'
unassisted_final = merge_with_expert(df1_path, df2_path, df3_path)

In [600]:
final_validated = pd.concat([assisted_final, unassisted_final])

In [None]:
final_validated.to_csv('../full_validated.csv')

### Get embeddings for the texts 

In [28]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import torch

In [10]:
final_validated = pd.read_csv('../full_validated.csv')

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Similarity_Score,Text1,Text2,Group,Date1,Date2,Publisher1,Publisher2,proper_nouns1,proper_nouns2,keywords1,keywords2,unit_id,coder_id,topic_match,event_match
0,0,0,0.646155,Geweld bij antilockdowndemo in Dublin; Bij een...,Esther Ouwehand had niets met de plek waar ze ...,medium,2021-02-27 20:30:26,2021-03-01 00:00:00,NOS liveblog,Algemeen Dagblad,"Geweld, Dublin, Leo Varadkar","Esther Ouwehand, Vinex, Esther Ouwehand, Uranu...","['ongeregeldheden', 'antilockdowndemo', 'wapen...","['hoornespolder', 'neptunus', 'rijtjeswoningen...",3287531_3286186,2606,Yes,No
1,3,3,0.760930,Stelling 5: gevaccineerde burgers moeten als e...,Sportscholen demonstratief open: 'Bewegen is n...,high,2021-02-28 23:18:22,2021-03-01 00:00:00,NOS liveblog,Trouw,"Wilders, Marijnissen, Marijnissen, Klaver van ...","Tino Hoogendijk Sport, J","['sneltesten', 'gevaccineerden', 'gevaccineerd...","['instructeur', 'hometrainers', 'housebeat', '...",3287467_3285332,2606,No,No
2,6,6,0.820325,Vraag van de eigenaresse van een couscousbar a...,De uitzending van 1 maart: Gasvrij duurder dan...,high,2021-02-28 22:43:49,2021-03-01 12:06:47,NOS liveblog,Nieuwsuur,"Wilders, Nadia, Wilders, Wilders bestrijdt, Na...","Friese Garijp, Noord-Holland Noord, Jan Nieuwe...","['onwenselijk', 'couscousbar', 'couscous', 'ne...","['nieuwenburg', 'gasvrij', 'besmettingen', 'aa...",3287471_6290579,2606,Yes,No
3,9,9,0.774437,Stelling 4: de rekening van de coronacrisis mo...,Helft potentiële Forumstemmers vermoedt wereld...,high,2021-02-28 22:48:57,2021-03-01 00:00:00,NOS liveblog,De Volkskrant,"Marijnissen, CDA, VVD","Forum voor Democratie, Ipsos, Baudet, Urk","['inkomensongelijkheid', 'marijnissen', 'belas...","['ipsos', 'forumstemmers', 'kiesgerechtigden',...",3287470_3285229,2606,Yes,No
4,12,12,0.731456,Stelling 2: minimaal 10 procent van de bewind...,CPB-doorrekening: risico op maken van uitglije...,high,2021-02-28 22:03:24,2021-03-01 00:00:00,NOS liveblog,Het Financieele Dagblad,"GroenLinks, SP, D66, Wilders, Zwarte Piet, Fri...","Is de invloed, CPB, Jean Dohmen, Centraal Plan...","['ronald', 'hilariteit', 'zit', 'westerse', 'm...","['doorrekeningen', 'uitglijer', 'berekeningen'...",3287477_3290566,2606,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,1470,1470,0.631112,Jozef Deleu: ‘Iets onder woorden brengen gaat ...,"Pittig RTL-debat over klimaat, discriminatie e...",medium,2021-03-01 00:00:00,2021-03-01 01:19:49,NRC Handelsblad,NOS nieuws,"Jozef Deleu, Jozef Deleu, Nederland zelfzuchti...","Wilders, Zwarte Piet, VVD, Wilders, Marijnisse...","['vlaams', 'zelfzuchtiger', 'bejaard', 'rekkem...","['gevaccineerden', 'gamechanger', 'coronamaatr...",3285657_3287465,2606,No,No
986,1473,1473,0.758489,Stelling 2: minimaal 10 procent van de bewind...,"Partijen willen hogere lasten voor bedrijven, ...",high,2021-02-28 22:03:24,2021-03-01 10:30:10,NOS liveblog,NOS nieuws,"GroenLinks, SP, D66, Wilders, Zwarte Piet, Fri...","CPB, SGP, CPB, VVD, CDA, D66, GroenLinks, SP, ...","['ronald', 'hilariteit', 'zit', 'westerse', 'm...","['lastenverzwaring', 'doorrekeningen', 'verkie...",3287477_6290557,2606,Yes,No
987,1476,1476,0.756344,Naast groen en digitaal nu ook sociaal; Deze w...,Kaag (D66) wil meer vrijheden toestaan met vac...,high,2021-03-01 00:00:00,2021-03-01 00:00:00,Het Financieele Dagblad,Algemeen Dagblad,"Europese Commissie, Ursula von der Leyen, Euro...","D66, Laten, Isra, Europese Commissie","['brexitreferendum', 'baanzekerheid', 'schiffe...","['gevaccineerde', 'vaccinatiebewijs', 'zelftes...",3290586_3286178,2606,Yes,No
988,1479,1479,0.788596,"Een sterkere EU graag, maar nu is dat even bij...",Digitale campagne wordt platter én dieper; Ver...,high,2021-03-01 00:00:00,2021-03-01 00:00:00,Trouw,Trouw,"PVV, D66, VVD","VVD, SP","['bikkelharde', 'mondkapjes', 'europese', 'opr...","['labradorpuppy', 'internethumor', 'beeldgrapp...",3285325_3285375,2606,Yes,No


In [18]:

model_name = 'GroNLP/bert-base-dutch-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Text1 = final_validated['Text1'].to_list()
Text2 = final_validated['Text2'].to_list()

Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:

#function to get embeddings with BERTje
def get_embeddings(corpus):
    embeddings = []
    total_articles = len(corpus)

    for article in tqdm(corpus, desc='Computing embeddings', ncols=80):
        inputs = tokenizer(str(article), return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.pooler_output.squeeze().numpy())

    return embeddings
text1_embeddings = get_embeddings(Text1)

Computing embeddings: 100%|███████████████████| 990/990 [00:42<00:00, 23.36it/s]


In [30]:
text2_embeddings = get_embeddings(Text2)

Computing embeddings: 100%|███████████████████| 990/990 [00:43<00:00, 22.83it/s]


In [31]:
# Add the embeddings to the final_validated DataFrame
final_validated['Text1_Embeddings'] = text1_embeddings
final_validated['Text2_Embeddings'] = text2_embeddings

In [33]:
final_validated.to_csv('../final_validated_with_embeddings.csv', index = False)

In [35]:
# Specify the columns to keep in the new DataFrame
columns_to_keep = ['unit_id', 'Date1', 'Date2', 'Text1_Embeddings', 'Text2_Embeddings', 'topic_match', 'event_match', 'Similarity_Score', 'Publisher1', 'Publisher2']  # Replace with your column names

# Create a new DataFrame with only the specified columns
gold_standard = final_validated[columns_to_keep].copy()

In [37]:
gold_standard.to_csv('../gold_standard_inf_flows.csv')