In [3]:
import ast
import json
import os

import numpy as np
import pandas as pd
from QualtricsAPI.Setup import Credentials
from QualtricsAPI.Survey import Responses

# Data for studies 1 and 2

These data have been processed in a notebook similar to this, and will now be combined with the data from study 3.

In [4]:
# ratings of studies 1 and 2, as exported by the earlier processing notebook
df = pd.read_csv(
    r"C:\Users\conix\Dropbox\aWriting\humanities impact\data\IIH_all_data.csv",
    index_col="Unnamed: 0",
)

# remove the test sets from the df
# we did not record the data, so they are just NaN
# there are 30, i.e. 2*3 sets (one for each group)
df = df.loc[df.binary.notna()]

# lower-case all column names, then give "b3_l" its descriptive name
df = df.rename(columns=str.lower).rename(columns={"b3_l": "deliverable"})

# all raters of studies 1 and 2 were humanities scholars
df["field_group"] = "Humanities"

# show every column when a frame is displayed
pd.set_option("display.max_columns", None)
df.head(2)

Unnamed: 0,block,field,rater,binary,abstract,ordinal,rater_disc,pubyear,doi,doctype,group,chauvinism,capture,citation,socialmedia,mention,outlier,usage,present,intolerance,ethics,empirical,environment,education,wellbeing,deliverable,abstract_text,title,abstract_length,abstract_wordcount,field_group
0,2,HIST,521.0,0.0,QID2_1,0.0,LING,2015.0,10.1080/09612025.2015.1028209,Article,group1,0.0,30.0,3.0,7.0,0.0,0.0,1021.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,This article examines the content of Women's C...,'Our own paper': evaluating the impact of Wome...,757,115,Humanities
1,2,HIST,142.0,0.0,QID2_1,0.0,PHIL,2015.0,10.1080/09612025.2015.1028209,Article,group1,0.0,30.0,3.0,7.0,0.0,0.0,1021.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,This article examines the content of Women's C...,'Our own paper': evaluating the impact of Wome...,757,115,Humanities


# Document data

The abstracts with their DOI, title, document type and publication year. These were downloaded from Web of Science (WoS).

In [5]:
# load the abstracts for group3
documents = pd.read_csv(
    r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\IIH_followupdata_version3.csv",
    sep=";",
)

# keep the original row position of each document as an explicit column
documents = documents.reset_index().rename(columns={"index": "ORIGINAL_INDEX"})

# keep relevant columns, lower-case their names,
# and store the abstract under "abstract_text"
document_cols = ["ORIGINAL_INDEX", "DOI", "TITLE", "ABSTRACT", "PUBYEAR", "DOCTYPE"]
documents = (
    documents[document_cols]
    .rename(columns=str.lower)
    .rename(columns={"abstract": "abstract_text"})
)

# Rater data

Rater codes as well as demographic information and their fields. We used codes rather than names to protect the identity of these raters. Only the member of the research team who managed the hiring (LL) knew the names.

In [6]:
# rater data for study 3
file_path = r"C:\Users\conix\Dropbox\aWriting\humanities impact\data"

with open(file_path + "\\rater_data.txt", "r") as handle:
    rater_data = json.loads(handle.read())

# gender and weird for studies 1 and 2 (collected later)
# some uncertainty about 321, 342, 411, 421, 511, 521
# both files hold a Python literal that is parsed with ast.literal_eval
with open(file_path + "\\pilot_gender.txt", "r") as handle:
    pilot_gender = ast.literal_eval(handle.read())

with open(file_path + "\\pilot_weird.txt", "r") as handle:
    pilot_weird = ast.literal_eval(handle.read())

# translate the rater code into the rater's sex / WEIRD status
# NOTE(review): `replace` leaves a rater code untouched when it is not a key
# of the lookup, so such rows would keep the code instead of a label - confirm
# that every rater of studies 1 and 2 is covered.
for target_col, lookup in (("sex", pilot_gender), ("weird", pilot_weird)):
    df[target_col] = df["rater"].replace(lookup)

# Coding data

These are the topic codes for all the abstracts. Each abstract was coded independently by two members of the research team; the codes were then compared and discussed until consensus was reached.

In [7]:
topics_study3 = pd.read_csv(
    r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\abstract_coding\IIH_codingStudy2_DisagreementsExcel_corrected.csv",
    sep=";",
)
topics_studies12 = pd.read_csv(
    r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\abstract_coding\final_codes.csv",
    sep=";",
)

# Study 3: every code has one column per coder (suffixes _LIN and _OLI).
# Stop if the two coders still differ on any row for any code.
code_cols = [
    "A1 time",
    "A2 -ism",
    "A3 ethics",
    "A4 emp",
    "A5 world",
    "A6 non-lit",
    "B1 edu",
    "B2 human",
    "B3 deliv",
]

for code in code_cols:
    coders_agree = topics_study3[code + "_LIN"] == topics_study3[code + "_OLI"]
    assert coders_agree.all(), f"col {code} has disagreements still"

# the coders agree everywhere, so the codes of one of them (LIN) are kept,
# together with the DOI column
lin_cols = [name for name in topics_study3.columns if "LIN" in name or "DOI" in name]

# descriptive names, applied by position to both coding tables
column_names = [
    "doi",
    "present",
    "intolerance",
    "ethics",
    "empirical",
    "environment",
    "fiction",
    "education",
    "wellbeing",
    "deliverable",
]

topics_study3 = topics_study3[lin_cols].set_axis(column_names, axis=1)

# studies 1 and 2: select the matching columns, rename them too, and drop
# the duplicated documents (they carry the same codes)
studies12_cols = [
    "DOI",
    "Present",
    "Intolerance",
    "Ethics",
    "Empirical",
    "Environment",
    "Fiction",
    "Education",
    "Wellbeing",
    "Deliverable",
]
topics_studies12 = (
    topics_studies12[studies12_cols]
    .set_axis(column_names, axis=1)
    .drop_duplicates(subset="doi")
)

# Look up the new codes for every rating of studies 1 and 2.
# A left merge keeps the rows of `df` in their original order, but its result
# carries a fresh 0..n-1 index, whereas `df` still has the labels of the CSV
# (with the test-set rows filtered out). The looked-up values are therefore
# written back by POSITION; aligning them on index labels would attach codes
# to the wrong ratings.
merged = df[["doi"]].merge(
    topics_studies12[["doi", "present", "fiction"]], on="doi", how="left"
)
assert len(merged) == len(df), "doi lookup changed the number of ratings"

# adapt present code for the old data (because recoded after redefinition);
# keep the old code wherever no new code is available
present_new = merged["present"]
df["present"] = df["present"].where(
    present_new.isna().to_numpy(), present_new.to_numpy()
)

# add fiction to the old data (because fiction introduced at study3)
df["fiction"] = merged["fiction"].to_numpy()

# Survey data

## Load the data and remove what we don't want

In [10]:
# Load the main study data from qualtrics
# for study 3, raters did the work in three chunks and got paid separately for each chunk.
# Because of that, there are three separate surveys for study 3. We combine the data for these in one df.

# ids of the three surveys
id_s1 = "SV_6g14RQpKZGSbx3g"
id_s2 = "SV_8pGlqv9GqN2OrVI"
id_s3 = "SV_bQ75Bb7jwCEnHBI"

# The API token is a secret: it is read from the environment instead of being
# stored in the notebook. A token that was ever saved in a shared or published
# copy of this notebook should be revoked and replaced.
qtoken = os.environ.get("QUALTRICS_API_TOKEN")
if not qtoken:
    raise RuntimeError(
        "Set the QUALTRICS_API_TOKEN environment variable before running this cell"
    )

# qualtrics data center
qdc = "fra1"

# import data through API and store in dfs
Credentials().qualtrics_api_credentials(token=qtoken, data_center=qdc)

df1 = Responses().get_survey_responses(survey=id_s1)
df2 = Responses().get_survey_responses(survey=id_s2)
df3 = Responses().get_survey_responses(survey=id_s3)

surveys = [df1, df2, df3]

In [11]:
# save for uploading on zenodo
# NOTE(review): at this point the frames are the unmodified API responses;
# columns such as RecipientEmail or IPAddress (listed in `dropcols` in the
# next cell) are only removed afterwards - confirm that these files contain
# nothing identifying before they are published.
raw_exports = (
    (
        df1,
        r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\data_upload\raw_group3_1_publish.csv",
    ),
    (
        df2,
        r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\data_upload\raw_group3_2_publish.csv",
    ),
    (
        df3,
        r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\data_upload\raw_group3_3_publish.csv",
    ),
)

for raw_survey, target_path in raw_exports:
    raw_survey.to_csv(target_path)

In [12]:
# Survey columns that are removed from every survey, grouped by what they describe
_record_cols = ["RecordedDate", "ResponseId"]
_recipient_cols = [
    "RecipientLastName",
    "RecipientFirstName",
    "RecipientEmail",
    "ExternalReference",
]
_channel_cols = [
    "LocationLatitude",
    "LocationLongitude",
    "DistributionChannel",
    "UserLanguage",
]
_status_cols = ["Status", "IPAddress", "Finished", "Progress"]

dropcols = [*_record_cols, *_recipient_cols, *_channel_cols, *_status_cols]

# store information like when and how long raters worked;
# each entry collects one item per cleaned survey
survey_info = {key: [] for key in ("durations", "dates", "block_times", "titles")}


def clean_surveys(survey):
    """Reduce one raw survey frame to the ratings of the real, finished raters.

    Side effects: appends one entry per call to each list of the module-level
    ``survey_info`` dict (abstract titles, durations, start/end dates and
    block timers). The columns named in the module-level ``dropcols`` list
    are removed.

    Parameters
    ----------
    survey : pandas.DataFrame
        Survey responses; the abstract titles are read from the first row of
        the "QID..._TEXT" columns.

    Returns
    -------
    pandas.DataFrame
        One row per retained response, holding the rater id (column "rater")
        and the remaining question columns.
    """
    # store titles for the abstracts: the title sits in the first row of each
    # rating column, between "as well. - " and four line breaks
    title_pattern = r"as well. - (.*?)\n\n\n\n"
    survey_info["titles"].append(
        [
            survey[col].str.extract(title_pattern).iloc[0, 0]
            for col in survey.columns
            if (("TEXT" in col) and ("QID" in col))
        ]
    )

    # only finished surveys
    survey = survey.loc[survey.Progress == "100"]

    # drop unneeded columns across surveys
    survey = survey.drop(columns=dropcols)

    # remove the timer columns except for total block time
    timer_cols = [
        i for i in survey.columns if (("First" in i) or ("Last" in i) or ("Count" in i))
    ]
    survey = survey.drop(columns=timer_cols)

    # remove test surveys from research team
    # QID0 is reduced to its letters before matching, so the names have to be
    # reduced in the same way; otherwise a name containing a hyphen
    # ("pei-shan") could never match.
    researchers = ["stijn", "lin", "leander", "olivier", "pei-shan"]
    researcher_pattern = "|".join(
        "".join(filter(str.isalpha, name)) for name in researchers
    )
    survey = survey[
        ~survey["QID0"]
        .str.replace("[^a-zA-Z]", "", regex=True)
        .str.contains(researcher_pattern, case=False, na=False)
    ]

    # store and remove duration
    survey_info["durations"].append(survey[["QID0", "Duration (in seconds)"]])
    survey = survey.drop(columns=["Duration (in seconds)"])

    # store and remove dates
    survey_info["dates"].append(survey[["QID0", "StartDate", "EndDate"]])
    survey = survey.drop(columns=["StartDate", "EndDate"])

    # store and remove block times
    block_times = [i for i in survey.columns if "QT" in i] + ["QID0"]
    survey_info["block_times"].append(survey[block_times])
    survey = survey[[i for i in survey.columns if "QT" not in i]]

    # rename QID0 column to ID
    survey = survey.rename(columns={"QID0": "rater"})

    return survey


# clean the three surveys in order, so that the entries collected in
# survey_info follow the order set 1, set 2, set 3
df1, df2, df3 = [clean_surveys(raw_survey) for raw_survey in (df1, df2, df3)]

In [13]:
# There was one rater thrown out because of inconsistencies
# we remove them
# (.copy() gives df1 its own data, so the correction at the end of this cell
# does not write into a slice of the cleaned survey)
df1 = df1.loc[df1.rater != "211"].copy()

# check if all surveys have the same number of raters
assert (
    len(df1) == len(df2) == len(df3)
), "not all surveys have the same number of raters"

# check if all binary data is indeed binary
# The loop variables must not be called `set`: at the top level of a notebook
# that would replace the builtin `set` for every cell that runs afterwards.
for set_idx, survey_df in enumerate([df1, df2, df3]):
    binary_cols = [col for col in survey_df.columns if "TEXT" in col]
    binary_df = survey_df[binary_cols].copy()
    invalid_values_mask = binary_df.map(lambda x: x not in ["0", "1"])
    invalid_locations = np.where(invalid_values_mask)
    # report every offending cell by its position in the frame
    for row, col in zip(invalid_locations[0], invalid_locations[1]):
        print(
            f"Invalid value found in set {set_idx} at Row: {row}, Column: '{binary_df.columns[col]}' -> Value: {binary_df.iloc[row, col]}"
        )


# fix the value that is problematic
# we assume that the 2 is supposed to be a 1
# (row 17 / QID5_1_TEXT of set 0 is the location reported by the check above)

df1.loc[df1.index[17], "QID5_1_TEXT"] = "1"

Invalid value found in set 0 at Row: 17, Column: 'QID5_1_TEXT' -> Value: 2


## Turn into DF with ratings as rows

In [14]:
# make a df with the ratings as rows, and as columns the rater, block, field,
# rank score (ordinal), binary score and study group
def transform_survey_data(df, set):
    """Reshape one cleaned survey from wide (one row per rater) to long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a "rater" column and, for every block 1-10 and position 1-5,
        the columns "QID{block}_{position}" (rank) and
        "QID{block}_{position}_TEXT" (binary score).
    set : int or str
        Identifier of the survey; it becomes the suffix of the block name.

    Returns
    -------
    pandas.DataFrame
        One row per rater and abstract with the columns "rater", "block",
        "field", "ordinal", "binary" and "group". Rows are ordered by block,
        then position, then by the row order of ``df``.
    """
    pieces = []

    # Iterate through each QID set
    for qid in range(1, 11):  # each set has 10 blocks, we remove the burnin
        for position in range(1, 6):  # five papers per block
            # Extract relevant columns for the current position and task
            rank_col = f"QID{qid}_{position}"
            binary_col = f"{rank_col}_TEXT"
            pieces.append(
                df[["rater", rank_col, binary_col]]
                .rename(columns={rank_col: "ordinal", binary_col: "binary"})
                .assign(block=f"QID{qid}_{set}", field=position)
            )

    # Concatenate once: extending a frame inside the loop copies all earlier
    # rows on every iteration, and starting from an empty frame makes the
    # resulting dtypes depend on the pandas version.
    df_final = pd.concat(pieces, ignore_index=True)[
        ["rater", "block", "field", "ordinal", "binary"]
    ]

    # add a column to indicate that these data are not pilot data
    df_final["group"] = "main"

    return df_final


# reshape each survey; the second argument is the number of the set (1-3)
df1, df2, df3 = [
    transform_survey_data(wide_survey, set_number)
    for set_number, wide_survey in enumerate((df1, df2, df3), start=1)
]

In [15]:
# add the titles of the abstracts to each row

n_raters = len(df1["rater"].unique())

# one row of stored titles per survey, in the order the surveys were cleaned
titles_per_set = np.array(survey_info["titles"])

# every title is repeated n_raters times in a row (np.repeat, not np.tile)
for set_idx, ratings in enumerate((df1, df2, df3)):
    ratings["title"] = np.repeat(titles_per_set[set_idx], n_raters)

# Combine with other data

In [16]:
# first, stack the three sets of study 3 into a single frame with a fresh index
study3_sets = [df1, df2, df3]
main_study = pd.concat(study3_sets, ignore_index=True)

# add rater data
def map_values(row):
    """Copy the attributes stored for a rater into the row and return the row.

    The rater id is read from ``row["rater"]`` and looked up in the
    module-level ``rater_data`` dict. Every key/value pair found there is
    written into ``row``; a rater without an entry leaves the row unchanged.
    """
    attributes = rater_data.get(row["rater"], {})
    for column in attributes:
        row[column] = attributes[column]
    return row

# attach the rater attributes to every rating
main_study = main_study.apply(map_values, axis=1)

# add main to rater column and block column
# this way, we avoid confusion with data from studies 1 & 2
for id_col in ("block", "rater"):
    main_study[id_col] = main_study[id_col].map(lambda value: f"{value}_main")

# give the abstracts unique names
# use the field position and block
# NOTE(review): `field` appears to still hold the numeric position here (the
# field names are only assigned in a later cell), so this mapping looks like
# it has no effect - confirm.
field_digits = {
    "History": "1",
    "Philosophy": "2",
    "Religion": "3",
    "Linguistics": "4",
    "Literature": "5",
}
abstract_suffix = main_study["field"].replace(field_digits).astype("string")
main_study["abstract"] = main_study["block"] + "_" + abstract_suffix

# make ordinal and binary integer
main_study = main_study.astype({"binary": "int32", "ordinal": "int32"})

# replace 'Theology' by 'Religion' for consistency
main_study["rater_disc"] = main_study["rater_disc"].replace({"Theology": "Religion"})

main_study.head(3)

Unnamed: 0,rater,block,field,ordinal,binary,group,title,rater_disc,Nationality,Sex,WEIRD,Field_group,abstract
0,151_main,QID1_1_main,1,3,0,main,Does China Matter? Taiwan's Successful Bid to ...,History,Argentinian,M,not WEIRD,Humanities,QID1_1_main_1
1,111_main,QID1_1_main,1,4,1,main,Does China Matter? Taiwan's Successful Bid to ...,Philosophy,Serbian,M,not WEIRD,Humanities,QID1_1_main_1
2,121_main,QID1_1_main,1,1,1,main,Does China Matter? Taiwan's Successful Bid to ...,Linguistics,Turkish,M,not WEIRD,Humanities,QID1_1_main_1


In [17]:
# add documents
# one problem in the merging: one of the titles had a character that had been changed
# was in the title: "God as a true Elohim and Savior of the Poor - Psalm 82 in the Corpus of the Psalms of Asaph"
# replace with the string from the survey
# (na=False: a document without a title counts as "no match" instead of
# producing a missing value in the boolean mask)

documents.loc[documents.title.str.contains("Psalm 82", na=False), "title"] = (
    "God as a true Elohim and Savior of the Poor - Psalm 82 in the Corpus of the Psalms of Asaph"
)

# right merge: keep every rating, in its current order, and attach the
# document data that has the same title
main_study = documents.merge(main_study, on="title", how="right")

# `original_index` is filled for every document, so a missing value marks a
# rating whose title was not found. Such rows would have no abstract text and
# break the length computation further down, so stop here with a clear message.
unmatched_titles = main_study.loc[main_study["original_index"].isna(), "title"].unique()
assert len(unmatched_titles) == 0, (
    f"titles without a matching document: {list(unmatched_titles)}"
)

In [18]:
# raters were kicked if they had too many inconsistencies. If they had 1, we allowed them to fix it.
# here we implement the fixes

data = {}  # not used in the rest of this cell

# 7 mistakes in total, 1 in each of the following sets
fixes = ["111", "111_set3", "121", "131", "151", "212", "226_set3"]

fix_dir = r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\raters_fix\\"
fix_columns = ["index_col", "title", "abstract_text", "set", "ordinal", "binary"]

for fix_name in fixes:
    # read the corrected ratings and give the columns their names
    corrected = pd.read_csv(
        fix_dir + fix_name + "_corrected.csv",
        dtype={"ranks": "int32", "binary": "int32"},
        sep=";",
    )
    corrected.columns = fix_columns

    # "<rater>_set<k>" is a fix for set k; a bare rater code is a fix for set 1
    if "_" in fix_name:
        rater = fix_name.split("_")[0]
        block = fix_name.split("set")[1]
    else:
        rater = fix_name
        block = "1"
        # the set-1 files also hold rows labelled set_a / set_b / set_c,
        # which are left out
        extra_sets = corrected["set"].isin(["set_a", "set_b", "set_c"])
        corrected = corrected.loc[~extra_sets]

    corrected = corrected[["title", "ordinal", "binary"]]

    # the ratings of this rater in this set
    is_rater = main_study["rater"] == rater + "_main"
    in_set = main_study["block"].str.contains("_" + block + "_main")
    condition = is_rater & in_set

    # look up the corrected values by title, keeping the row order of main_study
    # (the originally recorded values get the suffix "_old")
    merged_df = main_study.loc[condition].merge(
        corrected, on="title", how="left", suffixes=("_old", "")
    )

    # replace the originally recorded value by the fixed value
    for col in ("binary", "ordinal"):
        main_study.loc[condition, col] = merged_df[col].to_numpy()

In [19]:
# In the survey the most valuable document got rank 1 and the least valuable
# rank 5. Reverse this, so that a higher score means 'more valuable'.
main_study["ordinal"] = main_study["ordinal"].astype("int32")
reversed_rank = {rank: 6 - rank for rank in range(1, 6)}  # 1 -> 5, ..., 5 -> 1

# reverse, and make the scale start at 0 rather than 1
main_study["ordinal"] = main_study["ordinal"].replace(reversed_rank) - 1

In [20]:
# add field strings: the position of an abstract within its block is its field
field_codes = dict(
    enumerate(
        ["History", "Philosophy", "Religion", "Linguistics", "Literature"], start=1
    )
)
main_study["field"] = main_study["field"].replace(field_codes)

# add chauvinism, i.e. 1 if the rater's field is the same as the abstract's field, else 0
same_field = main_study["field"].eq(main_study["rater_disc"])
main_study["chauvinism"] = same_field.astype("int64")

# make column names lower case
main_study = main_study.rename(columns=str.lower)

main_study.head(3)

Unnamed: 0,original_index,doi,title,abstract_text,pubyear,doctype,rater,block,field,ordinal,binary,group,rater_disc,nationality,sex,weird,field_group,abstract,chauvinism
0,0,10.1080/09523367.2015.1022721,Does China Matter? Taiwan's Successful Bid to ...,This study seeks to identify and explain the k...,2015,Article,151_main,QID1_1_main,History,2,0,main,History,Argentinian,M,not WEIRD,Humanities,QID1_1_main_1,1
1,0,10.1080/09523367.2015.1022721,Does China Matter? Taiwan's Successful Bid to ...,This study seeks to identify and explain the k...,2015,Article,111_main,QID1_1_main,History,1,1,main,Philosophy,Serbian,M,not WEIRD,Humanities,QID1_1_main_1,0
2,0,10.1080/09523367.2015.1022721,Does China Matter? Taiwan's Successful Bid to ...,This study seeks to identify and explain the k...,2015,Article,121_main,QID1_1_main,History,4,1,main,Linguistics,Turkish,M,not WEIRD,Humanities,QID1_1_main_1,0


In [21]:
# add topic codes: look them up by doi and keep every rating
# (a column name that exists on both sides gets "_from_df2" for the topic version)
main_study = main_study.merge(
    topics_study3, on="doi", how="left", suffixes=("", "_from_df2")
)

In [19]:
# eventually add altmetrics
# data has been downloaded, but we wait for preregistration

In [23]:
# stack study 3 (processed up to here) on top of studies 1 and 2
all_studies = [main_study, df]
combined_df = pd.concat(all_studies, ignore_index=True, sort=False)

# make all columns lower case
combined_df = combined_df.rename(columns=str.lower)

print(f"Total number of document ratings: {len(combined_df)}")
combined_df.head(3)

Total number of document ratings: 8820


Unnamed: 0,original_index,doi,title,abstract_text,pubyear,doctype,rater,block,field,ordinal,binary,group,rater_disc,nationality,sex,weird,field_group,abstract,chauvinism,present,intolerance,ethics,empirical,environment,fiction,education,wellbeing,deliverable,capture,citation,socialmedia,mention,outlier,usage,abstract_length,abstract_wordcount
0,0.0,10.1080/09523367.2015.1022721,Does China Matter? Taiwan's Successful Bid to ...,This study seeks to identify and explain the k...,2015.0,Article,151_main,QID1_1_main,History,2.0,0.0,main,History,Argentinian,M,not WEIRD,Humanities,QID1_1_main_1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,
1,0.0,10.1080/09523367.2015.1022721,Does China Matter? Taiwan's Successful Bid to ...,This study seeks to identify and explain the k...,2015.0,Article,111_main,QID1_1_main,History,1.0,1.0,main,Philosophy,Serbian,M,not WEIRD,Humanities,QID1_1_main_1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,
2,0.0,10.1080/09523367.2015.1022721,Does China Matter? Taiwan's Successful Bid to ...,This study seeks to identify and explain the k...,2015.0,Article,121_main,QID1_1_main,History,4.0,1.0,main,Linguistics,Turkish,M,not WEIRD,Humanities,QID1_1_main_1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,


In [24]:
# homogenize field term usage (applied to every cell of the frame)
disc_names = {
    "HIST": "History",
    "PHIL": "Philosophy",
    "LING": "Linguistics",
    "LIT": "Literature",
    "REL": "Religion",
    "Theology": "Religion",
}
combined_df = combined_df.replace(disc_names)

# homogenize rater names: as strings, cut off at the first "."
# (e.g. a float-formatted code such as "521.0")
combined_df["rater"] = (
    combined_df["rater"].astype("str").map(lambda code: code.split(".")[0])
)

# get abstract length in characters, its normalized version, and the word count
combined_df["abstract_length"] = combined_df["abstract_text"].map(len)

lengths = combined_df["abstract_length"].to_numpy()
combined_df["abstract_length_norm"] = (lengths - lengths.mean()) / np.std(lengths)

combined_df["abstract_wordcount"] = combined_df["abstract_text"].map(
    lambda text: len(text.split(" "))
)

# add columns indicating whether the rater was from humanities
is_humanities = combined_df["field_group"] == "Humanities"
combined_df["humanities"] = np.where(is_humanities, 1, 0)

# add a rater disc where all non-hum raters are described as that
combined_df["new_rater_disc"] = combined_df["rater_disc"].where(
    is_humanities, "non-humanities"
)


# turn columns that can into (nullable) int, going through float first
int_cols = [
    "ordinal",
    "binary",
    "pubyear",
    "chauvinism",
    "present",
    "intolerance",
    "ethics",
    "empirical",
    "environment",
    "fiction",
    "education",
    "wellbeing",
    "deliverable",
]
combined_df[int_cols] = combined_df[int_cols].astype("float64").astype("Int64")

# turn columns that can into categorical
cat_cols = ["sex", "weird", "rater", "field", "new_rater_disc", "abstract", "doctype"]
combined_df[cat_cols] = combined_df[cat_cols].astype("category")

# create df for the main study
# remove unused categories in the categorical columns
df_main = combined_df.loc[combined_df["group"] == "main"].copy()
for cat_col in cat_cols:
    df_main[cat_col] = df_main[cat_col].cat.remove_unused_categories()

# make ranks start at 1 again
# NOTE(review): df_main was copied before this line, so its ordinal values are
# not shifted and differ by 1 from those in combined_df - confirm that this
# is intended.
combined_df["ordinal"] = combined_df["ordinal"] + 1

In [27]:
# save data for use in analysis
analysis_exports = (
    (
        combined_df,
        r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\combined_data.csv",
    ),
    (
        df_main,
        r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\df_main.csv",
    ),
)

for frame, target_path in analysis_exports:
    frame.to_csv(target_path)

In [28]:
# remove altmetrics data before uploading the processed data
altmetric_cols = ["capture", "citation", "socialmedia", "usage", "mention", "outlier"]

# errors="ignore": a listed column that is absent is simply skipped
combined_df2 = combined_df.drop(columns=altmetric_cols, errors="ignore")

combined_df2.to_csv(
    r"C:\Users\conix\Dropbox\aWriting\humanities impact\follow up study\data_upload\combined_data_for_upload.csv",
    sep=";",
)