In [154]:
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
import os
import numpy as np
import datetime as dt
from zoneinfo import ZoneInfo
import json

_ = load_dotenv()

In [213]:
# Read the meta files.
def get_meta(feeds):
    meta = {}

    for feed in feeds:
        meta[feed] = json.load(open(f"meta/{feed}.json"))

    return meta


snapshots = [
    "749fc4f5-ce90-4a98-b5e7-39db62f1632b",
    "a602674f-7bc6-47e7-919b-bcde0dcf5f05",
    "1f9d2527-6430-4c8e-87cf-37a3e14323be",
    "c9dfbf3e-8655-4059-a1ac-fdb73fd88764",
    "cd299601-0ec4-49ab-a410-64489a867dea",
]

meta = get_meta(snapshots)

In [11]:
uri = f"mongodb+srv://{os.getenv('MONGO_USER')}:{os.getenv('MONGO_SECRET')}@responses.vpbn1v3.mongodb.net/?retryWrites=true&w=majority&appName=responses"
client = MongoClient(uri)

In [None]:
responses = client["trending-feeds"]["responses-prod"].find({}).to_list()
sessions = client["trending-feeds"]["session-starts-prod"].find({}).to_list()

In [145]:
def to_central_time(timestamp_str: str) -> dt.datetime:
    # Ensure the timestamp is timezone-aware
    utc_time = dt.datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
    return utc_time.astimezone(ZoneInfo("America/Chicago")).strftime(
        "%Y-%m-%d %I:%M %p"
    )

In [147]:
def convert_seconds(duration):
    minutes, seconds = divmod(duration, 60)
    return f"{round(minutes)}m {round(seconds)}s"


# Get the demographics from each participant.
def get_demographics(responses):

    data = []

    for response in responses:
        data.append(
            {
                "pid": response["prolific"]["PROLIFIC_PID"],
                "age": response["exitAnswers"]["age"],
                "gender": response["exitAnswers"]["gender"],
                "education": response["exitAnswers"]["education"],
                "duration": response["totalDuration"],
                "duration_formatted": convert_seconds(response["totalDuration"]),
                "start_time": to_central_time(response["consentTimestamp"]),
            }
        )

    return pd.DataFrame(data).set_index("pid")


demographics_df = get_demographics(responses)
display(demographics_df.drop(columns=["duration", "duration_formatted"]))

print(f"Duration Statistics (n={len(demographics_df):,}):")
print(
    demographics_df["duration"]
    .describe()
    .apply(convert_seconds)
    .drop("count")
    .to_string(),
    "\n",
)


print(
    f"Men: {demographics_df.query('gender == \"Man\"').shape[0] / len(demographics_df):.2%}, n={demographics_df.query('gender == \"Man\"').shape[0]}"
)
print(
    f"Women: {demographics_df.query('gender == \"Woman\"').shape[0] / len(demographics_df):.2%}, n={demographics_df.query('gender == \"Woman\"').shape[0]}"
)

# Print the education distribution.
print(
    "\n"
    + demographics_df["education"]
    .value_counts()
    .reindex(
        [
            "Less than high school",
            "High school graduate or equivalent (e.g., GED)",
            "Some college, no degree",
            "Associate degree (e.g., AA, AS)",
            "Bachelor's degree (e.g., BA, BS)",
            "Master's degree (e.g., MA, MS, MBA)",
            "Graduate degree (e.g., PhD, MD, JD)",
            "Prefer not to say",
        ]
    )
    .fillna(0)
    .astype(int)
    .rename(None)
    .rename_axis(None)
    .to_string()
)

# Print the age distribution.
print(
    "\n"
    + demographics_df["age"]
    .value_counts()
    .reindex(
        [
            "18-24",
            "25-34",
            "35-44",
            "45-54",
            "55-64",
            "65 or older",
            "Prefer not to say",
        ]
    )
    .fillna(0)
    .astype(int)
    .rename(None)
    .rename_axis(None)
    .to_string()
)

Unnamed: 0_level_0,age,gender,education,start_time
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
66907e71d8f7d4ee88cc59ce,25-34,Man,"Bachelor's degree (e.g., BA, BS)",2025-07-30 04:13 PM
6720f4ced2a11d4e702566fa,45-54,Woman,"Graduate degree (e.g., PhD, MD, JD)",2025-07-30 04:12 PM
656a1cf88a3cf09f538c6f25,55-64,Man,"Bachelor's degree (e.g., BA, BS)",2025-07-30 04:11 PM
676fa93a6e1840db7ea43423,25-34,Woman,"Some college, no degree",2025-07-30 04:18 PM
6779c674931f49f5fe68afe3,25-34,Man,"High school graduate or equivalent (e.g., GED)",2025-07-30 04:18 PM
56e6a66af6ed900006a5867c,45-54,Man,"High school graduate or equivalent (e.g., GED)",2025-07-30 04:16 PM
67884b1aa6c55d9dd2388778,35-44,Man,"Some college, no degree",2025-07-30 04:18 PM
6455616c0fe23ecca70ce1b9,65 or older,Woman,"Some college, no degree",2025-07-30 04:15 PM
5632410fca59f6000b795a70,45-54,Man,"Some college, no degree",2025-07-30 04:41 PM
653703627539f3a8b2ed4af3,45-54,Man,"Master's degree (e.g., MA, MS, MBA)",2025-07-30 05:59 PM


Duration Statistics (n=10):
mean     12m 7s
std      4m 13s
min      5m 36s
25%     10m 28s
50%     12m 12s
75%     13m 36s
max     20m 39s 

Men: 70.00%, n=7
Women: 30.00%, n=3

Less than high school                             0
High school graduate or equivalent (e.g., GED)    2
Some college, no degree                           4
Associate degree (e.g., AA, AS)                   0
Bachelor's degree (e.g., BA, BS)                  2
Master's degree (e.g., MA, MS, MBA)               1
Graduate degree (e.g., PhD, MD, JD)               1
Prefer not to say                                 0

18-24                0
25-34                3
35-44                1
45-54                4
55-64                1
65 or older          1
Prefer not to say    0


In [None]:
# Looking at quality check.
def get_quality_check(responses):
    data = []

    for response in responses:
        data.append(
            {
                "pid": response["prolific"]["PROLIFIC_PID"],
                "quality_check": response["exitAnswers"]["postLikelihood"],
            }
        )

    return pd.DataFrame(data).set_index("pid")


quality_check_df = get_quality_check(responses)
print(
    quality_check_df["quality_check"]
    .value_counts()
    .reindex(range(1, 6), fill_value=0)
    .rename(None)
    .to_string()
)

quality_check
1    0
2    2
3    3
4    2
5    3


In [None]:
# Get time splits.
def get_time_splits(responses):
    data = []

    for response in responses:

        feeds = response["feeds"]

        data.append(
            {
                "pid": response["prolific"]["PROLIFIC_PID"],
                "total": response["totalDuration"],
                "pre_experiment": response["screenerDuration"],
                "post_experiment": response["exitDuration"],
                "feed_1_selection": response["answers"][feeds[0]]["selectionDuration"],
                "feed_2_selection": response["answers"][feeds[1]]["selectionDuration"],
                "feed_3_selection": response["answers"][feeds[2]]["selectionDuration"],
                "feed_1_rating": response["answers"][feeds[0]]["ratingDuration"],
                "feed_2_rating": response["answers"][feeds[1]]["ratingDuration"],
                "feed_3_rating": response["answers"][feeds[2]]["ratingDuration"],
            }
        )

    return pd.DataFrame(data).set_index("pid")


time_splits_df = get_time_splits(responses)
display(time_splits_df.describe().drop(["count"]).map(convert_seconds).T)

Unnamed: 0,mean,std,min,25%,50%,75%,max
total,12m 7s,4m 13s,5m 36s,10m 28s,12m 12s,13m 36s,20m 39s
pre_experiment,2m 35s,1m 31s,0m 44s,1m 39s,2m 4s,3m 45s,5m 10s
post_experiment,1m 8s,0m 32s,0m 39s,0m 44s,0m 60s,1m 14s,2m 4s
feed_1_selection,1m 6s,0m 32s,0m 21s,0m 48s,0m 59s,1m 15s,2m 6s
feed_2_selection,0m 55s,0m 25s,0m 22s,0m 38s,0m 54s,1m 6s,1m 38s
feed_3_selection,0m 60s,0m 30s,0m 14s,0m 39s,0m 55s,1m 19s,1m 51s
feed_1_rating,2m 7s,0m 45s,1m 5s,1m 38s,2m 7s,2m 30s,3m 41s
feed_2_rating,1m 31s,0m 44s,0m 41s,1m 2s,1m 21s,1m 50s,3m 8s
feed_3_rating,1m 5s,0m 25s,0m 36s,0m 50s,0m 60s,1m 12s,2m 4s


In [None]:
display(
    pd.DataFrame(
        np.concatenate(
            (
                time_splits_df["feed_1_selection"].values,
                time_splits_df["feed_2_selection"].values,
                time_splits_df["feed_3_selection"].values,
            )
        ),
        columns=["selection_duration"],
    )
    .describe()
    .map(convert_seconds)
    .drop("count")
)

display(
    pd.DataFrame(
        np.concatenate(
            (
                time_splits_df["feed_1_rating"].values,
                time_splits_df["feed_2_rating"].values,
                time_splits_df["feed_3_rating"].values,
            )
        ),
        columns=["rating_duration"],
    )
    .describe()
    .map(convert_seconds)
    .drop("count")
)

print(f"{len(time_splits_df):,} observations.")

Unnamed: 0,selection_duration
mean,1m 0s
std,0m 29s
min,0m 14s
25%,0m 41s
50%,0m 56s
75%,1m 15s
max,2m 6s


Unnamed: 0,rating_duration
mean,1m 35s
std,0m 46s
min,0m 36s
25%,1m 2s
50%,1m 21s
75%,2m 5s
max,3m 41s


10 observations.


In [214]:
# Experimental conditions assignments.
# NOTE: Doesn't include the multiple choice randomization order.
def get_experimental_conditions(sessions):
    data = []

    for session in sessions:

        # Check whether the participant complete the study.
        if session["PROLIFIC_PID"] not in demographics_df.index:
            continue

        data.append(
            {
                "pid": session["PROLIFIC_PID"],
                "snapshot_1": session["feeds"][0]["feedUUID"],
                "snapshot_2": session["feeds"][1]["feedUUID"],
                "snapshot_3": session["feeds"][2]["feedUUID"],
                "rotation_1": session["feeds"][0]["rotation"],
                "rotation_2": session["feeds"][1]["rotation"],
                "rotation_3": session["feeds"][2]["rotation"],
                "shown_proof": session["shown_proof"],
            }
        )

    return pd.DataFrame(data).set_index("pid")


experimental_conditions_df = get_experimental_conditions(sessions)
display(experimental_conditions_df)

# What percentage of participants saw social proof?
display(experimental_conditions_df["shown_proof"].value_counts().reindex([True, False]))

Unnamed: 0_level_0,snapshot_1,snapshot_2,snapshot_3,rotation_1,rotation_2,rotation_3,shown_proof
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
656a1cf88a3cf09f538c6f25,cd299601-0ec4-49ab-a410-64489a867dea,c9dfbf3e-8655-4059-a1ac-fdb73fd88764,1f9d2527-6430-4c8e-87cf-37a3e14323be,2,0,1,False
6720f4ced2a11d4e702566fa,1f9d2527-6430-4c8e-87cf-37a3e14323be,cd299601-0ec4-49ab-a410-64489a867dea,749fc4f5-ce90-4a98-b5e7-39db62f1632b,2,9,1,False
66907e71d8f7d4ee88cc59ce,1f9d2527-6430-4c8e-87cf-37a3e14323be,cd299601-0ec4-49ab-a410-64489a867dea,749fc4f5-ce90-4a98-b5e7-39db62f1632b,4,1,8,False
6455616c0fe23ecca70ce1b9,c9dfbf3e-8655-4059-a1ac-fdb73fd88764,1f9d2527-6430-4c8e-87cf-37a3e14323be,a602674f-7bc6-47e7-919b-bcde0dcf5f05,8,3,3,False
56e6a66af6ed900006a5867c,749fc4f5-ce90-4a98-b5e7-39db62f1632b,a602674f-7bc6-47e7-919b-bcde0dcf5f05,1f9d2527-6430-4c8e-87cf-37a3e14323be,3,4,1,True
676fa93a6e1840db7ea43423,a602674f-7bc6-47e7-919b-bcde0dcf5f05,749fc4f5-ce90-4a98-b5e7-39db62f1632b,cd299601-0ec4-49ab-a410-64489a867dea,1,7,8,True
6779c674931f49f5fe68afe3,a602674f-7bc6-47e7-919b-bcde0dcf5f05,749fc4f5-ce90-4a98-b5e7-39db62f1632b,1f9d2527-6430-4c8e-87cf-37a3e14323be,0,3,2,False
67884b1aa6c55d9dd2388778,c9dfbf3e-8655-4059-a1ac-fdb73fd88764,749fc4f5-ce90-4a98-b5e7-39db62f1632b,1f9d2527-6430-4c8e-87cf-37a3e14323be,7,4,4,False
5632410fca59f6000b795a70,749fc4f5-ce90-4a98-b5e7-39db62f1632b,1f9d2527-6430-4c8e-87cf-37a3e14323be,c9dfbf3e-8655-4059-a1ac-fdb73fd88764,0,7,2,False
653703627539f3a8b2ed4af3,a602674f-7bc6-47e7-919b-bcde0dcf5f05,1f9d2527-6430-4c8e-87cf-37a3e14323be,749fc4f5-ce90-4a98-b5e7-39db62f1632b,9,2,5,True


shown_proof
True     3
False    7
Name: count, dtype: int64

In [236]:
# Get the matrix of experimental condition assignments.
def get_condition_matrix(experimental_conditions_df):
    matrix = pd.DataFrame(
        0,
        index=snapshots,
        columns=range(0, 10),
    )

    matrix_proof = pd.DataFrame(
        0,
        index=snapshots,
        columns=range(0, 10),
    )

    matrix_no_proof = pd.DataFrame(
        0,
        index=snapshots,
        columns=range(0, 10),
    )

    for _, row in experimental_conditions_df.iterrows():
        matrix.loc[row["snapshot_1"], row["rotation_1"]] += 1
        if row["shown_proof"]:
            matrix_proof.loc[row["snapshot_1"], row["rotation_1"]] += 1
        else:
            matrix_no_proof.loc[row["snapshot_1"], row["rotation_1"]] += 1

        matrix.loc[row["snapshot_2"], row["rotation_2"]] += 1
        if row["shown_proof"]:
            matrix_proof.loc[row["snapshot_2"], row["rotation_2"]] += 1
        else:
            matrix_no_proof.loc[row["snapshot_2"], row["rotation_2"]] += 1

        matrix.loc[row["snapshot_3"], row["rotation_3"]] += 1
        if row["shown_proof"]:
            matrix_proof.loc[row["snapshot_3"], row["rotation_3"]] += 1
        else:
            matrix_no_proof.loc[row["snapshot_3"], row["rotation_3"]] += 1

    return matrix, matrix_proof, matrix_no_proof


condition_matrix, condition_matrix_proof, condition_matrix_no_proof = (
    get_condition_matrix(experimental_conditions_df)
)
print("Both proof and no proof condition matrices are shown below:")
display(condition_matrix)

print("Condition matrix with proof shown:")
display(condition_matrix_proof)

print("Condition matrix without proof shown:")
display(condition_matrix_no_proof)

Both proof and no proof condition matrices are shown below:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
749fc4f5-ce90-4a98-b5e7-39db62f1632b,1,1,0,2,1,1,0,1,1,0
a602674f-7bc6-47e7-919b-bcde0dcf5f05,1,1,0,1,1,0,0,0,0,1
1f9d2527-6430-4c8e-87cf-37a3e14323be,0,2,3,1,2,0,0,1,0,0
c9dfbf3e-8655-4059-a1ac-fdb73fd88764,1,0,1,0,0,0,0,1,1,0
cd299601-0ec4-49ab-a410-64489a867dea,0,1,1,0,0,0,0,0,1,1


Condition matrix with proof shown:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
749fc4f5-ce90-4a98-b5e7-39db62f1632b,0,0,0,1,0,1,0,1,0,0
a602674f-7bc6-47e7-919b-bcde0dcf5f05,0,1,0,0,1,0,0,0,0,1
1f9d2527-6430-4c8e-87cf-37a3e14323be,0,1,1,0,0,0,0,0,0,0
c9dfbf3e-8655-4059-a1ac-fdb73fd88764,0,0,0,0,0,0,0,0,0,0
cd299601-0ec4-49ab-a410-64489a867dea,0,0,0,0,0,0,0,0,1,0


Condition matrix without proof shown:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
749fc4f5-ce90-4a98-b5e7-39db62f1632b,1,1,0,1,1,0,0,0,1,0
a602674f-7bc6-47e7-919b-bcde0dcf5f05,1,0,0,1,0,0,0,0,0,0
1f9d2527-6430-4c8e-87cf-37a3e14323be,0,1,2,1,2,0,0,1,0,0
c9dfbf3e-8655-4059-a1ac-fdb73fd88764,1,0,1,0,0,0,0,1,1,0
cd299601-0ec4-49ab-a410-64489a867dea,0,1,1,0,0,0,0,0,0,1


In [None]:
def get_rank(snapshot: str, post: str, rotation: int, shown_proof: bool) -> int:
    """
    This is required because we need to look up the particular feed UUID
    which depends on the snapshot, rotation, and whether proof is shown.

    Using that information, we can get the list of posts and search for
    the index of the post in that list.
    """
    return (
        meta[snapshot]["feeds"][str(rotation)]["proof" if shown_proof else "noProof"][
            "posts"
        ].index(post)
        + 1
    )


def get_feed(snapshot: str, rotation: int, shown_proof: bool) -> int:
    """
    Each snapshot consists of multiple feeds, each with a unique UUID,
    that uniquely identify the snapshot, the rotation, and whether proof is shown.
    """
    return meta[snapshot]["feeds"][str(rotation)][
        "proof" if shown_proof else "noProof"
    ]["feedUUID"]

In [None]:
# Get data frame that contains all the selections they made and where the post was in the feed.
def get_selections(responses):

    assert (
        experimental_conditions_df is not None
    ), "Experimental conditions must be loaded first."

    data = []

    # Go through each response.
    for response in responses:

        # Get the snapshots that they (PID) saw.
        snapshots = response["feeds"]
        pid = response["prolific"]["PROLIFIC_PID"]
        shown_proof = experimental_conditions_df.loc[pid, "shown_proof"]

        # Go through each snapshot, what rotation they saw, and the posts on that permutation.
        # snapshot, shown_proof, rotation --> feed UUID
        for i, snapshot in enumerate(snapshots):
            rotation = experimental_conditions_df.loc[pid, f"rotation_{i+1}"]

            posts = meta[snapshot]["feeds"][str(rotation)][
                "proof" if shown_proof else "noProof"
            ]["posts"]
            selected_posts = response["answers"][snapshot]["selectedPosts"]

            # Add each post to the data frame, highlighting whether it was selected.
            for post in posts:

                data.append(
                    {
                        "pid": pid,
                        "rotation": rotation,
                        "shown_proof": shown_proof,
                        "snapshot": snapshot,
                        "feed": get_feed(snapshot, rotation, shown_proof),
                        "post": post,
                        "rank": get_rank(snapshot, post, rotation, shown_proof),
                        "selected": post in selected_posts,
                    }
                )

    return pd.DataFrame(data).set_index("pid")


selections_df = get_selections(responses)
display(selections_df)

Unnamed: 0_level_0,rotation,shown_proof,snapshot,feed,post,rank,selected
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,48d21e8f-1da3-4404-934d-25db6e7de1ca,1,False
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,3054b505-f3c9-4118-8bee-74d8dda4fa7e,2,False
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,00e338e5-cb27-4a69-a84e-b8d84a2af176,3,True
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,5c9aab83-d420-4cca-9e90-8c4758ce3962,4,True
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,6cdddddc-f259-498e-aec0-432c5afab5da,5,False
...,...,...,...,...,...,...,...
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,7263288c-4183-44e5-b51d-95e58796ae88,6,False
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,83b9294f-7e60-4784-972c-a423aa971685,7,False
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,29db03c0-88aa-475b-b878-2cbfd423357a,8,False
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,641808f2-0a8f-4994-9d58-451524c112ae,9,True


In [237]:
# Where did people select posts?
print("Where selections took place:")
display(
    selections_df.query("selected")["rank"]
    .value_counts()
    .sort_index()
    .to_frame()
    .rename(columns={"count": "both"})
    .join(
        selections_df.query("selected and shown_proof")["rank"]
        .value_counts()
        .reindex(range(1, 11), fill_value=0)
        .to_frame()
        .rename(columns={"count": "with_proof"})
    )
    .join(
        selections_df.query("selected and not shown_proof")["rank"]
        .value_counts()
        .reindex(range(1, 11), fill_value=0)
        .to_frame()
        .rename(columns={"count": "without_proof"})
    )
)

Where selections took place:


Unnamed: 0_level_0,both,with_proof,without_proof
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5,1,4
2,8,2,6
3,8,2,6
4,14,5,9
5,8,3,5
6,3,0,3
7,5,2,3
8,8,1,7
9,10,2,8
10,11,5,6


In [None]:
def get_ratings(responses):

    data = []

    for response in responses:

        # Get the snapshots that they (PID) saw.
        snapshots = response["feeds"]
        pid = response["prolific"]["PROLIFIC_PID"]
        shown_proof = experimental_conditions_df.loc[pid, "shown_proof"]

        # Go through each snapshot, what rotation they saw, and the posts on that permutation.
        # snapshot, shown_proof, rotation --> feed UUID
        for i, snapshot in enumerate(snapshots):
            rotation = experimental_conditions_df.loc[pid, f"rotation_{i+1}"]

            rated_posts = response["answers"][snapshot]["ratings"]
            selected_posts = response["answers"][snapshot]["selectedPosts"]

            # Go through each rated post, extract its ratings.
            for post, ratings in rated_posts.items():
                data.append(
                    {
                        "pid": pid,
                        "rotation": rotation,
                        "shown_proof": shown_proof,
                        "snapshot": snapshot,
                        "feed": get_feed(snapshot, rotation, shown_proof),
                        "post": post,
                        "rank": get_rank(snapshot, post, rotation, shown_proof),
                        "selected": post in selected_posts,
                        "relevance": ratings["relevance"],
                        "manipulation": ratings["manipulation"],
                        "quality": ratings["quality"],
                    }
                )

    return pd.DataFrame(data).set_index("pid")


ratings_df = get_ratings(responses)
display(ratings_df)

Unnamed: 0_level_0,rotation,shown_proof,snapshot,feed,post,rank,selected,relevance,manipulation,quality
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,00e338e5-cb27-4a69-a84e-b8d84a2af176,3,True,4,4,3
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,3054b505-f3c9-4118-8bee-74d8dda4fa7e,2,False,4,4,3
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,5c9aab83-d420-4cca-9e90-8c4758ce3962,4,True,4,5,5
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,6cdddddc-f259-498e-aec0-432c5afab5da,5,False,2,2,3
66907e71d8f7d4ee88cc59ce,4,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,bb54b3a2-5be7-46f3-937d-1a9d8e38ecc5,60d908e3-93dc-4dc6-9ce8-b93410006aea,6,False,2,2,4
...,...,...,...,...,...,...,...,...,...,...
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,ca100c96-a818-4cfc-b719-0cc7ef80c2fe,3,True,3,4,2
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,90aa660f-f86c-4ebb-8024-9613fa5d966b,4,True,2,1,3
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,c24f6d87-b363-4dda-9a1b-8be18b666ab2,5,False,4,2,4
653703627539f3a8b2ed4af3,5,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,af2b73d3-a933-47d1-bf4b-287a66f04971,7263288c-4183-44e5-b51d-95e58796ae88,6,False,1,1,3


In [None]:
# Count the number of selected posts for each participant and their feeds.
selections_per_feed_df = (
    selections_df.query("selected")
    .groupby(["pid", "rotation", "shown_proof", "snapshot"])["selected"]
    .count()
    .to_frame()
)
display(selections_per_feed_df)

selections_per_feed_df["selected"].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,selected
pid,rotation,shown_proof,snapshot,Unnamed: 4_level_1
5632410fca59f6000b795a70,0,False,749fc4f5-ce90-4a98-b5e7-39db62f1632b,3
5632410fca59f6000b795a70,2,False,c9dfbf3e-8655-4059-a1ac-fdb73fd88764,3
5632410fca59f6000b795a70,7,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,2
56e6a66af6ed900006a5867c,1,True,1f9d2527-6430-4c8e-87cf-37a3e14323be,3
56e6a66af6ed900006a5867c,3,True,749fc4f5-ce90-4a98-b5e7-39db62f1632b,3
56e6a66af6ed900006a5867c,4,True,a602674f-7bc6-47e7-919b-bcde0dcf5f05,3
6455616c0fe23ecca70ce1b9,3,False,1f9d2527-6430-4c8e-87cf-37a3e14323be,3
6455616c0fe23ecca70ce1b9,3,False,a602674f-7bc6-47e7-919b-bcde0dcf5f05,3
6455616c0fe23ecca70ce1b9,8,False,c9dfbf3e-8655-4059-a1ac-fdb73fd88764,3
653703627539f3a8b2ed4af3,2,True,1f9d2527-6430-4c8e-87cf-37a3e14323be,2


selected
3    23
2     4
1     3
Name: count, dtype: int64

In [None]:
# Did selected posts have higher ratings?
display(
    ratings_df.query("selected")[["relevance", "manipulation", "quality"]]
    .describe()
    .T.astype(
        {
            "count": "int",
            "min": "int",
            "25%": "int",
            "50%": "int",
            "75%": "int",
            "max": "int",
        }
    )
    .drop(columns=["min", "max"])
    .round(2)
)

display(
    ratings_df.query("not selected")[["relevance", "manipulation", "quality"]]
    .describe()
    .T.astype(
        {
            "count": "int",
            "min": "int",
            "25%": "int",
            "50%": "int",
            "75%": "int",
            "max": "int",
        }
    )
    .drop(columns=["min", "max"])
    .round(2)
)

Unnamed: 0,count,mean,std,25%,50%,75%
relevance,80,3.51,1.42,2,4,5
manipulation,80,2.31,1.36,1,2,3
quality,80,3.48,1.03,3,3,4


Unnamed: 0,count,mean,std,25%,50%,75%
relevance,90,1.79,1.19,1,1,2
manipulation,90,2.76,1.36,2,2,4
quality,90,2.54,1.11,2,3,3


In [None]:
def get_feedback(responses):

    data = []

    for response in responses:
        data.append(
            {
                "pid": response["prolific"]["PROLIFIC_PID"],
                "feedback": response["exitAnswers"]["feedback"],
            }
        )

    return pd.DataFrame(data).set_index("pid")


feedback_df = get_feedback(responses)
display(feedback_df)

Unnamed: 0_level_0,feedback
pid,Unnamed: 1_level_1
66907e71d8f7d4ee88cc59ce,none
6720f4ced2a11d4e702566fa,No
656a1cf88a3cf09f538c6f25,
676fa93a6e1840db7ea43423,
6779c674931f49f5fe68afe3,
56e6a66af6ed900006a5867c,
67884b1aa6c55d9dd2388778,Many of the posts that I was shown during this...
6455616c0fe23ecca70ce1b9,No
5632410fca59f6000b795a70,.
653703627539f3a8b2ed4af3,"No feedback, it was fine. Thanks!"


In [287]:
def get_explanations(responses):

    data = []

    for response in responses:
        data.append(
            {
                "pid": response["prolific"]["PROLIFIC_PID"],
                "selection": set(
                    filter(
                        lambda d: type(d) is str,
                        response["exitAnswers"]["selectionExplained"],
                    )
                ),
                "selection_other": (
                    list(
                        filter(
                            lambda d: type(d) is dict and d != "Other",
                            response["exitAnswers"]["selectionExplained"],
                        )
                    )[0]["value"]
                    if any(
                        type(d) is dict and d != "Other"
                        for d in response["exitAnswers"]["selectionExplained"]
                    )
                    else ""
                ),
                "selected": set(
                    filter(
                        lambda d: type(d) is str,
                        response["exitAnswers"]["selectedPostExplained"],
                    )
                ),
                "selected_other": (
                    list(
                        filter(
                            lambda d: type(d) is dict and d != "Other",
                            response["exitAnswers"]["selectedPostExplained"],
                        )
                    )[0]["value"]
                    if any(
                        type(d) is dict and d != "Other"
                        for d in response["exitAnswers"]["selectedPostExplained"]
                    )
                    else ""
                ),
                "not_selected": set(
                    filter(
                        lambda d: type(d) is str,
                        response["exitAnswers"]["nonSelectedPostExplained"],
                    )
                ),
                "not_selected_other": (
                    list(
                        filter(
                            lambda d: type(d) is dict and d != "Other",
                            response["exitAnswers"]["nonSelectedPostExplained"],
                        )
                    )[0]["value"]
                    if any(
                        type(d) is dict and d != "Other"
                        for d in response["exitAnswers"]["nonSelectedPostExplained"]
                    )
                    else ""
                ),
            }
        )

    return pd.DataFrame(data).set_index("pid")


explanations_df = get_explanations(responses)
display(explanations_df)

Unnamed: 0_level_0,selection,selection_other,selected,selected_other,not_selected,not_selected_other
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
66907e71d8f7d4ee88cc59ce,{content},,{relevance},,{relevance},
6720f4ced2a11d4e702566fa,{content},,{relevance},,{},"There was no need to select it, as you could s..."
656a1cf88a3cf09f538c6f25,{content},,{relevance},,{relevance},
676fa93a6e1840db7ea43423,{content},,{relevance},,{relevance},
6779c674931f49f5fe68afe3,{content},,{},Seemed like an interesting read,{},Don't want to over consume news
56e6a66af6ed900006a5867c,{content},,{relevance},,{content_quality},
67884b1aa6c55d9dd2388778,{content},,{trustworthiness},,{relevance},
6455616c0fe23ecca70ce1b9,{content},,{},It's something that's currently relevant in th...,{relevance},
5632410fca59f6000b795a70,{content},,{content_quality},,{content_quality},
653703627539f3a8b2ed4af3,"{content, subreddit}",,"{relevance, trustworthiness}",,{relevance},


In [298]:
def aggregate_reasons(explanations_df):

    return {
        "selection": pd.concat(
            [
                explanations_df["selection"].explode().value_counts(),
                pd.Series(
                    {"other": explanations_df.query("selection_other != ''").shape[0]}
                ),
            ]
        ).to_frame(name="selection"),
        "selected": pd.concat(
            [
                explanations_df["selected"].explode().value_counts(),
                pd.Series(
                    {"other": explanations_df.query("selected_other != ''").shape[0]}
                ),
            ]
        ).to_frame(name="selected"),
        "not_selected": pd.concat(
            [
                explanations_df["not_selected"].explode().value_counts(),
                pd.Series(
                    {
                        "other": explanations_df.query(
                            "not_selected_other != ''"
                        ).shape[0]
                    }
                ),
            ]
        ).to_frame(name="not_selected"),
    }


reasons_df = aggregate_reasons(explanations_df)
display(reasons_df["selection"])
display(reasons_df["selected"])
display(reasons_df["not_selected"])

Unnamed: 0,selection
content,10
subreddit,1
other,0


Unnamed: 0,selected
relevance,6
trustworthiness,2
content_quality,1
other,2


Unnamed: 0,not_selected
relevance,6
content_quality,2
other,2
