In [1]:
# Note that this whole notebook will run a whole lot faster if you have CUDA set up.
# To install the library into this kernel's environment, uncomment the line below:
# %pip install -U archetyper

In [2]:
import csv
from pprint import pprint

import pandas as pd

# During development, prefer the in-repo copy of the package under src/.
try:
    from src.archetypes.archetypes import ArchetypeCollection, ArchetypeQuantifier

# Otherwise fall back to the installed package. Only ImportError (which also
# covers ModuleNotFoundError) is caught, so that any other failure raised while
# loading the development copy surfaces instead of being silently masked by
# the fallback import.
except ImportError:
    from archetypes.archetypes import ArchetypeCollection, ArchetypeQuantifier

In [3]:
# Model that the analyses below will use (handed to ArchetypeQuantifier later on).
model_name = "sentence-transformers/all-roberta-large-v1"

In [4]:
# "Archetypes" can be created from prototypical sentences in two ways.

# Option one is the more manual route: instantiate an ArchetypeCollection and
# register each construct by name together with its prototype sentences.
prototype_sentences_by_construct = {
    "Acquired Capability - Ideation/Simulation": [
        "I think about putting a rope around my neck",
        "I want to put a gun in my mouth and pull the trigger",
        "I plan on taking a bunch of pills and just fall asleep forever",
    ],
    "Perceived Burdensomness": [
        "The world would be a better place without me",
        "I add nothing to the world",
        "Things would be better if I was not here.",
    ],
    "Thwarted Belongingness": [
        "I am alone",
        "I don't fit in anywhere",
        "Everyone hates me",
    ],
}

archetypes = ArchetypeCollection()

# Constructs are added one at a time, in the order they are listed above.
for construct_name, prototype_sentences in prototype_sentences_by_construct.items():
    archetypes.add_archetype(name=construct_name, sentences=prototype_sentences)

# ... and so on for any further constructs ...

Archetype added: Acquired Capability - Ideation/Simulation
Archetype added: Perceived Burdensomness
Archetype added: Thwarted Belongingness


In [5]:
# Option two, the "cleaner" route: load every archetype from a CSV file.

# Start from a fresh ArchetypeCollection. This rebinds `archetypes`, so any
# collection built earlier under that name is replaced.
archetypes = ArchetypeCollection()

# Populate the collection from the example CSV (the path is relative to the
# working directory; the file is declared as having a header row).
archetypes.add_archetypes_from_CSV(
    filepath="example_archetypes/Suicidality-Archetypes.csv",
    file_encoding="utf-8-sig",
    file_has_headers=True,
)


Archetype added: Acquired Capability - Ideation/Simulation
Archetype added: Acquired Capability - Experiences of Endurance
Archetype added: Acquired Capability - Desensitization to Harm
Archetype added: Acquired Capability - High Tolerance for Physical Pain
Archetype added: Acquired Capability - Engagement in Risky Behaviors
Archetype added: Acquired Capability - Familiarity with Self-Harm Methods
Archetype added: Perceived Burdensomness
Archetype added: Thwarted Belongingness


In [6]:
# Build the ArchetypeQuantifier from the archetype collection set up above and
# the model chosen earlier.
archetype_quantifier = ArchetypeQuantifier(
    archetypes=archetypes,
    model=model_name,
)

ArchetypeQuantifier has been successfully instantiated.


In [7]:
# Display the archetype names held by the quantifier, in the order they appear.
# Left as the cell's last expression so the notebook renders the returned list.
archetype_quantifier.get_list_of_archetypes()

['Acquired Capability - Ideation/Simulation',
 'Acquired Capability - Experiences of Endurance',
 'Acquired Capability - Desensitization to Harm',
 'Acquired Capability - High Tolerance for Physical Pain',
 'Acquired Capability - Engagement in Risky Behaviors',
 'Acquired Capability - Familiarity with Self-Harm Methods',
 'Perceived Burdensomness',
 'Thwarted Belongingness']

In [8]:
# Next come a few checks that give a descriptive sense of the psychometrics of
# our archetypes. Step one: item-level correlations for every archetype. The
# call needs the output folder that the correlation matrices are exported into.
archetype_quantifier.export_intra_archetype_correlations(
    output_folder="ItemCorrelations/",
    mean_center_vectors=True,
)

Successfully exported intra-archetype cosine similarity matrix for: Acquired Capability - Ideation/Simulation
Successfully exported intra-archetype cosine similarity matrix for: Acquired Capability - Experiences of Endurance
Successfully exported intra-archetype cosine similarity matrix for: Acquired Capability - Desensitization to Harm
Successfully exported intra-archetype cosine similarity matrix for: Acquired Capability - High Tolerance for Physical Pain
Successfully exported intra-archetype cosine similarity matrix for: Acquired Capability - Engagement in Risky Behaviors
Successfully exported intra-archetype cosine similarity matrix for: Acquired Capability - Familiarity with Self-Harm Methods
Successfully exported intra-archetype cosine similarity matrix for: Perceived Burdensomness
Successfully exported intra-archetype cosine similarity matrix for: Thwarted Belongingness


In [9]:
# What about the relationships across all archetypes and their constituent
# prototypical sentences? Why, we can export those too! What an exciting time
# to be alive!
archetype_quantifier.export_all_archetype_relationships(
    output_file_location="ItemCorrelations/All_Archetype_Relationships.csv",
    mean_center_vectors=True,
)

Calculating all relationships within/across all archetypes...
All relationships exported to: ItemCorrelations/All_Archetype_Relationships.csv


In [10]:
# The raw vectors for each archetype/prototype can be exported as well. That is
# useful, for example, if you want to run something like a confirmatory factor
# analysis to sanity check the structure of your archetypes.
archetype_quantifier.export_all_archetype_vectors(
    output_file_location="ItemCorrelations/Archetype_Vectors.csv",
    mean_center_vectors=True,
)

All archetype vectors have been exported.


In [11]:
# Now, let's evaluate the "internal consistency" of our archetypes in a rough, Boyd-esque fashion.
# The printed report lists a score next to each prototype sentence, followed by the
# average item-rest correlation and Cronbach's alpha for every archetype.
archetype_quantifier.evaluate_archetype_consistency(mean_center_vectors=True)

# Rule of thumb: in theory, we're looking for Cronbach's alpha / item-rest cosine
# similarities in the neighborhood of >= .70

Evaluating Acquired Capability - Ideation/Simulation...
	0.78127: I think about putting a rope around my neck
	0.80222: I want to put a gun in my mouth and pull the trigger
	0.75624: I plan on taking a bunch of pills and just fall asleep forever
	--------------------
	0.77991: Average item-rest correlation
	0.67798: Cronbach's alpha


Evaluating Acquired Capability - Experiences of Endurance...
	0.86638: I've been through so much pain in my life that I feel like nothing can hurt me anymore
	0.81043: Overcoming those challenges made me realize I can endure a lot more than I thought.
	0.85491: I've become numb to the pain and it takes a lot to bother me now.
	--------------------
	0.84391: Average item-rest correlation
	0.79793: Cronbach's alpha


Evaluating Acquired Capability - Desensitization to Harm...
	0.80566: I've seen and experienced so much violence that it doesn't faze me anymore.
	0.80979: I can handle situations that used to terrify me, it's like I'm immune to the fear.
	0.75

In [12]:
# Read in the dataset that we want to analyze.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a saved index stored in the CSV. It is not used by later cells.
df = pd.read_csv("example_data/social_media_dataset.csv")

# Preview the first few rows.
df.head()

Unnamed: 0,subreddit,author,created_utc,id,title,selftext
0,SuicideWatch,tait_sa,1454254000.0,43jg8v,Troubled,Life is hell. \r\n\r\nI know that is a common ...
1,SuicideWatch,Sadnessforevert,1454255000.0,43jh28,No friends for over 8 years...I need to die,"Raised in an abusive family, I have had 3 boyf..."
2,SuicideWatch,Thisexistencehurts,1454257000.0,43jm4b,Anyone else out there that would have ended it...,My family is the only reason I havent killed m...
3,SuicideWatch,Rhexysexy,1454257000.0,43jnbl,I don't deserve to live anymore,As simple as that. I'm a complete failure. My ...
4,SuicideWatch,LoveArt96,1454259000.0,43jq7i,Anyone overdosed paracetamol?,"I tried one time,10g , but didnt work... Now i..."


In [13]:
# Pull the post bodies out of the dataframe as a plain list of texts.
# NOTE(review): pandas reads empty CSV fields as NaN by default, so posts with
# no body would appear here as NaN rather than a string. Confirm that the
# analyzer tolerates that, or clean the column first.
texts = df["selftext"].tolist()

# Metadata that we want to retain for the texts being analyzed. Each entry maps
# an output name to the dataframe column that supplies it, in the same row
# order as `texts`.
text_metadata = {
    output_name: df[source_column].tolist()
    for output_name, source_column in (
        ("author", "author"),
        ("created_utc", "created_utc"),
        ("post_id", "id"),
    )
}


In [None]:
# Now we're off to the races! This batch-analyzes the texts in our dataset and
# exports the results into a sentence-level output file and a document-level
# output file.

# Note: a Fisher Z-transform may or may not be desirable, depending on your
# constructs/archetypes of interest. It is switched off here.
archetype_quantifier.batch_analyze_to_csv(
    texts=texts,
    text_metadata=text_metadata,
    csv_sent_output_location="archetypes_sent.csv",
    csv_doc_output_location="archetypes_doc.csv",
    append_to_existing_csv=False,
    output_encoding="utf-8-sig",
    mean_center_vectors=True,
    fisher_z_transform=False,
    doc_avgs_exclude_sents_with_WC_less_than=5,
    doc_level_aggregation_type="mean",
)

 26%|███████████████████▋                                                         | 44/172 [03:28<07:04,  3.32s/it]

In [None]:
# The quantifier can also be applied to an individual text, which is handy for
# testing things out or when you do not need to batch-analyze a dataset like
# the example above. It also lets you build your own pipeline around the
# library, iterating and exporting results however best suits your needs.
# (`pprint` comes from the notebook's import cell rather than being imported
# mid-notebook, so all dependencies are visible in one place.)
example_text = "General Kenobi, you are a bold one. I find your behavior bewildering. Surely you realize you're doomed."

archetype_quantifier.analyze(
    example_text,
    mean_center_vectors=True,
    fisher_z_transform=False,
)

# After analyze() has run, the scores are read back from the quantifier's
# `results` attribute.
results = archetype_quantifier.results

# Each result exposes the sentence text, its word count, and its archetype scores.
for result in results:
    print(f"Sentence Text: {result.sentence_text}")
    print(f"Word Count: {result.WC}")
    print("Archetype scores:")
    pprint(result.archetype_scores)
    print("\n")
