In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [2]:
import pickle
e_all_posts = pickle.load(open("data/euphoria_posts_all.pkl", "rb"))
e_drug_posts = pickle.load(open("data/euphoria_posts_drugs.pkl", "rb"))

In [3]:
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

e_non_drug_posts = e_all_posts[~e_all_posts['id'].isin(e_drug_posts['id'])]
# taken from https://nida.nih.gov/research-topics/addiction-science/words-matter-preferred-language-talking-about-addiction
key_words_person = ['addict','addicts', 'addiction', 'sober', 'recovery', 'relapse', 'binge',
                     'trip', 'overdose','abuser','abuse', 'junkie', 'misuse', 'use', 'user',
                    'habit', 'clean','abstinence', 'abstinent', 'detox', 'withdrawal']
key_words_drug = ['cocaine', 'methamphetamine', 'heroin', 'marijuana','opiate', 'opiates',
                  'opioid', 'opioids', 'percocet', 'xanax','ecstasy', 'mdma', 'lsd','mushrooms',
                 'ketamine']
key_words_connection = ['resonate', 'accurate', 'identity', 'identify', 'connect','connection',
                        'represent', 'representation', 'empathy', 'empathize', 'understand', 'understanding',
                        'sympathize', 'sympathy', 'similar','relate', 'relatable', 'relating']
# clean and tokenize
def clean_tokenize(text):
    text = text.lower()
    text = gensim.utils.simple_preprocess(text)
    return text

# tokenize all posts
e_non_drug_posts['all_text'] = e_non_drug_posts['title'] + ' ' + e_non_drug_posts['selftext']
e_drug_posts['all_text'] = e_drug_posts['title'] + ' ' + e_drug_posts['selftext']

e_non_drug_posts['tokens'] = e_non_drug_posts['all_text'].apply(clean_tokenize)
e_drug_posts['tokens'] = e_drug_posts['all_text'].apply(clean_tokenize)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_non_drug_posts['all_text'] = e_non_drug_posts['title'] + ' ' + e_non_drug_posts['selftext']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_non_drug_posts['tokens'] = e_non_drug_posts['all_text'].apply(clean_tokenize)


In [4]:
print("number of posts from users not active in any of the 18 drug subreddits: " ,e_non_drug_posts.shape)
print("number of posts from users active in both euphoria and drug subreddits: ",e_drug_posts.shape)
print("number of unique users in ED: ", e_drug_posts['author'].nunique())
print("number of unique users in non-ED: ", e_non_drug_posts['author'].nunique())

number of posts from users not active in any of the 18 drug subreddits:  (28301, 10)
number of posts from users active in both euphoria and drug subreddits:  (1746, 10)
number of unique users in ED:  889
number of unique users in non-ED:  13558


In [8]:
# avg post length
print("avg post length non-ED: ", e_non_drug_posts['all_text'].str.len().mean())
print("avg post length ED: ", e_drug_posts['all_text'].str.len().mean())

avg post length non-ED:  264.2162821101728
avg post length ED:  308.3081328751432


In [5]:
pickle.dump(e_non_drug_posts, open("data/euphoria_posts_non_drug.pkl", "wb"))

In [6]:
print("date range of posts: ", e_non_drug_posts['created_utc'].min(), e_non_drug_posts['created_utc'].max())

date range of posts:  2019-06-11 23:02:28 2022-12-31 23:34:03


---
TRAIN W2V

In [None]:
num_features = 100;    # Dimensionality of the hidden layer representation
min_word_count = 5;   # Minimum word count to keep a word in the vocabulary
context = 5         # Context window size (on each side)
downsampling = 1e-3   # Downsample setting for frequent words

model_non_drug = Word2Vec(e_non_drug_posts['tokens'], workers=4, vector_size=num_features, min_count = min_word_count, window = context,
                          sample = downsampling, sg=1, epochs=5)
model_drug = Word2Vec(e_drug_posts['tokens'], workers=4, vector_size=num_features, min_count = min_word_count, window = context,
                        sample = downsampling, sg=1, epochs=5)

In [None]:
# create keyword vectors
keyword_vectors_non_drug = {}
keyword_vectors_drug = {}

keywords_combined = key_words_person + key_words_drug + key_words_connection

for word in keywords_combined:
    if word in model_non_drug.wv.key_to_index:
        keyword_vectors_non_drug[word] = model_non_drug.wv[word]
    else:
        pass

    if word in model_drug.wv.key_to_index:
        keyword_vectors_drug[word] = model_drug.wv[word]
    else:
        pass

print(keyword_vectors_non_drug.keys())
print(keyword_vectors_drug.keys())

dict_keys(['addict', 'addicts', 'addiction', 'sober', 'recovery', 'relapse', 'binge', 'trip', 'overdose', 'abuser', 'abuse', 'junkie', 'use', 'user', 'habit', 'clean', 'detox', 'withdrawal', 'cocaine', 'heroin', 'marijuana', 'opiate', 'opiates', 'opioid', 'opioids', 'xanax', 'ecstasy', 'lsd', 'accurate', 'identity', 'identify', 'connect', 'connection', 'represent', 'representation', 'empathy', 'empathize', 'understand', 'understanding', 'sympathize', 'sympathy', 'similar', 'relate', 'relatable', 'relating'])
dict_keys(['addict', 'addicts', 'addiction', 'sober', 'recovery', 'relapse', 'binge', 'trip', 'overdose', 'abuse', 'use', 'clean', 'withdrawal', 'heroin', 'opiate', 'opiates', 'opioid', 'opioids', 'xanax', 'mdma', 'accurate', 'identity', 'connect', 'connection', 'representation', 'understand', 'understanding', 'sympathy', 'similar', 'relate', 'relatable'])


In [None]:
# keyword_vectors_drug['abstinent'] = (model_drug.wv['abstinence'] + model_drug.wv['abstinent']) / 2
# keyword_vectors_non_drug['abstinent'] = (model_non_drug.wv['abstinence'] + model_non_drug.wv['abstinent']) / 2

keyword_vectors_non_drug['opiate'] = (model_non_drug.wv['opiate'] + model_non_drug.wv['opiates'] + model_non_drug.wv['opioid'] + model_non_drug.wv['opioids']) / 4
keyword_vectors_drug['opiate'] = (model_drug.wv['opiate'] + model_drug.wv['opiates'] + model_drug.wv['opioid'] + model_drug.wv['opioids']) / 4

keyword_vectors_drug['addict'] = (model_drug.wv['addict'] + model_drug.wv['addicts']) / 2
keyword_vectors_non_drug['addict'] = (model_non_drug.wv['addict'] + model_non_drug.wv['addicts']) / 2

keyword_vectors_drug['relate'] = (model_drug.wv['relate'] + model_drug.wv['relatable'] ) / 2
keyword_vectors_non_drug['relate'] = (model_non_drug.wv['relate'] + model_non_drug.wv['relatable'] + model_non_drug.wv['relating']) / 3

keyword_vectors_drug['connect'] = (model_drug.wv['connect'] + model_drug.wv['connection']) / 2
keyword_vectors_non_drug['connect'] = (model_non_drug.wv['connect'] + model_non_drug.wv['connection']) / 2

keyword_vectors_drug['understand'] = (model_drug.wv['understand'] + model_drug.wv['understanding']) / 2
keyword_vectors_non_drug['understand'] = (model_non_drug.wv['understand'] + model_non_drug.wv['understanding']) / 2

# word NOT in vocabulary
# keyword_vectors_drug['empathize'] = (model_drug.wv['empathize'] + model_drug.wv['empathy']) / 2
keyword_vectors_non_drug['empathize'] = (model_non_drug.wv['empathize'] + model_non_drug.wv['empathy']) / 2

# keyword_vectors_drug['sympathize'] = (model_drug.wv['sympathize'] + model_drug.wv['sympathy']) / 2
keyword_vectors_non_drug['sympathize'] = (model_non_drug.wv['sympathize'] + model_non_drug.wv['sympathy']) / 2

# keyword_vectors_drug['represent'] = (model_drug.wv['represent'] + model_drug.wv['representation']) / 2
keyword_vectors_non_drug['represent'] = (model_non_drug.wv['represent'] + model_non_drug.wv['representation']) / 2

# remove vectors that were merged
keyword_vectors_non_drug_filtered = {}
keyword_vectors_drug_filtered = {}
for word in keyword_vectors_non_drug:
    if word not in ['opiates', 'opioid', 'opioids', 'addicts', 'relatable', 'relating', 'connection', 'understanding',
                    'empathy', 'sympathy', 'representation']:
        keyword_vectors_non_drug_filtered[word] = keyword_vectors_non_drug[word]

for word in keyword_vectors_drug:
    if word not in ['opiates', 'opioid', 'opioids', 'addicts', 'relatable', 'connection', 'understanding']:
        keyword_vectors_drug_filtered[word] = keyword_vectors_drug[word]

In [None]:
# find each keyword closest neighbors by cosine distance
# 10 nearest
keyword_neighbors_non_drug = {}
keyword_neighbors_drug = {}

for word in keyword_vectors_non_drug_filtered:
    if word in model_non_drug.wv.key_to_index:
        keyword_neighbors_non_drug[word] = model_non_drug.wv.most_similar(word, topn=10)
    else:
        keyword_neighbors_non_drug[word] = []

for word in keyword_vectors_drug_filtered:
    if word in model_drug.wv.key_to_index:
        keyword_neighbors_drug[word] = model_drug.wv.most_similar(word, topn=10)
    else:
        keyword_neighbors_drug[word] = []

In [None]:
# save model and keyword vectors
model_non_drug.save('models/non_drug')
model_drug.save('models/drug')

pickle.dump(keyword_vectors_non_drug_filtered, open('data/keyword_vectors_non_drug_filtered.pkl', 'wb'))
pickle.dump(keyword_vectors_drug_filtered, open('data/keyword_vectors_drug_filtered.pkl', 'wb'))

In [None]:
# drug neighbors
for word in keyword_neighbors_drug:
    print(word)
    print(keyword_neighbors_drug[word])

addict
[('old', 0.9439898729324341), ('girl', 0.9429236054420471), ('taking', 0.9424818158149719), ('ex', 0.9421994686126709), ('being', 0.9393845796585083), ('addiction', 0.9392651915550232), ('recovering', 0.9385985136032104), ('abusive', 0.9381674528121948), ('friend', 0.9380826950073242), ('sober', 0.9362052083015442)]
addiction
[('use', 0.9529141783714294), ('drug', 0.9430344104766846), ('dealer', 0.9424883723258972), ('issues', 0.9415510892868042), ('problems', 0.9406085014343262), ('addict', 0.9392650723457336), ('sobriety', 0.9388989210128784), ('close', 0.9387733340263367), ('maybe', 0.9377748370170593), ('without', 0.9375696778297424)]
sober
[('taking', 0.9936378002166748), ('cuz', 0.9930742383003235), ('turn', 0.9926009774208069), ('losing', 0.992094099521637), ('rehab', 0.9918918013572693), ('mental', 0.9918874502182007), ('growing', 0.9917038083076477), ('deeply', 0.9915683269500732), ('holy', 0.9914957880973816), ('ass', 0.9912084341049194)]
recovery
[('ideas', 0.99568402

In [None]:
# export
df = pd.DataFrame(keyword_neighbors_non_drug)
df.to_excel('excel/keyword_neighbors_non_drug.xlsx')

df2 = pd.DataFrame(keyword_neighbors_drug)
df2.to_excel('excel/keyword_neighbors_drug.xlsx')

In [None]:
for word in keyword_neighbors_non_drug:
    print(word)
    print(keyword_neighbors_non_drug[word])

addict
[('recovering', 0.7895491123199463), ('alcoholic', 0.7866343855857849), ('opiate', 0.7604430913925171), ('functioning', 0.7336889505386353), ('opioid', 0.7025606632232666), ('functional', 0.6990323662757874), ('suicidal', 0.6922507882118225), ('addicts', 0.6913089752197266), ('opiates', 0.6908101439476013), ('diagnosed', 0.6904293894767761)]
addiction
[('addictions', 0.7935065627098083), ('depression', 0.7860459089279175), ('illness', 0.7765669226646423), ('dependency', 0.7586498856544495), ('grief', 0.753545880317688), ('struggles', 0.7529999613761902), ('severe', 0.7502891421318054), ('struggle', 0.7424672842025757), ('recovery', 0.7322182059288025), ('suicidal', 0.7258968949317932)]
sober
[('clean', 0.84048992395401), ('recovery', 0.7898668646812439), ('burden', 0.746875524520874), ('stayed', 0.7257164120674133), ('relapsed', 0.7061238288879395), ('rehab', 0.7055061459541321), ('opiates', 0.7000667452812195), ('stable', 0.6980233192443848), ('meds', 0.6926373243331909), ('rel

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# t-SNE
tsne = TSNE(n_components=2, random_state=2024, perplexity=30, learning_rate=50)
word_vectors_non_drug = np.array(list(keyword_vectors_non_drug_filtered.values()))
word_vectors_drug = np.array(list(keyword_vectors_drug_filtered.values()))

word_vectors_non_drug_embedded = tsne.fit_transform(word_vectors_non_drug)

# plotly
fig = go.Figure()

fig.add_trace(go.Scatter(x=word_vectors_non_drug_embedded[:,0], y=word_vectors_non_drug_embedded[:,1],
                    mode='markers',
                    text=list(keyword_vectors_non_drug_filtered.keys())))
# label points
for i, word in enumerate(list(keyword_vectors_non_drug_filtered.keys())):
    fig.add_annotation(x=word_vectors_non_drug_embedded[i,0], y=word_vectors_non_drug_embedded[i,1],
                       text=word, showarrow=False)
# plot size
fig.update_layout(width=1200, height=800)
fig.update_layout(title='t-SNE of Word Vectors (Non-Drug)')
fig.show()

In [None]:
# drug neighbors
tsne = TSNE(n_components=2, random_state=2024, perplexity=10, learning_rate=10)
word_vectors_drug_embedded = tsne.fit_transform(word_vectors_drug)

fig = go.Figure()

fig.add_trace(go.Scatter(x=word_vectors_drug_embedded[:,0], y=word_vectors_drug_embedded[:,1],
                    mode='markers',
                    text=list(keyword_vectors_drug_filtered.keys())))
# label points
for i, word in enumerate(list(keyword_vectors_drug_filtered.keys())):
    fig.add_annotation(x=word_vectors_drug_embedded[i,0], y=word_vectors_drug_embedded[i,1],
                       text=word, showarrow=False)
# plot size
fig.update_layout(width=1200, height=800)
fig.update_layout(title='t-SNE of Word Vectors (Drug)')
fig.show()

---
EMOTION ANALYSIS - SEE `emotion analysis.ipynb`

In [None]:
# emotion detection
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import numpy as np

# tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions", model_max_length=512)
# model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")

# classifier = pipeline(task = "text-classification",
#                       model=model,
#                       tokenizer = tokenizer,
#                       top_k = None)


In [None]:
# join tokens and list
# non_drug_posts = e_non_drug_posts['tokens'].apply(lambda x: ' '.join(x)).tolist()
# drug_posts = e_drug_posts['tokens'].apply(lambda x: ' '.join(x)).tolist()

In [None]:
# x = classifier(drug_posts[0])
# # unnest
# x[0][0:3]

In [None]:
# def get_top_3_emotions(post):
#   # Truncate the post if its length exceeds the maximum length
#     if len(post) > 512:
#         post = post[:512]

#     predictions = classifier(post)

#     top_3_emotions = []
#     for post_predictions in predictions:
#         # Sort predictions based on scores and get the top 3
#         top_predictions = sorted(post_predictions, key=lambda x: x['score'], reverse=True)[:3]
#         top_3_emotions.append(top_predictions)

#     return top_3_emotions

In [None]:
blah = "this is a sample. sentence"
# num words
len(blah.split())

5

In [None]:
# get_top_3_emotions(drug_posts[0])

In [None]:
# emotion_drug_posts = []
# emotion_non_drug_posts = []

# for post in drug_posts:
#     emotion_drug_posts.append(get_top_3_emotions(post))

# print("done")

In [None]:
# problem_posts = []
# for post in non_drug_posts:
#   try:
#     emotion_non_drug_posts.append(get_top_3_emotions(post))
#   except:
#     problem_posts.append(post)
#     continue

In [None]:
# 27,629 out of 28,301 posts were labeled for emotion
# finish the ones that remain
# missing_posts = non_drug_posts[27629:]
# problem_posts = []
# for post in missing_posts:
#   try:
#     emotion_non_drug_posts.append(get_top_3_emotions(post))
#   except:
#     problem_posts.append(post)
#     continue

In [None]:
# print(problem_posts[0])

In [None]:
# save
# import pickle
# pickle.dump(emotion_drug_posts, open('/content/drive/MyDrive/Colab_Notebooks/euphoria/emotion_drug_posts.pkl', 'wb'))
# pickle.dump(emotion_non_drug_posts, open('/content/drive/MyDrive/Colab_Notebooks/euphoria/emotion_non_drug_posts.pkl', 'wb'))

In [None]:
# get the freqs of top emotion classes
# drug
# drug_emo_freqs = {}
# for pred in emotion_drug_posts:
#   for emotion in pred:
#     # get first emotion
#     emotion = emotion[0]['label']
#     if emotion not in drug_emo_freqs:
#       drug_emo_freqs[emotion] = 1
#     else:
#       drug_emo_freqs[emotion] += 1


In [None]:
# plot freqs
# import plotly.express as px
# drug_emo_freqs = {k: v for k, v in sorted(drug_emo_freqs.items(), key=lambda item: item[1], reverse=True)}
# drug_emo_freqs = pd.DataFrame(drug_emo_freqs.items(), columns=['Emotion', 'Count'])

# fig = px.bar(drug_emo_freqs, x="Emotion", y="Count")
# fig.update_layout(title_text="Drug posts Emotion Predictions")
# fig.show()

In [None]:
# non-drug
# non_drug_emo_freqs = {}
# for pred in emotion_non_drug_posts:
#   for emotion in pred:
#     # get first emotion
#     emotion = emotion[0]['label']
#     if emotion not in non_drug_emo_freqs:
#       non_drug_emo_freqs[emotion] = 1
#     else:
#       non_drug_emo_freqs[emotion] += 1

# # plot
# non_drug_emo_freqs = {k: v for k, v in sorted(non_drug_emo_freqs.items(), key=lambda item: item[1], reverse=True)}
# non_drug_emo_freqs = pd.DataFrame(non_drug_emo_freqs.items(), columns=['Emotion', 'Count'])

# fig = px.bar(non_drug_emo_freqs, x="Emotion", y="Count")
# # title
# fig.update_layout(title_text="Non-drug posts Emotion Predictions")
# fig.show()

---
JUNK

In [None]:
# def get_top_3_emotions(posts):
#     processed_posts = []
#     for post in posts:
#         # Tokenize and truncate posts longer than 512 tokens
#         inputs = tokenizer.encode(post, max_length=512, truncation=True, return_tensors="pt")
#         print(len(inputs))
#         # Decode tokens back to text
#         truncated_text = tokenizer.decode(inputs[0], skip_special_tokens=True)
#         processed_posts.append(truncated_text)

#     predictions = classifier(processed_posts)
#     print(len(predictions))
#     top_3_emotions = []
#     for post_predictions in predictions:
#         # Sort predictions based on scores and get the top 3
#         top_predictions = sorted(post_predictions, key=lambda x: x['score'], reverse=True)[:3]
#         top_3_emotions.append(top_predictions)

#     return top_3_emotions

In [None]:
# y = get_top_3_emotions(drug_posts[0])