Descriptions are retrieved using the HEART JS app: https://github.com/fbarzin/HEART

In [54]:
import pandas as pd
import pickle
# NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english') 
# simple preprocessing
import re

# read in descriptions of the communities
# descriptions from prior run - breaking into batches bc of api limitations
desc = pd.read_json("./data/nodes_desc.json")
desc2 = pd.read_json("./data/about.json")
lsd = pickle.load(open("./data/lsd_centrality.pkl", "rb"))
opiates = pickle.load(open("./data/opiates_centrality.pkl", "rb"))
benzo = pickle.load(open("./data/benzo_centrality.pkl", "rb"))
cocaine = pickle.load(open("./data/cocaine_centrality.pkl", "rb"))

In [55]:
# join descriptions
desc = pd.concat([desc, desc2], axis=0)

In [56]:
# merge descriptions with centrality
lsd['desc'] = lsd.merge(desc, left_on='node', right_on='name')['description']
opiates['desc'] = opiates.merge(desc, left_on='node', right_on='name')['description']
benzo['desc'] = benzo.merge(desc, left_on='node', right_on='name')['description']
cocaine['desc'] = cocaine.merge(desc, left_on='node', right_on='name')['description']

In [57]:
# get the nodes that are missing description
lsd_missing = lsd[lsd['desc'].isna()]['node'].tolist()
opiates_missing = opiates[opiates['desc'].isna()]['node'].tolist()
benzo_missing = benzo[benzo['desc'].isna()]['node'].tolist()
cocaine_missing = cocaine[cocaine['desc'].isna()]['node'].tolist()

In [58]:
# add stop words
extra_words = ['read', 'rules', 'community', 'subreddit', 'discussion', 'discuss', 'reddit']
stop_words.extend(extra_words)

In [61]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def get_topics(df):
    for community in df["community"].unique():
        print("community: ", community)
        # get descriptions
        descs = df[df["community"] == community]["desc"].tolist()
        # drop NaNs
        descs = [desc for desc in descs if str(desc) != 'nan']
        descs = [preprocess(desc) for desc in descs]
        # vectorize
        vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range= (1,1), max_df=0.95, min_df=2)
        X = vectorizer.fit_transform(descs)
        # get topics
        nmf = NMF(n_components=5, random_state=1, l1_ratio=.5, init='nndsvd').fit(X)
    #     nmf = NMF(
    #     n_components=5,
    #     kappa=.1,
    #     minimum_probability=0.01,
    #     w_max_iter=300,
    #     w_stop_condition=0.0001,
    #     h_max_iter=100,
    #     h_stop_condition=0.001,
    #     eval_every=10,
    #     normalize=True,
    #     random_state=42
    # )
        # nmf.fit(X)
        for topic_idx, topic in enumerate(nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([vectorizer.get_feature_names()[i]
                            for i in topic.argsort()[:-10 - 1:-1]]))
        print(" ")
    

        


In [62]:
get_topics(lsd)

community:  1
Topic #0:
ask questions place get advice personal answer question post drugs
Topic #1:
welcome official harm reduction australia ents use safe come needs
Topic #2:
things related dedicated everything news interesting world dark home discussions
Topic #3:
memes amp formula kinds wars star playstation enjoy discord welcome
Topic #4:
art music post artists share discord genre trippy mature fractal
 
community:  7
Topic #0:
questions ask advice get help place use please harm support
Topic #1:
things place make go little others share uplifting pictures bunnies
Topic #2:
news creed assassin ubisoft franchise articles world share events united
Topic #3:
welcome people fans please know worst everyone kind aspects post
Topic #4:
cannabis florida anything everything medical laws mmtcs patients doctors products
 
community:  4
Topic #0:
place pictures related anything share everything topics music best growing
Topic #1:
dedicated universe series amp one sub game way world pc
Topic #



Topic #0:
place share new old discover experiences learn music ask dark
Topic #1:
people world friendly sharing images woman sad rule welcome never
Topic #2:
fans series art fan content feel unofficial memes free post
Topic #3:
related things welcome official post enjoy memes news images free
Topic #4:
everything dedicated else relating consumption including th use kids discussions
 
community:  0
Topic #0:
related place anything everything welcome share game posts acid mental
Topic #1:
things related pins well garmin amp news official discussions great
Topic #2:
please us join advice help sub share must welcome post
Topic #3:
music dedicated new culture news uk rap energetic apple favorite
Topic #4:
questions ask art thread post everything people general weekly daily
 
community:  8
Topic #0:
place share questions ask content welcome music advice help people
Topic #1:
dedicated sub everything study show sex dark ring elden together
Topic #2:
things related home anything video games pl



In [63]:
get_topics(opiates)

community:  0
Topic #0:
related welcome please posting anything news everything posts place cannabis
Topic #1:
ask questions place drugs answer thought provoking askreddit great get
Topic #2:
things festivals make insurance music interesting art live trip psychedelic
Topic #3:
harm dedicated reduction use safe strictly sourcing people molly drug
Topic #4:
discord gg https users come chat bonzos join benzo com
 
community:  3
Topic #0:
ask questions self related setup thread equipment record stickied weekly
Topic #1:
welcome hunting things issues post disorders related high encouraged culture
Topic #2:
anything nicotine everything mods related amp post cannabis place find
Topic #3:
news killers serial information internet art discussions film including regarding
Topic #4:
people place please drug share help dedicated com life experiences
 
community:  2
Topic #0:
welcome post questions related share place music advice people amp
Topic #1:
stuff intended place recreational using topics i



In [64]:
get_topics(benzo)

community:  5
Topic #0:
questions ask place drugs answer askreddit legal medical share simple
Topic #1:
welcome ents please australia hunting share enjoy participating sourcing join
Topic #2:
related things harm place reduction people dedicated advice safe help
Topic #3:
everything anything cannabis psychedelic ever news electronic related music art
Topic #4:
official meo dmt page dxm dissociative updates powerful culture hub
 
community:  1
Topic #0:
harm reduction sourcing safe things otherwise dedicated note discussions promote
Topic #1:
struggling peer anyone support suicidal thoughts disorder health information thc
Topic #2:
ask advice place questions medical answer great get professional run
Topic #3:
welcome related topics posts disorders discussing use banned anything feel
Topic #4:
methadone suboxone ambien post starting treatment sharing experiences information questions
 
community:  2
Topic #0:
welcome related questions post ask sub anything feel free etc
Topic #1:
things f



In [65]:
get_topics(cocaine)

community:  0
Topic #0:
advice please people dedicated harm sub reduction discord posting medical
Topic #1:
things related interesting make psychedelic place go penetrated animals discussions
Topic #2:
questions ask place share dark get answer askreddit mental away
Topic #3:
everything anything related news cannabis dedicated ever hip hop casual
Topic #4:
welcome tattoos life post free aspects fans feel share unrestricted
 
community:  1
Topic #0:
questions ask place people please share discussions com get amp
Topic #1:
welcome hair beauty support best wireless pc post online reduction
Topic #2:
news pc around world gaming new current audio us articles
Topic #3:
related everything place music electronic anything cannabis dedicated topics programming
Topic #4:
developed game published focused first activision duty shooter recognized call
 
community:  2
Topic #0:
questions ask anything legal related career answer welcome forum help
Topic #1:
dedicated marvel safe counter fans strictly u

