In [1]:
import pandas as pd
import re
import networkx as nx
import random

In [2]:
from gensim.models import word2vec
import numpy as np
from gensim.models import Phrases
import nltk
import nltk.data

In [3]:
from snapy import MinHash, LSH

### Data

In [None]:
mesh=pd.read_csv('MESH.csv',usecols=['Preferred Label'])
mesh_ls = list(set(mesh['Preferred Label']))
mesh_terms = list(set([i.lower() for i in mesh_ls]))

In [None]:
text = 'reddit_data_names.txt'
filenames = open(text, 'r').read().split()

all_text = []
author_ids = []
for t in filenames:
    filepath = 'data/' + t + '.json'
    df = pd.read_json(filepath)
    all_text.extend(list(df['text']))
    author_ids.extend(list(df.index))

In [None]:
phrase_file1 = 'data/AutoPhrase_single-word.txt'
i = 0
autophrase_single = []
for r in open(phrase_file1, 'r').read().split():
    if i%2 == 1:
        autophrase_single.append(r)
    i += 1

phrase_file2 = 'data/AutoPhrase_multi-words.txt'
i = 0
autophrase_multi = []
for r in open(phrase_file2, 'r').read().split():
    if i%2 == 1:
        autophrase_multi.append(r)
    i += 1

### Locality Similarity Hash

In [None]:
def make_content_all(search_terms, text, authors):
    #Initialize a dictionary
    content = dict()
    
    #Add search terms to dictionary with numbered key
    for i in range(len(search_terms)):
        if len(search_terms[i]) > 3:
            content[i] = search_terms[i].strip().lower()
                
    #Add text to dictionary with author as key
    for i in range(len(text)):
        key_n = 0
        for sentence in text[i].split():
            s=sentence.replace(',',' ').strip().lower()
            if len(s) > 5:
                key = authors[i] + "_" + str(key_n)
                content[key] = s
                key_n += 1
    return content

In [28]:
def create_lsh(content, n_permutations, n_gram):
    labels = content.keys()
    values = content.values()
    #Create MinHash object
    minhash = MinHash(values, n_gram=n_gram, permutations=n_permutations, hash_bits=64, seed=3)
    
    #Create LSH model
    lsh = LSH(minhash, labels,no_of_bands=n_permutations//2)
    del minhash
    return lsh


def add_similar_edges(graph,lsh, content, minjaccard):
    #For all items in content
    ls = {}
    for index, text in content.items():
        
        #If item is one of the search terms
        if type(index) == int:
            q = lsh.query(index, min_jaccard=minjaccard)
            
            #For all matches found
            for match in q:
                
                #If matched item is from the reddit text
                if type(match) != int:
                    author = match.split("_")[0]
                    term = content[match].strip('"').strip('.').strip('?')
                    #add term that are found in the reddit text to list
                    if author not in ls.keys():
                        ls[author] = [term]
                    else:
                        ls[author].append(term)
    for i in ls:
        ls[i]=list(set(ls[i]))
        total=len(ls[i])
        for n in range(total):
            for j in list(range(total))[n+1:]:
                graph.add_edge(ls[i][n],ls[i][j],author_id=i,)

### Word2Vec Model

In [19]:
def text_to_list (text):
    cleaned_text = re.sub("[^a-zA-Z]", " ", text)
    words = cleaned_text.lower().split()
    
    return words

def text_to_sentences (text, tokenizer):
    raw_sentences = tokenizer.tokenize(text.strip())
    
    sentences=[]
    for s in raw_sentences:
        if len(s) > 0:
            sentences.append(text_to_list(s))
    return sentences

### Main Code

In [32]:
#Necessary variables
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
num_feat = 100
num_neighbors = 5
model_name = "wv_reddit"


#Cleaning the text of the reddit data to a format that can be used in Word2Vec
sentences = []
for text in all_text:
    sentences += text_to_sentences(text, tokenizer)

#Building model
model = word2vec.Word2Vec(sentences, size=num_feat, window=num_neighbors, min_count=1)
model.save(model_name)

#Including bigrams to model
bigram = Phrases(sentences)
model = word2vec.Word2Vec(bigram[sentences], min_count=1)

#Training model on MESH terms
model.train([mesh_terms], total_examples=1, epochs=1)


In [9]:
#Defining a short list of terms from MESH.csv
short_mesh = mesh_terms[:len(mesh_terms)//5]

#Formatting the data to create LSH
content = make_content_all(short_mesh, all_text, author_ids)

In [None]:
#Creating an LSH model
lsh = create_lsh(content, n_permutations=100, n_gram=4)

In [None]:
#Going through queries, printing all mapped similar words,
#and the similarity based on Word2Vec model
for i in content:
    for n in lsh.query(i,min_jaccard=0.2):
        if (type(i)==int) & (type(n)!=int):
            term = content[n].strip('.').strip('?').strip('"')
            if (term in model.wv.vocab) & (content[i] in model.wv.vocab):
                print(content[i],' ------> ', term)
                print(model.wv.similarity(content[i], term))

In [50]:
#Building a graph
opioid_drugs = nx.DiGraph()
add_similar_edges(opioid_drugs, lsh, content, 0.2)

In [51]:
#Printing all edges with the author id
for i in opioid_drugs.edges:
    print(i,opioid_drugs.edges[i]['author_id'])

('contact', 'stereotypical') gnjn1p
('contact', 'lawyer') gnjn1p
('contact', 'reduction') gnjn1p
('contact', 'porcelain') gnjn1p
('contact', 'information') gnjn1p
('contact', 'keeping') gnjn1p
('contact', 'injection') gnjn1p
('contact', 'diseases') gnjn1p
('contact', 'profiling') gnjn1p
('contact', 'question') gnjn1p
('contact', 'person') es0cs8
('contact', 'exchange') gnjn1p
('contact', 'financially') es0cs8
('contact', 'social') es0cs8
('contact', 'collected') es0cs8
('contact', 'substances') es0cs8
('contact', 'attribute') es0cs8
('contact', 'heroin') iidqgu
('contact', 'shoulders') iidqgu
('contact', 'fingers') iidqgu
('contact', 'physical') iidqgu
('contact', 'living') iidqgu
('contact', 'smoking') iidqgu
('contact', 'parents') iidqgu
('contact', 'substance') iidqgu
('contact', 'cocaine') iidqgu
('contact', 'sweat') iidqgu
('contact', 'string') iidqgu
('contact', 'another') iidqgu
('contact', 'asleep') iidqgu
('contact', 'animals') e3dwnz
('contact', 'practitioner') e3dwnz
('conta

('blood', 'development') i2k0ab
('blood', 'psychologically') i2k0ab
('blood', 'recreational') i2k0ab
('blood', 'asleep') e427qz
('physical', 'vision') i2k0ab
('physical', 'background') i2k0ab
('physical', 'diacetylmorphine') i2k0ab
('physical', 'production') i2k0ab
('physical', 'morphine') i2k0ab
('physical', "heroin's") i2k0ab
('physical', 'reading') etxns5
('physical', 'realization') i2k0ab
('physical', 'survival') i2k0ab
('physical', 'nausea') gmcq4n
('physical', 'jading') i2k0ab
('physical', 'sickness') i2k0ab
('physical', 'congestion') i2k0ab
('physical', 'addiction') i2k0ab
('physical', 'concentration') i2k0ab
('physical', 'compounds') i2k0ab
('physical', 'effect') i2k0ab
('physical', 'diarrhea') i2k0ab
('physical', 'psychological') i2k0ab
('physical', 'sleeping') i2k0ab
('physical', 'person') i2k0ab
('physical', "person's") i2k0ab
('physical', 'respiratory') i2k0ab
('physical', 'injuries') i2k0ab
('physical', 'analgesic') i2k0ab
('physical', 'gastrointestinal') i2k0ab
('physical

('smoking', 'parents') iidqgu
('smoking', 'substance') iidqgu
('smoking', 'condition') gydy9x
('smoking', 'clarification') gydy9x
('smoking', 'nicotine') ev9s9x
('smoking', 'cocaine') iidqgu
('smoking', 'chemicals') j3htdm
('smoking', 'reading') j3htdm
('smoking', 'adduction') j3htdm
('smoking', 'addiction') geb451
('smoking', 'community') i138nq
('smoking', 'productive') i138nq
('smoking', 'should') iqkus4
('smoking', 'pretend') i138nq
('smoking', 'personality') i138nq
('smoking', 'person') ev9s9x
('smoking', 'effect') iqkus4
('smoking', 'diarrhea') gid5ro
('smoking', 'improve') gid5ro
('smoking', 'recreationally') gid5ro
('smoking', 'distance') gid5ro
('smoking', 'asleep') iidqgu
('smoking', 'movement') gid5ro
('smoking', 'thefts') ev9s9x
('smoking', 'better') ev9s9x
('smoking', 'coming') ev9s9x
('smoking', 'security') ev9s9x
('smoking', 'solution') ev9s9x
('smoking', 'sweat') iidqgu
('smoking', 'string') iidqgu
('smoking', 'killing') ebh2u0
('smoking', 'cigarettes') ebh2u0
('smoking