In [1]:
import numpy as np
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as sj
import py_entitymatching as em

from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import brown

ModuleNotFoundError: No module named 'py_stringmatching'

# Drugs DF

In [None]:
DRUG_NAMES = './data/cleaned_files/drug_names_fda_VA_filled.csv'

In [3]:
# Drugs DataFrame
drugs_df = pd.read_csv(DRUG_NAMES)
drugs_df['NAME'] = drugs_df['NAME'].str.lower()
drugs_df['GENERIC_NAME'] = drugs_df['GENERIC_NAME'].str.lower()

In [4]:
!head -n 2 $DRUG_NAMES

NAME,GENERIC_NAME,NAME_TYPE,FULL_GENERIC_NAME,FDA_SCHEDULE,VA_CLASS,VA_CATEGORY,CATEGORY
FENTANYL,FENTANYL,GENERIC_NAME,FENTANYL,SCHEDULE II,CN101,OPIOID ANALGESICS,OPIOID ANALGESICS


In [5]:
# Alternative names
alt_names = defaultdict(set)
with open(DRUG_NAMES, 'r') as f:
    header = True
    for line in f:
        if header:
            header = False
            continue
            
        line = line.lower().split(',')
        name = line[0]
        gen_name = line[1]
        
        alt_names[gen_name].add(name)
        
for k, v in alt_names.items():
    alt_names[k] = ' '.join(v)

In [6]:
drugs_df['ALTNAMES'] = drugs_df.apply(lambda x: alt_names[x['GENERIC_NAME']], axis=1)
drugs_df = drugs_df.rename(columns={'NAME_TYPE': 'NAMETYPE'})

In [7]:
drugs_df.head()

Unnamed: 0,NAME,GENERIC_NAME,NAMETYPE,FULL_GENERIC_NAME,FDA_SCHEDULE,VA_CLASS,VA_CATEGORY,CATEGORY,ALTNAMES
0,fentanyl,fentanyl,GENERIC_NAME,FENTANYL,SCHEDULE II,CN101,OPIOID ANALGESICS,OPIOID ANALGESICS,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...
1,apache,fentanyl,STREET_NAME,FENTANYL,SCHEDULE II,CN101,OPIOID ANALGESICS,OPIOID ANALGESICS,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...
2,birria,fentanyl,STREET_NAME,FENTANYL,SCHEDULE II,CN101,OPIOID ANALGESICS,OPIOID ANALGESICS,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...
3,coca,coca,GENERIC_NAME,COCA,SCHEDULE II,CN101,OPIOID ANALGESICS,OPIOID ANALGESICS,coca
4,blonde,fentanyl,STREET_NAME,FENTANYL,SCHEDULE II,CN101,OPIOID ANALGESICS,OPIOID ANALGESICS,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...


In [8]:
drugs_df.shape

(2636, 9)

## Stop words and common words

In [9]:
# Init tokenizer and nltk stopwords
tok = sm.AlphanumericTokenizer(return_set=True)
stop_words = set(stopwords.words('english')) 

In [10]:
# These overlap with some of the street names, removing
stop_words.add('drug')
stop_words.add('drugs')

In [11]:
# Common English words from Brown corpus
brown_counter = Counter()
brown_corpus = [w.lower() for w in brown.words()]
brown_words = [w for w in brown_corpus if w not in stop_words]
brown_counter.update(brown_words)

In [12]:
# Check to see which words are common
remove_count = []
for name in drugs_df['NAME'].unique():
    if name in brown_counter and brown_counter[name] >= 15:
        remove_count.append((name, brown_counter[name]))

In [13]:
sorted(remove_count, key = lambda x: x[1], reverse=True)

[('work', 762),
 ('day', 687),
 ('go', 626),
 ('water', 445),
 ('night', 411),
 ('white', 365),
 ('case', 362),
 ('light', 333),
 ('line', 298),
 ('act', 283),
 ('car', 274),
 ('special', 250),
 ('boy', 242),
 ('board', 239),
 ('love', 231),
 ('girl', 220),
 ('clear', 219),
 ('mother', 216),
 ('black', 203),
 ('hard', 202),
 ('peace', 198),
 ('table', 198),
 ('personal', 196),
 ('fire', 187),
 ('dark', 185),
 ('space', 184),
 ('brown', 176),
 ('material', 174),
 ('cold', 171),
 ('food', 147),
 ('blue', 143),
 ('girls', 142),
 ('method', 142),
 ('normal', 136),
 ('friend', 133),
 ('plant', 125),
 ('running', 123),
 ('horse', 117),
 ('green', 116),
 ('cars', 112),
 ('heavy', 110),
 ('teeth', 103),
 ('trees', 101),
 ('doctor', 100),
 ('glass', 99),
 ('7', 94),
 ('base', 91),
 ('henry', 83),
 ('speed', 83),
 ('lady', 80),
 ('coffee', 78),
 ('rock', 75),
 ('dust', 70),
 ('china', 69),
 ('issues', 66),
 ('dream', 64),
 ('domestic', 63),
 ('baby', 62),
 ('extreme', 62),
 ('soft', 61),
 ('beac

In [14]:
# Remove common words and stop words
remove = []
for name in drugs_df['NAME'].unique():
    if (name in brown_counter and brown_counter[name] >= 15) or (name in stop_words):
        remove.append(name)

In [15]:
drugs_df = drugs_df[~drugs_df['NAME'].isin(remove)].copy().reset_index(drop=True)

In [16]:
drugs_df.shape

(2370, 9)

# Process Reddit data

In [17]:
REDDIT_DATA = './data/cleaned_files/reddit_clean.csv'

In [18]:
reddit_df = pd.read_csv(REDDIT_DATA)

In [19]:
reddit_df.head()

Unnamed: 0,author,created,link_flair_text,id,num_comments,num_crossposts,score,selftext,title,total_awards_received,upvote_ratio,Week
0,Machinexa2,1592838000.0,I :love: Drugs,hdn9ur,0,0,1,I have stopped use of edibles for more than 2 months due to corona. However the shop was open ye...,Cannabis + Betel/Areca nut + Bacopa Monnieri = AWESOME,0,1.0,0
1,ForthName,1592838000.0,,hdn7sj,17,0,1,I’m a moderately experienced trooper who occasionally trips with friends \n\nOn 2 seperate occas...,Do not sleep on acid,0,1.0,0
2,oihavequestions,1592837000.0,,hdn5nl,1,0,1,So HYPOTHETICALLY lets say someone has a prescription for adderal which requires them to take a ...,Will adderal help someone pass a drug test?,0,1.0,0
3,xxxPeniBanini,1592837000.0,Stimulants,hdn2dx,1,0,1,I am looking to try out METHYLPHENIDATE (biphentin) what would be dosages for different plateau...,NEED HELP WITH METHYLPHENIDATE,0,1.0,0
4,SnooPets8599,1592837000.0,,hdmzdg,1,0,1,"Best ways to send drugs/ packages through customs \n MDMA, METH\n\nWOULD APPRECIATE SOME ADVICE ...",How to package drugs and send to another country,0,1.0,0


In [20]:
reddit_df = reddit_df.rename(columns={'id': 'reddit_id'})

In [21]:
reddit_df.shape

(11589, 12)

In [22]:
# Remove unpopular posts
reddit_df = reddit_df[reddit_df['score'] > 0].copy().reset_index(drop=True)
reddit_df.shape

(11229, 12)

## Stop words

In [23]:
def process_text(text, tok, stop_words, join=False):
    text = text.strip().lower().replace('\\n\\n', ' ')
    result = [w for w in tok.tokenize(text) if w not in stop_words]
    if join:
        result = ' '.join(result)
    return result

In [24]:
# Tokenize and remove stopwords from selftext and titles
reddit_df['tokTitle'] = reddit_df.apply(lambda x: process_text(x['title'], tok, stop_words), axis=1)
reddit_df['tokSelftext'] = reddit_df.apply(lambda x: process_text(x['selftext'], tok, stop_words), axis=1)

In [25]:
# Just to check the frequency of words in reddit text
reddit_counter = Counter()
reddit_df['tokTitle'].apply(reddit_counter.update)
reddit_df['tokSelftext'].apply(reddit_counter.update)
print(reddit_counter)



In [26]:
# Join text
reddit_df['tokTitle'] = reddit_df.apply(lambda x: process_text(x['title'], tok, stop_words, join=True), axis=1)
reddit_df['tokSelftext'] = reddit_df.apply(lambda x: process_text(x['selftext'], tok, stop_words, join=True), axis=1)

# Entity Matching - drugs_df

In [27]:
l_output_attrs = ['NAME', 'GENERIC_NAME', 'NAMETYPE', 'ALTNAMES']
r_output_attrs = ['tokTitle', 'tokSelftext']

In [28]:
A = drugs_df
B = reddit_df

In [29]:
A['id'] = A.index
B['id'] = B.index

In [30]:
em.set_key(A, 'id')
em.set_key(B, 'id')

True

In [31]:
A['ALTNAMES'] = A['ALTNAMES'].astype(str)

In [32]:
atypes_A = em.get_attr_types(A)
atypes_B = em.get_attr_types(B)

In [33]:
[(col, atypes_A[col]) for col in A.columns]

[('NAME', 'str_bt_1w_5w'),
 ('GENERIC_NAME', 'str_bt_1w_5w'),
 ('NAMETYPE', 'str_eq_1w'),
 ('FULL_GENERIC_NAME', 'str_bt_1w_5w'),
 ('FDA_SCHEDULE', 'str_bt_1w_5w'),
 ('VA_CLASS', 'str_eq_1w'),
 ('VA_CATEGORY', 'str_bt_1w_5w'),
 ('CATEGORY', 'str_bt_1w_5w'),
 ('ALTNAMES', 'str_gt_10w'),
 ('id', 'numeric')]

In [34]:
[(col, atypes_B[col]) for col in B.columns]

[('author', 'str_eq_1w'),
 ('created', 'numeric'),
 ('link_flair_text', 'str_bt_1w_5w'),
 ('reddit_id', 'str_eq_1w'),
 ('num_comments', 'numeric'),
 ('num_crossposts', 'numeric'),
 ('score', 'numeric'),
 ('selftext', 'str_gt_10w'),
 ('title', 'str_bt_5w_10w'),
 ('total_awards_received', 'numeric'),
 ('upvote_ratio', 'numeric'),
 ('Week', 'numeric'),
 ('tokTitle', 'str_bt_1w_5w'),
 ('tokSelftext', 'str_gt_10w'),
 ('id', 'numeric')]

In [35]:
em.get_attr_corres(A, B)['corres']

[('id', 'id')]

In [36]:
atypes_A['NAME'] = atypes_B['tokSelftext']
atypes_A['GENERIC_NAME'] = atypes_B['tokSelftext']
atypes_B['tokTitle'] = atypes_B['tokSelftext']

In [37]:
block_c = em.get_attr_corres(A, B)
block_c['corres'] = [('NAME', 'tokTitle'), ('NAME', 'tokSelftext')]

In [38]:
toks = em.get_tokenizers_for_blocking()
toks

{'qgm_2': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'qgm_3': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'wspace': <function py_entitymatching.feature.tokenizers.tok_wspace>,
 'alphabetic': <function py_entitymatching.feature.tokenizers.tok_alphabetic>,
 'alphanumeric': <function py_entitymatching.feature.tokenizers.tok_alphanumeric>,
 'dlm_dc0': <function py_entitymatching.feature.tokenizers._make_tok_delim.<locals>.tok_delim>}

In [39]:
sim = em.get_sim_funs_for_blocking()
sim

{'affine': <function py_entitymatching.feature.simfunctions.affine>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,
 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,
 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>,
 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,
 'dice': <function py_entitymatching.feature.simfunctions.dice>,
 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,
 'cosine'

In [40]:
block_f = em.get_features(A, B, atypes_A, atypes_B, block_c, toks, sim)

In [41]:
block_f

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,NAME_tokTitle_jac_qgm_3_qgm_3,NAME,tokTitle,qgm_3,qgm_3,jaccard,<function NAME_tokTitle_jac_qgm_3_qgm_3 at 0x7f4069a7c4d0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,NAME_tokTitle_cos_dlm_dc0_dlm_dc0,NAME,tokTitle,dlm_dc0,dlm_dc0,cosine,<function NAME_tokTitle_cos_dlm_dc0_dlm_dc0 at 0x7f40756f7dd0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,NAME_tokSelftext_jac_qgm_3_qgm_3,NAME,tokSelftext,qgm_3,qgm_3,jaccard,<function NAME_tokSelftext_jac_qgm_3_qgm_3 at 0x7f406ac805f0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,NAME_tokSelftext_cos_dlm_dc0_dlm_dc0,NAME,tokSelftext,dlm_dc0,dlm_dc0,cosine,<function NAME_tokSelftext_cos_dlm_dc0_dlm_dc0 at 0x7f406ac80290>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [42]:
# Black box function that checks if NAME is exactly in the reddit text
def street_name_titleselftext_exact(x, y):
    x = x['NAME']
    y = y['tokTitle'] + ' ' + y['tokSelftext']
    return not x in y

In [43]:
# Blocker to match NAMES and reddit text
# This blocker gives us all of the candidate relationships, which are a lot since there may be multiple 
# relationships with a single Reddit post
bb = em.BlackBoxBlocker()
bb.set_black_box_function(street_name_titleselftext_exact)

In [44]:
C1 = bb.block_tables(A, B, l_output_attrs=l_output_attrs, r_output_attrs=r_output_attrs, show_progress=True,
                     n_jobs=-1)

In [45]:
C1

Unnamed: 0,_id,ltable_id,rtable_id,ltable_NAME,ltable_GENERIC_NAME,ltable_NAMETYPE,ltable_ALTNAMES,rtable_tokTitle,rtable_tokSelftext
0,0,0,10,fentanyl,fentanyl,GENERIC_NAME,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...,official weekly free thread week 25 june 22 2020,welcome another edition weekly free newcomers thread allowed friendly conversations anything wit...
1,1,0,51,fentanyl,fentanyl,GENERIC_NAME,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...,overdose,heroin awhile say shit besides buy locally test via dancesafe strips proved good 10 years decide...
2,2,0,73,fentanyl,fentanyl,GENERIC_NAME,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...,use,got xanax looking order reagents needed test opioids amphetamines need check know heroin sure al...
3,3,0,117,fentanyl,fentanyl,GENERIC_NAME,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...,55 different time posting ranked 10,psychedelics lsd 25 10 lsm 775 6 al lad 9 eth 7 25b nbome mescaline 8 psilcybin 4 ho met dipt mt...
4,4,0,135,fentanyl,fentanyl,GENERIC_NAME,heineken actiq white ladies fire tango and cash fent food dragon’s breath onsolis apache blues f...,found list trying import sell fucking hell variety,idea accurately classed know tick ment obtained trying sell substance sure felt need much variet...
...,...,...,...,...,...,...,...,...,...
90377,90377,2369,9296,steroids,testosterone,STREET_NAME,juice arnolds steroids weight gainers testosterone stackers pumpers gym candy muscle builders roids,steroids,hey gonna sound like head ass say anyway friends weed mdma shit people school treat legends prai...
90378,90378,2369,10210,steroids,testosterone,STREET_NAME,juice arnolds steroids weight gainers testosterone stackers pumpers gym candy muscle builders roids,peggy dr house figure,try make briefvs possible substance abuser benzos kinds stimulants steroids methamphetamines typ...
90379,90379,2369,10957,steroids,testosterone,STREET_NAME,juice arnolds steroids weight gainers testosterone stackers pumpers gym candy muscle builders roids,advice needed mixing testosterone part gender transition,hi 21 ftm transgender guy living uk several years waiting finally first appointment yesterday cl...
90380,90380,2369,11029,steroids,testosterone,STREET_NAME,juice arnolds steroids weight gainers testosterone stackers pumpers gym candy muscle builders roids,currently taking prednisone given new anti depressant questions,prednisone steroids 11 years mainly currently 55mg colitis time around declining mentally attitu...


In [46]:
# Check to see what didn't get matched
missing_reddit = ~reddit_df['id'].isin(C1['rtable_id'])

In [47]:
reddit_df[missing_reddit]['tokSelftext']

46                                   never high nitrous curious one would even worth
48                                                                                  
54                                                    sober 10 years always somthing
174                                                              way get high months
198                                                       4 https discord gg fswhxuj
                                            ...                                     
9981                                                    post 4 year old burner phone
10075        tried first time normally get mosa would 100 say best nos chargers ever
10222         possible overdose buspirone bespar buspar remember exactly yes dosages
10538    difference gabapentin pregabalin one stronger effects different last longer
10587               mix significant amount ativan alcohol result painless discomfort
Name: tokSelftext, Length: 80, dtype: object

In [None]:
# Blocker to get a subset of the above for manual labeling
rb = em.RuleBasedBlocker()
rb.add_rule(['NAME_tokSelftext_cos_dlm_dc0_dlm_dc0 	(ltuple, rtuple) < 0.3'], block_f)

In [None]:
C2 = rb.block_candset(C1, n_jobs=-1)

In [None]:
C2

# Label Block

In [None]:
# Generate manually labeled data set

#G = em.label_table(C2, 'gold_label')
#G.to_csv('./C2_labeled.csv', index=False)

In [None]:
#G = em.read_csv_metadata('./C2_labeled.csv', key='_id',
#                      fk_ltable='ltable_id', fk_rtable='rtable_id',
#                      ltable=A, rtable=B)

In [None]:
# I needed to add features that weren't included in the labeled data set

#G.merge(A[['id', 'ALTNAMES']], how='left', left_on='ltable_id', right_on='id').drop(columns=['id']).to_csv('./C2_labeled_2.csv', index=False)
#G.merge(A[['id', 'NAMETYPE']], how='left', left_on='ltable_id', right_on='id').drop(columns=['id']).to_csv('./C2_labeled_3.csv', index=False)

In [48]:
# Final labeled set with added features
G = em.read_csv_metadata('./C2_labeled_3.csv', key='_id',
                      fk_ltable='ltable_id', fk_rtable='rtable_id',
                      ltable=A, rtable=B)

Metadata file is not present in the given path; proceeding to read the csv file.


# Matching

In [49]:
# Split data into train and test
IJ = em.split_train_test(G, train_proportion=0.6, random_state=0)
I = IJ['train']
J = IJ['test']

In [50]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

In [51]:
A = A.rename(columns={'NAME_TYPE': 'NAMETYPE'})

In [52]:
A.columns

Index(['NAME', 'GENERIC_NAME', 'NAMETYPE', 'FULL_GENERIC_NAME', 'FDA_SCHEDULE',
       'VA_CLASS', 'VA_CATEGORY', 'CATEGORY', 'ALTNAMES', 'id'],
      dtype='object')

In [53]:
B.columns

Index(['author', 'created', 'link_flair_text', 'reddit_id', 'num_comments',
       'num_crossposts', 'score', 'selftext', 'title', 'total_awards_received',
       'upvote_ratio', 'Week', 'tokTitle', 'tokSelftext', 'id'],
      dtype='object')

In [54]:
A = A.drop(columns=['FULL_GENERIC_NAME', 'FDA_SCHEDULE', 'VA_CLASS', 'VA_CATEGORY', 'CATEGORY'])
B = B.drop(columns=['author', 'link_flair_text', 'num_comments', 'num_crossposts', 'score', 'selftext', 'title',
                    'total_awards_received', 'upvote_ratio', 'Week'])

In [55]:
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

In [56]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x7f4068b10d40>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,id_id_anm,id,id,,,abs_norm,<function id_id_anm at 0x7f4068b10c20>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,id_id_lev_dist,id,id,,,lev_dist,<function id_id_lev_dist at 0x7f4068b10ef0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,id_id_lev_sim,id,id,,,lev_sim,<function id_id_lev_sim at 0x7f406a2ef050>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


## Generate Features

In [57]:
# Matching NAME to the Reddit text using string and token similarity doesn't really make sense since the
# Reddit text is long while the names are 1-2 words. There may be some value in similarity scores between all
# of the ALTNAMES and the Reddit text, especially in cases where multiple alt words are used in the text. However,
# the length of a post or a short list of alt names may arbitrarily lower these scores.

In [58]:
feature_string = '''dice(wspace(ltuple['ALTNAMES']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokSelftext'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'jac_ws_ALTNAMES_tokTitleSelftext', feature)

True

In [59]:
feature_string = '''dice(wspace(ltuple['ALTNAMES']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokSelftext'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'dc_ws_ALTNAMES_tokTitleSelftext', feature)

True

In [60]:
feature_string = '''cosine(wspace(ltuple['ALTNAMES']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokSelftext'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'cos_ws_ALTNAMES_tokTitleSelftext', feature)

True

In [61]:
# NAME in Reddit text
def street_name_titleselftext_exact_feature(x, y):
    x = x['NAME']
    y = y['tokTitle'] + ' ' + y['tokSelftext']
    return int(x in y)

In [62]:
em.add_blackbox_feature(F, 'street_name_present', street_name_titleselftext_exact_feature)

True

In [63]:
# Number of alt names in Reddit text
def num_alt_matches(x, y):
    x = set(x['ALTNAMES'].split(' '))
    y = set(y['tokTitle'].split(' ')).union(set(y['tokSelftext'].split(' ')))
    return len(x.intersection(y))

In [64]:
em.add_blackbox_feature(F, 'num_alt_matches', num_alt_matches)

True

In [65]:
# Name type is not street name (to give weight to brand and generic names)
def not_street_name(x, y):
    return int(not x['NAMETYPE'] == 'STREET_NAME')

In [66]:
em.add_blackbox_feature(F, 'not_street_name', not_street_name)

True

In [67]:
# Generic name is in Reddit text
def generic_in_text(x, y):
    x = x['GENERIC_NAME']
    y = y['tokTitle'] + ' ' + y['tokSelftext']
    return int(x in y)

In [68]:
em.add_blackbox_feature(F, 'generic_in_text', generic_in_text)

True

In [69]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x7f4068b10d40>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,id_id_anm,id,id,,,abs_norm,<function id_id_anm at 0x7f4068b10c20>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,id_id_lev_dist,id,id,,,lev_dist,<function id_id_lev_dist at 0x7f4068b10ef0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,id_id_lev_sim,id,id,,,lev_sim,<function id_id_lev_sim at 0x7f406a2ef050>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,jac_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7f4068b18320>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
5,dc_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7f406a2cedd0>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
6,cos_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7f406935e290>,"def fn(ltuple, rtuple):\n return cosine(wspace(ltuple['ALTNAMES']),\n ...",False
7,street_name_present,,,,,,<function street_name_titleselftext_exact_feature at 0x7f406936f710>,,False
8,num_alt_matches,,,,,,<function num_alt_matches at 0x7f406933d200>,,False
9,not_street_name,,,,,,<function not_street_name at 0x7f40698ce4d0>,,False


In [70]:
# Remove id based features
F = F[~F['feature_name'].str.contains('id')].copy().reset_index(drop=True)

In [71]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,jac_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7f4068b18320>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
1,dc_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7f406a2cedd0>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
2,cos_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7f406935e290>,"def fn(ltuple, rtuple):\n return cosine(wspace(ltuple['ALTNAMES']),\n ...",False
3,street_name_present,,,,,,<function street_name_titleselftext_exact_feature at 0x7f406936f710>,,False
4,num_alt_matches,,,,,,<function num_alt_matches at 0x7f406933d200>,,False
5,not_street_name,,,,,,<function not_street_name at 0x7f40698ce4d0>,,False
6,generic_in_text,,,,,,<function generic_in_text at 0x7f406933ddd0>,,False


## Select Matcher

In [72]:
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False)

In [73]:
H.head()

Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_ALTNAMES_tokTitleSelftext,dc_ws_ALTNAMES_tokTitleSelftext,cos_ws_ALTNAMES_tokTitleSelftext,street_name_present,num_alt_matches,not_street_name,generic_in_text,gold_label
68,10832,134,4016,0.033113,0.033113,0.065089,1,5,0,1,1
210,39484,1126,8721,0.065217,0.065217,0.086031,1,3,0,0,1
20,2807,368,2163,0.046512,0.046512,0.051848,1,1,1,0,1
310,51000,2196,652,0.111111,0.111111,0.111803,1,1,1,1,1
342,52955,1276,3532,0.011765,0.011765,0.029604,1,1,0,0,1


In [74]:
# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
        k=5,
        target_attr='gold_label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.894418,0.920006,0.906513
1,RF,0.895865,0.941065,0.917707
2,SVM,0.833831,1.0,0.908721
3,LinReg,0.833831,1.0,0.908721
4,LogReg,0.833831,1.0,0.908721


In [75]:
result['drill_down_cv_stats']['precision']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f40727d8d10>,5,0.875,0.882353,0.946429,0.866667,0.901639,0.894418
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f407178ae50>,5,0.896552,0.849057,0.948276,0.896552,0.888889,0.895865
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f40727d89d0>,5,0.835821,0.757576,0.878788,0.818182,0.878788,0.833831
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f4071cc51d0>,5,0.835821,0.757576,0.878788,0.818182,0.878788,0.833831
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f4071a0db10>,5,0.835821,0.757576,0.878788,0.818182,0.878788,0.833831


In [76]:
result['drill_down_cv_stats']['recall']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f40727d8d10>,5,0.875,0.9,0.913793,0.962963,0.948276,0.920006
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f407178ae50>,5,0.928571,0.9,0.948276,0.962963,0.965517,0.941065
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f40727d89d0>,5,1.0,1.0,1.0,1.0,1.0,1.0
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f4071cc51d0>,5,1.0,1.0,1.0,1.0,1.0,1.0
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f4071a0db10>,5,1.0,1.0,1.0,1.0,1.0,1.0


In [77]:
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f40727d8d10>,5,0.875,0.891089,0.929825,0.912281,0.92437,0.906513
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f407178ae50>,5,0.912281,0.873786,0.948276,0.928571,0.92562,0.917707
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f40727d89d0>,5,0.910569,0.862069,0.935484,0.9,0.935484,0.908721
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f4071cc51d0>,5,0.910569,0.862069,0.935484,0.9,0.935484,0.908721
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f4071a0db10>,5,0.910569,0.862069,0.935484,0.9,0.935484,0.908721


## Train and Test Matcher

In [78]:
# RandomForest did slightly better with Precision, and I think we're more interested in removing FPs than
# maximizing TPs

In [79]:
rf.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
        target_attr='gold_label')



In [80]:
#import pickle

#with open('./rf_matcher.pkl', 'wb') as f:
#    pickle.dump(rf, f)

In [81]:
# Test
test = em.extract_feature_vecs(J, 
                               feature_table=F, 
                               attrs_after='gold_label',
                               show_progress=False)

In [82]:
# Predict on Test
rf.predict(table=test, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
            target_attr='prediction', append=True, inplace=True)


Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_ALTNAMES_tokTitleSelftext,dc_ws_ALTNAMES_tokTitleSelftext,cos_ws_ALTNAMES_tokTitleSelftext,street_name_present,num_alt_matches,not_street_name,generic_in_text,gold_label,prediction
85,13555,516,3331,0.013333,0.013333,0.028072,1,1,1,0,1,1
132,23564,516,7462,0.013158,0.013158,0.025392,1,1,1,0,1,1
96,16957,569,4850,0.117647,0.117647,0.117851,1,1,0,0,1,1
107,19149,1125,3490,0.023256,0.023256,0.036274,1,1,0,0,1,1
140,27119,570,8030,0.004988,0.004988,0.014080,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
200,38190,1030,9509,0.020833,0.020833,0.038815,1,2,0,0,0,0
135,24289,563,7736,0.105263,0.105263,0.105409,1,1,0,0,1,1
401,64320,2288,3133,0.068966,0.068966,0.091287,1,1,0,0,1,1
122,22297,348,6021,0.033898,0.033898,0.049507,1,1,1,1,1,1


In [83]:
TP = ((test['gold_label'] == 1) & (test['prediction'] == 1)).sum()
FP = ((test['gold_label'] == 0) & (test['prediction'] == 1)).sum()
TN = ((test['gold_label'] == 0) & (test['prediction'] == 0)).sum()
FN = ((test['gold_label'] == 1) & (test['prediction'] == 0)).sum()
P = (test['gold_label'] == 1).sum()
N = (test['gold_label'] == 0).sum()

In [84]:
# Test Accuracy
(TP+TN)/(P+N)

0.8288288288288288

In [85]:
# Check what the matcher misclassified
G.set_index('_id').loc[test[test['gold_label'] != test['prediction']]['_id']]

Unnamed: 0_level_0,ltable_id,rtable_id,ltable_NAME,ltable_GENERIC_NAME,rtable_tokTitle,rtable_tokSelftext,gold_label,ALTNAMES,NAMETYPE
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
71745,1880,7325,abyss,synthetic cannabinoids,change icon sub,tf speaker phone maybe shouting abyss hoping someone hear us,0,ninja terraband crystal skull krazy kandy amnesia funky buddha atomic blast juicy leaf blaze buz...,STREET_NAME
10680,41,2921,pharmacy,fentanyl,adderall snortable,got orange u31 tablets 30 mg things snortable purchased mexican pharmacy,0,nyl opes duragesic fentora fire onsolis fuf tango and cash chinese actiq humid gray stuff white ...,STREET_NAME
90369,2366,8883,roids,testosterone,snorting roids,got roids prednisone 10mg crushed pill snorted feel like adderall normal,1,juice testosterone muscle builders arnolds gym candy pumpers roids weight gainers steroids stackers,STREET_NAME
53554,1339,3003,amp,methamphetamine,top 3 combos,saw post top 3 combos would say start benzos amp amphetamines,1,mamph go-go bud light shiny girl unassembled crink tupperware cold stuff jug of water stove top ...,STREET_NAME
56131,1572,3333,weed,cannabis,smoking weed taking molly,smoke weed pop molly would still get typical effects change,1,queen anns lace root love weed herb shoes gigi dimba ditch weed indian hay dro ganja gato panama...,STREET_NAME
45949,1792,1152,blue dream,cannabis,29 3 thc,hit plug asked som blue dream said tested 29 3 thc good true,1,queen anns lace root love weed herb shoes gigi dimba ditch weed indian hay dro ganja gato panama...,STREET_NAME
81333,1572,8598,weed,cannabis,edm high underrated,listening edm high usually ties mdma smoking weed seriously underrated good,1,queen anns lace root love weed herb shoes gigi dimba ditch weed indian hay dro ganja gato panama...,STREET_NAME
66071,1302,6684,rocket fuel,methamphetamine,best street names,funniest best street names youve seen fav rocket fuel mdma,0,mamph go-go bud light shiny girl unassembled crink tupperware cold stuff jug of water stove top ...,STREET_NAME
41192,1339,1894,amp,methamphetamine,tolerance idk,3 farmapram amp half bottle tequila im still coherent,0,mamph go-go bud light shiny girl unassembled crink tupperware cold stuff jug of water stove top ...,STREET_NAME
89513,2273,10882,dxm,dxm,everyone right,bored dxm phenibut wondering anyone else fucked something right,0,poor man’s ecstasy x dxm triple c robotripping red devils tussin dextromethorphan robo drix dext...,GENERIC_NAME


## Match Full Blocked Dataset

In [86]:
real = em.extract_feature_vecs(C1, 
                               feature_table=F, 
                               show_progress=False)

In [87]:
rf.predict(table=real, exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
            target_attr='prediction', return_probs=True, probs_attr='proba', append=True, inplace=True)
real.head()

Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_ALTNAMES_tokTitleSelftext,dc_ws_ALTNAMES_tokTitleSelftext,cos_ws_ALTNAMES_tokTitleSelftext,street_name_present,num_alt_matches,not_street_name,generic_in_text,prediction,proba
0,0,0,10,0.020202,0.020202,0.021272,1,2,1,1,1,0.9
1,1,0,51,0.034483,0.034483,0.035007,1,2,1,1,1,1.0
2,2,0,73,0.021505,0.021505,0.024254,1,1,1,1,1,1.0
3,3,0,117,0.039409,0.039409,0.041748,1,4,1,1,1,1.0
4,4,0,135,0.029963,0.029963,0.034386,1,4,1,1,1,1.0


# Create Datasets for Cypher

## Relationship table

In [None]:
# Create relationship table
rel_table = C1.merge(real[['_id', 'proba', 'prediction']], how='left', left_on='_id', right_on='_id')

In [None]:
final_rel_table = rel_table[rel_table['prediction'] == 1]
final_rel_table = final_rel_table.rename(columns={'ltable_id': 'id_names', 'rtable_id': 'id_reddit'})
final_rel_table = final_rel_table[['id_reddit', 'id_names', 'proba']]

In [None]:
final_rel_table.to_csv('./import_cypher/reddit_relations.csv', index=False)

## Final drug names

In [None]:
final_drugs_df = drugs_df.copy()
final_drugs_df = final_drugs_df.rename(columns={'NAMETYPE': 'name_type', 'NAME': 'name', 
                                                'FULL_GENERIC_NAME': 'full_generic_name',
                                                'CATEGORY': 'category', 'FDA_SCHEDULE': 'fda_schedule'})
final_drugs_df = final_drugs_df[['id', 'name', 'name_type', 'full_generic_name', 'category', 'fda_schedule']]
final_drugs_df['name_type'] = final_drugs_df['name_type'].str.lower()
final_drugs_df['full_generic_name'] = final_drugs_df['full_generic_name'].str.lower()
final_drugs_df['category'] = final_drugs_df['category'].str.lower()
final_drugs_df['fda_schedule'] = final_drugs_df['fda_schedule'].str.lower()

In [None]:
final_drugs_df.to_csv('./import_cypher/drug_names.csv', index=False)

# Final Reddit

In [None]:
from datetime import datetime

In [None]:
final_reddit = reddit_df.copy()
final_reddit['created'] = final_reddit.apply(lambda x: datetime.utcfromtimestamp(x['created']).strftime('%Y-%m-%d'),
                                             axis=1)
final_reddit['id'] = final_reddit.index
final_reddit = final_reddit[['id', 'author', 'created', 'title', 
                             'link_flair_text', 'num_comments', 'score', 'upvote_ratio', 'reddit_id']]

In [None]:
final_reddit

In [None]:
final_reddit.to_csv('./import_cypher/reddit.csv', index=False)