In [20]:
import numpy as np
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as sj
import py_entitymatching as em

from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import brown

# Drugs DF

In [33]:
#DRUG_NAMES = './data/cleaned_files/drug_names_fda_VA_filled.csv'
DRUG_NAMES = './data/cleaned_files/generic_names.xlsx'

In [35]:
# Drugs DataFrame
drugs_df = pd.read_excel(DRUG_NAMES)
drugs_df

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv
...,...,...,...,...,...,...
355,355,testosterone,generic_name,testosterone,"hormones/synthetics/modifiers,other",schedule iii
356,356,hydromorphone,generic_name,hydromorphone,opioid analgesics,schedule ii
357,357,oxymorphone,generic_name,oxymorphone hydrochloride extended-release tablets,opioid analgesics,schedule ii
358,358,methadone,generic_name,methadone hydrochloride tablets,opioid analgesics,schedule ii


In [None]:
#drugs_df['NAME'] = drugs_df['NAME'].str.lower()
#drugs_df['GENERIC_NAME'] = drugs_df['GENERIC_NAME'].str.lower()

In [37]:
#drugs_df = drugs_df.drop(['name'], axis=1).drop(drugs_df[drugs_df.NAME_TYPE != 'GENERIC_NAME'].index).reset_index(drop=True)
drugs_df.head()

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv


In [38]:
#!head -n 2 $DRUG_NAMES

In [40]:
'''
# Alternative names
alt_names = defaultdict(set)
with open(DRUG_NAMES, 'r') as f:
    header = True
    for line in f:
        if header:
            header = False
            continue
            
        line = line.lower().split(',')
        name = line[0]
        gen_name = line[1]
        
        alt_names[gen_name].add(name)
        
for k, v in alt_names.items():
    alt_names[k] = ' '.join(v)
'''

"\n# Alternative names\nalt_names = defaultdict(set)\nwith open(DRUG_NAMES, 'r') as f:\n    header = True\n    for line in f:\n        if header:\n            header = False\n            continue\n            \n        line = line.lower().split(',')\n        name = line[0]\n        gen_name = line[1]\n        \n        alt_names[gen_name].add(name)\n        \nfor k, v in alt_names.items():\n    alt_names[k] = ' '.join(v)\n"

In [26]:
#drugs_df['ALTNAMES'] = drugs_df.apply(lambda x: alt_names[x['GENERIC_NAME']], axis=1)
#drugs_df = drugs_df.rename(columns={'NAME_TYPE': 'NAMETYPE'})

In [41]:
drugs_df.head()

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv


In [42]:
drugs_df.shape

(360, 6)

## Stop words and common words

In [43]:
# Init tokenizer and nltk stopwords
tok = sm.AlphanumericTokenizer(return_set=True)
stop_words = set(stopwords.words('english')) 

In [44]:
# These overlap with some of the street names, removing
stop_words.add('drug')
stop_words.add('drugs')

In [45]:
# Common English words from Brown corpus
brown_counter = Counter()
brown_corpus = [w.lower() for w in brown.words()]
brown_words = [w for w in brown_corpus if w not in stop_words]
brown_counter.update(brown_words)

In [46]:
# Check to see which words are common
remove_count = []
for name in drugs_df['name'].unique():
    if name in brown_counter and brown_counter[name] >= 15:
        remove_count.append((name, brown_counter[name]))

In [47]:
sorted(remove_count, key = lambda x: x[1], reverse=True)

[('opium', 16)]

In [48]:
# Remove common words and stop words
# Keep opium
remove = []
for name in drugs_df['name'].unique():
    if name == 'opium':
        continue
    if (name in brown_counter and brown_counter[name] >= 15) or (name in stop_words):
        remove.append(name)

In [49]:
drugs_df = drugs_df[~drugs_df['name'].isin(remove)].copy().reset_index(drop=True)

In [50]:
drugs_df.shape

(360, 6)

# Process Reddit data

In [51]:
REDDIT_DATA = './data/usnewspaper-Jun-Aug_en_cleaned.xlsx'

In [52]:
reddit_df = pd.read_excel(REDDIT_DATA)

In [53]:
reddit_df.head()

Unnamed: 0.1,Unnamed: 0,publishdate,src,title,news
0,0,2020-06-01,https://www.prnewswire.com/news-releases/,a natural partnership: popsockets & burt`s bees collaborate on a new product,popgrip lips x burt`s bees will provide consumers with a two-in-one product that features the cl...
1,1,2020-06-01,http://www.reuters.com/,keep your distance: people queue for school and ikea in england,"warrington, england (reuters) - thousands of people across england queued up for school and ikea..."
2,2,2020-06-01,http://www.aljazeera.com/,mapping us cities where george floyd protests have erupted,"demonstrations have erupted in dozens of us cities after george floyd, an unarmed black man, die..."
3,3,2020-06-01,https://www.dallasnews.com/,enjoy the dallas museum of art’s exploration of home from the comfort of yours,"what does it mean for a house to be a home? for one thing, it means that a structure built of in..."
4,4,2020-06-01,https://www.dallasnews.com/,"target opens some dallas stores as protesters move overnight, causing damage to west end and upt...",the protests that have spread around the country are closing stores at a time when retailers are...


In [54]:
reddit_df = reddit_df.rename(columns={'id': 'reddit_id'})

In [55]:
reddit_df.shape

(443442, 5)

In [56]:
# Remove unpopular posts
#reddit_df = reddit_df[reddit_df['score'] > 0].copy().reset_index(drop=True)
#reddit_df.shape

## Stop words

In [57]:
def process_text(text, tok, stop_words, join=False):
    text = text.strip().lower().replace('\\n\\n', ' ')
    result = [w for w in tok.tokenize(text) if w not in stop_words]
    if join:
        result = ' '.join(result)
    return result

In [58]:
# Tokenize and remove stopwords from selftext and titles
reddit_df['tokTitle'] = reddit_df.apply(lambda x: process_text(x['title'], tok, stop_words), axis=1)
reddit_df['tokNews'] = reddit_df.apply(lambda x: process_text(x['news'], tok, stop_words), axis=1)

In [59]:
# Just to check the frequency of words in reddit text
reddit_counter = Counter()
reddit_df['tokTitle'].apply(reddit_counter.update)
reddit_df['tokNews'].apply(reddit_counter.update)
#print(reddit_counter)

0         None
1         None
2         None
3         None
4         None
          ... 
443437    None
443438    None
443439    None
443440    None
443441    None
Name: tokNews, Length: 443442, dtype: object

In [60]:
# Join text
reddit_df['tokTitle'] = reddit_df.apply(lambda x: process_text(x['title'], tok, stop_words, join=True), axis=1)
reddit_df['tokNews'] = reddit_df.apply(lambda x: process_text(x['news'], tok, stop_words, join=True), axis=1)

# Entity Matching - drugs_df

name     name_type	full_generic_name	category	fda_schedule

In [61]:
l_output_attrs = ['name', 'category', 'fda_schedule']
r_output_attrs = ['tokTitle', 'tokNews']

In [62]:
A = drugs_df
B = reddit_df

In [63]:
A['id'] = A.index
B['id'] = B.index

In [64]:
em.set_key(A, 'id')
em.set_key(B, 'id')

True

In [65]:
#A['ALTNAMES'] = A['ALTNAMES'].astype(str)

In [66]:
atypes_A = em.get_attr_types(A)
atypes_B = em.get_attr_types(B)

In [67]:
[(col, atypes_A[col]) for col in A.columns]

[('id', 'numeric'),
 ('name', 'str_bt_1w_5w'),
 ('name_type', 'str_eq_1w'),
 ('full_generic_name', 'str_bt_1w_5w'),
 ('category', 'str_bt_1w_5w'),
 ('fda_schedule', 'str_bt_1w_5w')]

In [68]:
[(col, atypes_B[col]) for col in B.columns]

[('Unnamed: 0', 'numeric'),
 ('publishdate', 'str_eq_1w'),
 ('src', 'str_eq_1w'),
 ('title', 'str_gt_10w'),
 ('news', 'str_gt_10w'),
 ('tokTitle', 'str_bt_5w_10w'),
 ('tokNews', 'str_gt_10w'),
 ('id', 'numeric')]

In [69]:
em.get_attr_corres(A, B)['corres']

[('id', 'id')]

In [70]:
atypes_A['name'] = atypes_B['tokNews']
#atypes_A['GENERIC_NAME'] = atypes_B['tokNews']
atypes_B['tokTitle'] = atypes_B['tokNews']

In [71]:
block_c = em.get_attr_corres(A, B)
block_c['corres'] = [('name', 'tokTitle'), ('name', 'tokNews')]

In [72]:
toks = em.get_tokenizers_for_blocking()
toks

{'qgm_2': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram(s)>,
 'qgm_3': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram(s)>,
 'wspace': <function py_entitymatching.feature.tokenizers.tok_wspace(input_string)>,
 'alphabetic': <function py_entitymatching.feature.tokenizers.tok_alphabetic(input_string)>,
 'alphanumeric': <function py_entitymatching.feature.tokenizers.tok_alphanumeric(input_string)>,
 'dlm_dc0': <function py_entitymatching.feature.tokenizers._make_tok_delim.<locals>.tok_delim(s)>}

In [73]:
sim = em.get_sim_funs_for_blocking()
sim

{'affine': <function py_entitymatching.feature.simfunctions.affine(s1, s2)>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist(s1, s2)>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim(s1, s2)>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist(s1, s2)>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim(s1, s2)>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro(s1, s2)>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler(s1, s2)>,
 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch(s1, s2)>,
 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman(s1, s2)>,
 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff(arr1, arr2)>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard(arr1, arr2)>,
 'dice': <function py_entitymatching.feature.simfunctions

In [74]:
block_f = em.get_features(A, B, atypes_A, atypes_B, block_c, toks, sim)

In [75]:
block_f

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,name_tokTitle_jac_qgm_3_qgm_3,name,tokTitle,qgm_3,qgm_3,jaccard,<function name_tokTitle_jac_qgm_3_qgm_3 at 0x00000255AB94D798>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,name_tokTitle_cos_dlm_dc0_dlm_dc0,name,tokTitle,dlm_dc0,dlm_dc0,cosine,<function name_tokTitle_cos_dlm_dc0_dlm_dc0 at 0x00000255AB94D8B8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,name_tokNews_jac_qgm_3_qgm_3,name,tokNews,qgm_3,qgm_3,jaccard,<function name_tokNews_jac_qgm_3_qgm_3 at 0x00000255AB94DCA8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,name_tokNews_cos_dlm_dc0_dlm_dc0,name,tokNews,dlm_dc0,dlm_dc0,cosine,<function name_tokNews_cos_dlm_dc0_dlm_dc0 at 0x00000255AB94DD38>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [76]:
# Black box function that checks if NAME is exactly in the reddit text
def street_name_titleselftext_exact(x, y):
    x = x['name']
    y = y['tokTitle'] + ' ' + y['tokNews']
    return not x in y

In [77]:
# Blocker to match NAMES and reddit text
# This blocker gives us all of the candidate relationships, which are a lot since there may be multiple 
# relationships with a single Reddit post
bb = em.BlackBoxBlocker()
bb.set_black_box_function(street_name_titleselftext_exact)

In [78]:
C1 = bb.block_tables(A, B, l_output_attrs=l_output_attrs, r_output_attrs=r_output_attrs, show_progress=True,
                     n_jobs=-1)

In [79]:
C1

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_category,ltable_fda_schedule,rtable_tokTitle,rtable_tokNews
0,0,0,7490,fentanyl,opioid analgesics,schedule ii,cpj 100 press freedom violations reported us protests,media watchdog called us authorities stop targeting journalists covering protests death unarmed ...
1,1,0,7668,fentanyl,opioid analgesics,schedule ii,medical examiner private autopsy find george floyd death homicide,minneapolis medical examiner monday classified george floyd death homicide saying heart stopped ...
2,2,0,8299,fentanyl,opioid analgesics,schedule ii,george floyd died asphyxia family medical examiner autopsy,george floyd killed asphyxia due neck back compression according independent medical examiner re...
3,3,0,8592,fentanyl,opioid analgesics,schedule ii,ex boxer floyd mayweather paying george funeral services per reports,tmz sports published photo 88 500 check mayweather houston area funeral provider fort bend memor...
4,4,0,8809,fentanyl,opioid analgesics,schedule ii,medical examiner floyd heart stopped restrained,significant conditions said floyd suffered heart disease hypertension listed fentanyl intoxicati...
...,...,...,...,...,...,...,...,...
32650,32650,358,382170,methadone,opioid analgesics,schedule ii,covid 19 pandemic could worsening addictions overdoses,covid 19 pandemic disrupted routines vast majority americans left many isolated alone battling a...
32651,32651,358,395065,methadone,opioid analgesics,schedule ii,former doctor pleads guilty illegally prescribing pain pills officials say,former doctor amityville pleaded guilty monday illegally prescribing thousands highly addictive ...
32652,32652,358,396384,methadone,opioid analgesics,schedule ii,hotel lucerne upper west side converts temporary homeless shelter,sign special edition newsletter get daily update coronavirus pandemic normally live let upper we...
32653,32653,358,437497,methadone,opioid analgesics,schedule ii,people color cited often skipping fares mbta,response globe public records request transit police provided two months worth fare evasion cita...


 # Save C1 to csv

In [54]:
C1.to_csv('C1_output.csv',index=False)
#C1 = pd.read_csv('C1_output.csv')

In [55]:
# Check to see what didn't get matched
missing_reddit = ~reddit_df['id'].isin(C1['rtable_id'])

In [57]:
reddit_df[missing_reddit]['tokNews']

264                                                                            find money manager data online
268                                                                            find money manager data online
351                              breeding horses fairly simple follow basic guide full sized horse ranch time
376                              breeding horses fairly simple follow basic guide full sized horse ranch time
390       santa monica extended curfew 8 p sunday 5 30 monday protesters george floyd death marched peacef...
                                                         ...                                                 
443256                                                                            permission edit video close
443283                                                        choose plan right digital access print delivery
443286                                                        choose plan right digital access print delivery
443392    

In [60]:
# Blocker to get a subset of the above for manual labeling
rb = em.RuleBasedBlocker()
rb.add_rule(['GENERIC_NAME_tokNews_cos_dlm_dc0_dlm_dc0 	(ltuple, rtuple) < 0.3'], block_f)

'_rule_0'

In [61]:
C2 = rb.block_candset(C1, show_progress=True, n_jobs=-1)

In [65]:
#C2.to_csv('C2_output.csv', index=False)
C2 = pd.read_csv('C2_output.csv')

# Label Block

In [None]:
# Generate manually labeled data set

G = em.label_table(C2, 'gold_label')
G.to_csv('./C2_labeled_news.csv', index=False)

Column name (gold_label) is not present in dataframe


In [64]:
pip install PyQt5

Collecting PyQt5
  Downloading PyQt5-5.15.2-5.15.2-cp35.cp36.cp37.cp38.cp39-none-win_amd64.whl (56.9 MB)
Collecting PyQt5-sip<13,>=12.8
  Using cached PyQt5_sip-12.8.1-cp37-cp37m-win_amd64.whl (62 kB)
Installing collected packages: PyQt5-sip, PyQt5
Successfully installed PyQt5-5.15.2 PyQt5-sip-12.8.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\programdata\anaconda3\envs\myenv\python.exe -m pip install --upgrade pip' command.


In [None]:
#G = em.read_csv_metadata('./C2_labeled.csv', key='_id',
#                      fk_ltable='ltable_id', fk_rtable='rtable_id',
#                      ltable=A, rtable=B)

In [None]:
# I needed to add features that weren't included in the labeled data set

#G.merge(A[['id', 'ALTNAMES']], how='left', left_on='ltable_id', right_on='id').drop(columns=['id']).to_csv('./C2_labeled_2.csv', index=False)
#G.merge(A[['id', 'NAMETYPE']], how='left', left_on='ltable_id', right_on='id').drop(columns=['id']).to_csv('./C2_labeled_3.csv', index=False)

In [50]:
# Final labeled set with added features
G = em.read_csv_metadata('./C2_labeled_3.csv', key='_id',
                      fk_ltable='ltable_id', fk_rtable='rtable_id',
                      ltable=A, rtable=B)

Metadata file is not present in the given path; proceeding to read the csv file.


# Matching

In [51]:
# Split data into train and test
IJ = em.split_train_test(G, train_proportion=0.6, random_state=0)
I = IJ['train']
J = IJ['test']

In [52]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

In [53]:
A = A.rename(columns={'NAME_TYPE': 'NAMETYPE'})

In [54]:
A.columns

Index(['NAME', 'GENERIC_NAME', 'NAMETYPE', 'FULL_GENERIC_NAME', 'FDA_SCHEDULE',
       'VA_CLASS', 'VA_CATEGORY', 'CATEGORY', 'ALTNAMES', 'id'],
      dtype='object')

In [55]:
B.columns

Index(['author', 'created', 'link_flair_text', 'reddit_id', 'num_comments',
       'num_crossposts', 'score', 'selftext', 'title', 'total_awards_received',
       'upvote_ratio', 'Week', 'tokTitle', 'tokSelftext', 'id'],
      dtype='object')

In [56]:
A = A.drop(columns=['FULL_GENERIC_NAME', 'FDA_SCHEDULE', 'VA_CLASS', 'VA_CATEGORY', 'CATEGORY'])
B = B.drop(columns=['author', 'link_flair_text', 'num_comments', 'num_crossposts', 'score', 'selftext', 'title',
                    'total_awards_received', 'upvote_ratio', 'Week'])

In [57]:
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

In [58]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x7fa8c21a4cb0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,id_id_anm,id,id,,,abs_norm,<function id_id_anm at 0x7fa8c86514d0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,id_id_lev_dist,id,id,,,lev_dist,<function id_id_lev_dist at 0x7fa8c18de0e0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,id_id_lev_sim,id,id,,,lev_sim,<function id_id_lev_sim at 0x7fa8c18de200>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


## Generate Features

In [59]:
# Matching NAME to the Reddit text using string and token similarity doesn't really make sense since the
# Reddit text is long while the names are 1-2 words. There may be some value in similarity scores between all
# of the ALTNAMES and the Reddit text, especially in cases where multiple alt words are used in the text. However,
# the length of a post or a short list of alt names may arbitrarily lower these scores.

In [60]:
feature_string = '''dice(wspace(ltuple['ALTNAMES']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokSelftext'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'jac_ws_ALTNAMES_tokTitleSelftext', feature)

True

In [61]:
feature_string = '''dice(wspace(ltuple['ALTNAMES']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokSelftext'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'dc_ws_ALTNAMES_tokTitleSelftext', feature)

True

In [62]:
feature_string = '''cosine(wspace(ltuple['ALTNAMES']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokSelftext'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'cos_ws_ALTNAMES_tokTitleSelftext', feature)

True

In [63]:
# NAME in Reddit text
def street_name_titleselftext_exact_feature(x, y):
    x = x['NAME']
    y = y['tokTitle'] + ' ' + y['tokSelftext']
    return int(x in y)

In [64]:
em.add_blackbox_feature(F, 'street_name_present', street_name_titleselftext_exact_feature)

True

In [65]:
# Number of alt names in Reddit text
def num_alt_matches(x, y):
    x = set(x['ALTNAMES'].split(' '))
    y = set(y['tokTitle'].split(' ')).union(set(y['tokSelftext'].split(' ')))
    return len(x.intersection(y))

In [66]:
em.add_blackbox_feature(F, 'num_alt_matches', num_alt_matches)

True

In [67]:
# Name type is not street name (to give weight to brand and generic names)
def not_street_name(x, y):
    return int(not x['NAMETYPE'] == 'STREET_NAME')

In [68]:
em.add_blackbox_feature(F, 'not_street_name', not_street_name)

True

In [69]:
# Generic name is in Reddit text
def generic_in_text(x, y):
    x = x['GENERIC_NAME']
    y = y['tokTitle'] + ' ' + y['tokSelftext']
    return int(x in y)

In [70]:
em.add_blackbox_feature(F, 'generic_in_text', generic_in_text)

True

In [71]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x7fa8c21a4cb0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,id_id_anm,id,id,,,abs_norm,<function id_id_anm at 0x7fa8c86514d0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,id_id_lev_dist,id,id,,,lev_dist,<function id_id_lev_dist at 0x7fa8c18de0e0>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,id_id_lev_sim,id,id,,,lev_sim,<function id_id_lev_sim at 0x7fa8c18de200>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,jac_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7fa8c18de8c0>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
5,dc_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7fa8c18de560>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
6,cos_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7fa8c18dd4d0>,"def fn(ltuple, rtuple):\n return cosine(wspace(ltuple['ALTNAMES']),\n ...",False
7,street_name_present,,,,,,<function street_name_titleselftext_exact_feature at 0x7fa8c1f9eb00>,,False
8,num_alt_matches,,,,,,<function num_alt_matches at 0x7fa8c1f9e710>,,False
9,not_street_name,,,,,,<function not_street_name at 0x7fa8c1f93680>,,False


In [72]:
# Remove id based features
F = F[~F['feature_name'].str.contains('id')].copy().reset_index(drop=True)

In [73]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,jac_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7fa8c18de8c0>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
1,dc_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7fa8c18de560>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['ALTNAMES']),\n ...",False
2,cos_ws_ALTNAMES_tokTitleSelftext,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x7fa8c18dd4d0>,"def fn(ltuple, rtuple):\n return cosine(wspace(ltuple['ALTNAMES']),\n ...",False
3,street_name_present,,,,,,<function street_name_titleselftext_exact_feature at 0x7fa8c1f9eb00>,,False
4,num_alt_matches,,,,,,<function num_alt_matches at 0x7fa8c1f9e710>,,False
5,not_street_name,,,,,,<function not_street_name at 0x7fa8c1f93680>,,False
6,generic_in_text,,,,,,<function generic_in_text at 0x7fa8c1f93200>,,False


## Select Matcher

In [74]:
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False)

In [75]:
H.head()

Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_ALTNAMES_tokTitleSelftext,dc_ws_ALTNAMES_tokTitleSelftext,cos_ws_ALTNAMES_tokTitleSelftext,street_name_present,num_alt_matches,not_street_name,generic_in_text,gold_label
68,10832,134,4016,0.033113,0.033113,0.065089,1,5,0,1,1
210,39484,1126,8721,0.065217,0.065217,0.086031,1,3,0,0,1
20,2807,368,2163,0.046512,0.046512,0.051848,1,1,1,0,1
310,51000,2196,652,0.0,0.0,0.0,0,0,0,0,1
342,52955,1276,3532,0.011765,0.011765,0.029604,1,1,0,0,1


In [76]:
# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
        k=5,
        target_attr='gold_label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.896476,0.912038,0.903968
1,RF,0.893085,0.945115,0.918201
2,SVM,0.833831,1.0,0.908721
3,LinReg,0.833831,1.0,0.908721
4,LogReg,0.833831,1.0,0.908721


In [77]:
result['drill_down_cv_stats']['precision']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7fa8c9a22210>,5,0.87931,0.88,0.946429,0.875,0.901639,0.896476
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7fa8c9a22590>,5,0.862069,0.854545,0.933333,0.910714,0.904762,0.893085
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7fa8c9a22410>,5,0.835821,0.757576,0.878788,0.818182,0.878788,0.833831
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7fa8c9aa9b90>,5,0.835821,0.757576,0.878788,0.818182,0.878788,0.833831
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7fa8c9acfcd0>,5,0.835821,0.757576,0.878788,0.818182,0.878788,0.833831


In [78]:
result['drill_down_cv_stats']['recall']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7fa8c9a22210>,5,0.910714,0.88,0.913793,0.907407,0.948276,0.912038
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7fa8c9a22590>,5,0.892857,0.94,0.965517,0.944444,0.982759,0.945115
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7fa8c9a22410>,5,1.0,1.0,1.0,1.0,1.0,1.0
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7fa8c9aa9b90>,5,1.0,1.0,1.0,1.0,1.0,1.0
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7fa8c9acfcd0>,5,1.0,1.0,1.0,1.0,1.0,1.0


In [79]:
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7fa8c9a22210>,5,0.894737,0.88,0.929825,0.890909,0.92437,0.903968
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7fa8c9a22590>,5,0.877193,0.895238,0.949153,0.927273,0.942149,0.918201
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7fa8c9a22410>,5,0.910569,0.862069,0.935484,0.9,0.935484,0.908721
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7fa8c9aa9b90>,5,0.910569,0.862069,0.935484,0.9,0.935484,0.908721
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7fa8c9acfcd0>,5,0.910569,0.862069,0.935484,0.9,0.935484,0.908721


## Train and Test Matcher

In [80]:
# RandomForest did slightly better with Precision, and I think we're more interested in removing FPs than
# maximizing TPs

In [81]:
rf.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
        target_attr='gold_label')



In [82]:
#import pickle

#with open('./rf_matcher.pkl', 'wb') as f:
#    pickle.dump(rf, f)

In [83]:
# Test
test = em.extract_feature_vecs(J, 
                               feature_table=F, 
                               attrs_after='gold_label',
                               show_progress=False)

In [84]:
# Predict on Test
rf.predict(table=test, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
            target_attr='prediction', append=True, inplace=True)


Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_ALTNAMES_tokTitleSelftext,dc_ws_ALTNAMES_tokTitleSelftext,cos_ws_ALTNAMES_tokTitleSelftext,street_name_present,num_alt_matches,not_street_name,generic_in_text,gold_label,prediction
85,13555,516,3331,0.013333,0.013333,0.028072,1,1,1,0,1,1
132,23564,516,7462,0.013158,0.013158,0.025392,1,1,1,0,1,1
96,16957,569,4850,0.117647,0.117647,0.117851,1,1,0,0,1,1
107,19149,1125,3490,0.023256,0.023256,0.036274,1,1,0,0,1,1
140,27119,570,8030,0.004988,0.004988,0.014080,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
200,38190,1030,9509,0.020833,0.020833,0.038815,1,2,0,0,0,0
135,24289,563,7736,0.105263,0.105263,0.105409,1,1,0,0,1,1
401,64320,2288,3133,0.066667,0.066667,0.083333,0,1,0,0,1,1
122,22297,348,6021,0.033898,0.033898,0.049507,1,1,1,1,1,1


In [85]:
TP = ((test['gold_label'] == 1) & (test['prediction'] == 1)).sum()
FP = ((test['gold_label'] == 0) & (test['prediction'] == 1)).sum()
TN = ((test['gold_label'] == 0) & (test['prediction'] == 0)).sum()
FN = ((test['gold_label'] == 1) & (test['prediction'] == 0)).sum()
P = (test['gold_label'] == 1).sum()
N = (test['gold_label'] == 0).sum()

In [86]:
# Test Accuracy
(TP+TN)/(P+N)

0.8243243243243243

In [87]:
# Check what the matcher misclassified
G.set_index('_id').loc[test[test['gold_label'] != test['prediction']]['_id']]

Unnamed: 0_level_0,ltable_id,rtable_id,ltable_NAME,ltable_GENERIC_NAME,rtable_tokTitle,rtable_tokSelftext,gold_label,ALTNAMES,NAMETYPE
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
71745,1880,7325,abyss,synthetic cannabinoids,change icon sub,tf speaker phone maybe shouting abyss hoping someone hear us,0,ninja terraband crystal skull krazy kandy amnesia funky buddha atomic blast juicy leaf blaze buz...,STREET_NAME
10680,41,2921,pharmacy,fentanyl,adderall snortable,got orange u31 tablets 30 mg things snortable purchased mexican pharmacy,0,nyl opes duragesic fentora fire onsolis fuf tango and cash chinese actiq humid gray stuff white ...,STREET_NAME
53554,1339,3003,amp,methamphetamine,top 3 combos,saw post top 3 combos would say start benzos amp amphetamines,1,mamph go-go bud light shiny girl unassembled crink tupperware cold stuff jug of water stove top ...,STREET_NAME
56131,1572,3333,weed,cannabis,smoking weed taking molly,smoke weed pop molly would still get typical effects change,1,queen anns lace root love weed herb shoes gigi dimba ditch weed indian hay dro ganja gato panama...,STREET_NAME
30566,41,9740,pharmacy,fentanyl,safe cold water extraction method cocodamol,recently found buy cocodamol pharmacy uk safe cold water extraction method,0,nyl opes duragesic fentora fire onsolis fuf tango and cash chinese actiq humid gray stuff white ...,STREET_NAME
45949,1792,1152,blue dream,cannabis,29 3 thc,hit plug asked som blue dream said tested 29 3 thc good true,1,queen anns lace root love weed herb shoes gigi dimba ditch weed indian hay dro ganja gato panama...,STREET_NAME
81333,1572,8598,weed,cannabis,edm high underrated,listening edm high usually ties mdma smoking weed seriously underrated good,1,queen anns lace root love weed herb shoes gigi dimba ditch weed indian hay dro ganja gato panama...,STREET_NAME
78566,1276,10438,meth,methamphetamine,meth,really find much info meth brain one accidental usage eaten,1,mamph go-go bud light shiny girl unassembled crink tupperware cold stuff jug of water stove top ...,STREET_NAME
41192,1339,1894,amp,methamphetamine,tolerance idk,3 farmapram amp half bottle tequila im still coherent,0,mamph go-go bud light shiny girl unassembled crink tupperware cold stuff jug of water stove top ...,STREET_NAME
89513,2273,10882,dxm,dxm,everyone right,bored dxm phenibut wondering anyone else fucked something right,0,poor man’s ecstasy x dxm triple c robotripping red devils tussin dextromethorphan robo drix dext...,GENERIC_NAME


## Match Full Blocked Dataset

In [88]:
real = em.extract_feature_vecs(C1, 
                               feature_table=F, 
                               show_progress=False)

In [89]:
rf.predict(table=real, exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
            target_attr='prediction', return_probs=True, probs_attr='proba', append=True, inplace=True)
real.head()

Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_ALTNAMES_tokTitleSelftext,dc_ws_ALTNAMES_tokTitleSelftext,cos_ws_ALTNAMES_tokTitleSelftext,street_name_present,num_alt_matches,not_street_name,generic_in_text,prediction,proba
0,0,0,10,0.020202,0.020202,0.021272,1,2,1,1,1,0.9
1,1,0,51,0.034483,0.034483,0.035007,1,2,1,1,1,1.0
2,2,0,73,0.021505,0.021505,0.024254,1,1,1,1,1,1.0
3,3,0,117,0.039409,0.039409,0.041748,1,4,1,1,1,1.0
4,4,0,135,0.029963,0.029963,0.034386,1,4,1,1,1,1.0


# Create Datasets for Cypher

## Generic-Reddit Relationship Table

In [90]:
# Create relationship table
rel_table = C1.merge(real[['_id', 'proba', 'prediction']], how='left', left_on='_id', right_on='_id')

In [91]:
final_rel_table = rel_table[rel_table['prediction'] == 1]
final_rel_table = final_rel_table.rename(columns={'ltable_id': 'id_names', 'rtable_id': 'id_reddit'})
final_rel_table = final_rel_table[['id_reddit', 'id_names', 'proba']]

In [92]:
final_rel_table.to_csv('./import_cypher/reddit_relations.csv', index=False)

## Final drug names

In [93]:
final_drugs_df = drugs_df.copy()
final_drugs_df = final_drugs_df.rename(columns={'NAMETYPE': 'name_type', 'NAME': 'name', 
                                                'FULL_GENERIC_NAME': 'full_generic_name',
                                                'CATEGORY': 'category', 'FDA_SCHEDULE': 'fda_schedule'})
final_drugs_df = final_drugs_df[['id', 'name', 'name_type', 'full_generic_name', 'category', 'fda_schedule']]
final_drugs_df['name_type'] = final_drugs_df['name_type'].str.lower()
final_drugs_df['full_generic_name'] = final_drugs_df['full_generic_name'].str.lower()
final_drugs_df['category'] = final_drugs_df['category'].str.lower()
final_drugs_df['category'] = final_drugs_df['category'].fillna('unknown')
final_drugs_df['fda_schedule'] = final_drugs_df['fda_schedule'].str.lower()
final_drugs_df['fda_schedule'] = final_drugs_df['fda_schedule'].fillna('unknown')

In [94]:
final_drugs_df.to_csv('./import_cypher/drug_names.csv', index=False)

In [95]:
final_drugs_df

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,apache,street_name,fentanyl,opioid analgesics,schedule ii
2,2,birria,street_name,fentanyl,opioid analgesics,schedule ii
3,3,coca,generic_name,coca,opioid analgesics,schedule ii
4,4,blue diamond,street_name,fentanyl,opioid analgesics,schedule ii
...,...,...,...,...,...,...
2370,2370,steroids,street_name,testosterone,"hormones/synthetics/modifiers,other",schedule iii
2371,2371,hydromorphone,generic_name,hydromorphone,opioid analgesics,schedule ii
2372,2372,oxymorphone,generic_name,oxymorphone hydrochloride extended-release tablets,opioid analgesics,schedule ii
2373,2373,methadone,generic_name,methadone hydrochloride tablets,opioid analgesics,schedule ii


## Generic Names

In [96]:
generic_names_df = final_drugs_df[final_drugs_df['name_type'] == 'generic_name'].drop_duplicates(subset=['name']).copy()
generic_names_df = generic_names_df.reset_index(drop=True)
generic_names_df['id'] = generic_names_df.index

In [97]:
generic_names_df.to_csv('./import_cypher/generic_names.csv', index=False)

In [98]:
generic_names_df

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv
...,...,...,...,...,...,...
355,355,testosterone,generic_name,testosterone,"hormones/synthetics/modifiers,other",schedule iii
356,356,hydromorphone,generic_name,hydromorphone,opioid analgesics,schedule ii
357,357,oxymorphone,generic_name,oxymorphone hydrochloride extended-release tablets,opioid analgesics,schedule ii
358,358,methadone,generic_name,methadone hydrochloride tablets,opioid analgesics,schedule ii


## Names

In [99]:
alt_names_df = final_drugs_df[['id', 'name']].copy()

In [100]:
alt_names_df.to_csv('./import_cypher/alt_names.csv', index=False)

## Generic Names-Names Relationship Table

In [101]:
gen_alt_rel_table = drugs_df.merge(generic_names_df[['id','name']].rename(columns={'id': 'id_generic'}), 
                                     how='left', left_on='GENERIC_NAME', right_on='name')
gen_alt_rel_table = gen_alt_rel_table.rename(columns={'id': 'id_alt'})
gen_alt_rel_table = gen_alt_rel_table[['id_alt', 'id_generic']]
gen_alt_rel_table['id_generic'] = gen_alt_rel_table['id_generic'].astype(int)

In [109]:
gen_alt_rel_table.to_csv('./import_cypher/gen_alt_rel_table.csv', index=False)

# Final Reddit

In [104]:
from datetime import datetime

In [105]:
final_reddit = reddit_df.copy()
final_reddit['created'] = final_reddit.apply(lambda x: datetime.utcfromtimestamp(x['created']).strftime('%Y-%m-%d'),
                                             axis=1)
final_reddit['link_flair_text'] = final_reddit['link_flair_text'].fillna('none')
final_reddit['id'] = final_reddit.index
final_reddit = final_reddit[['id', 'author', 'created', 'title', 
                             'link_flair_text', 'num_comments', 'score', 'upvote_ratio', 'reddit_id']]

In [106]:
final_reddit

Unnamed: 0,id,author,created,title,link_flair_text,num_comments,score,upvote_ratio,reddit_id
0,0,Machinexa2,2020-06-22,Cannabis + Betel/Areca nut + Bacopa Monnieri = AWESOME,I :love: Drugs,0,1,1.00,hdn9ur
1,1,ForthName,2020-06-22,Do not sleep on acid,none,17,1,1.00,hdn7sj
2,2,oihavequestions,2020-06-22,Will adderal help someone pass a drug test?,none,1,1,1.00,hdn5nl
3,3,xxxPeniBanini,2020-06-22,NEED HELP WITH METHYLPHENIDATE,Stimulants,1,1,1.00,hdn2dx
4,4,SnooPets8599,2020-06-22,How to package drugs and send to another country,none,1,1,1.00,hdmzdg
...,...,...,...,...,...,...,...,...,...
11224,11224,cosmic_her0,2020-06-08,I’m an alcoholic.,none,252,1116,0.98,gyv085
11225,11225,snowverdose69,2020-06-08,stuck in a loop,MDMA,2,1,0.67,gyuz6k
11226,11226,Zemm_,2020-06-08,"Since I smoked weed on acid, my trips haven't been the same, why?",Psychedelics,5,10,0.86,gyuyg4
11227,11227,gucciwillis,2020-06-08,when will i feel normal again (after meth),Stimulants,8,2,1.00,gyuw94


In [108]:
final_reddit.to_csv('./import_cypher/reddit.csv', index=False)