In [1]:
import numpy as np
import pandas as pd
import py_stringmatching as sm
import py_stringsimjoin as sj
import py_entitymatching as em

from collections import defaultdict
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import brown

# Drugs DF

In [2]:
#DRUG_NAMES = './data/cleaned_files/drug_names_fda_VA_filled.csv'
DRUG_NAMES = './data/cleaned_files/generic_names.xlsx'

In [3]:
# Drugs DataFrame
drugs_df = pd.read_excel(DRUG_NAMES)
drugs_df

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv
...,...,...,...,...,...,...
355,355,testosterone,generic_name,testosterone,"hormones/synthetics/modifiers,other",schedule iii
356,356,hydromorphone,generic_name,hydromorphone,opioid analgesics,schedule ii
357,357,oxymorphone,generic_name,oxymorphone hydrochloride extended-release tablets,opioid analgesics,schedule ii
358,358,methadone,generic_name,methadone hydrochloride tablets,opioid analgesics,schedule ii


In [4]:
#drugs_df['NAME'] = drugs_df['NAME'].str.lower()
#drugs_df['GENERIC_NAME'] = drugs_df['GENERIC_NAME'].str.lower()

In [5]:
#drugs_df = drugs_df.drop(['name'], axis=1).drop(drugs_df[drugs_df.NAME_TYPE != 'GENERIC_NAME'].index).reset_index(drop=True)
drugs_df.head()

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv


In [6]:
#!head -n 2 $DRUG_NAMES

In [7]:
'''
# Alternative names
alt_names = defaultdict(set)
with open(DRUG_NAMES, 'r') as f:
    header = True
    for line in f:
        if header:
            header = False
            continue
            
        line = line.lower().split(',')
        name = line[0]
        gen_name = line[1]
        
        alt_names[gen_name].add(name)
        
for k, v in alt_names.items():
    alt_names[k] = ' '.join(v)
'''

"\n# Alternative names\nalt_names = defaultdict(set)\nwith open(DRUG_NAMES, 'r') as f:\n    header = True\n    for line in f:\n        if header:\n            header = False\n            continue\n            \n        line = line.lower().split(',')\n        name = line[0]\n        gen_name = line[1]\n        \n        alt_names[gen_name].add(name)\n        \nfor k, v in alt_names.items():\n    alt_names[k] = ' '.join(v)\n"

In [8]:
#drugs_df['ALTNAMES'] = drugs_df.apply(lambda x: alt_names[x['GENERIC_NAME']], axis=1)
#drugs_df = drugs_df.rename(columns={'NAME_TYPE': 'NAMETYPE'})

In [9]:
drugs_df.head()

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv


In [10]:
drugs_df.shape

(360, 6)

## Stop words and common words

In [11]:
# Init tokenizer and nltk stopwords
tok = sm.AlphanumericTokenizer(return_set=True)
stop_words = set(stopwords.words('english')) 

In [12]:
# These overlap with some of the street names, removing
stop_words.add('drug')
stop_words.add('drugs')

In [13]:
# Common English words from Brown corpus
brown_counter = Counter()
brown_corpus = [w.lower() for w in brown.words()]
brown_words = [w for w in brown_corpus if w not in stop_words]
brown_counter.update(brown_words)

In [14]:
# Check to see which words are common
remove_count = []
for name in drugs_df['name'].unique():
    if name in brown_counter and brown_counter[name] >= 15:
        remove_count.append((name, brown_counter[name]))

In [15]:
sorted(remove_count, key = lambda x: x[1], reverse=True)

[('opium', 16)]

In [16]:
# Remove common words and stop words
# Keep opium
remove = []
for name in drugs_df['name'].unique():
    if name == 'opium':
        continue
    if (name in brown_counter and brown_counter[name] >= 15) or (name in stop_words):
        remove.append(name)

In [17]:
drugs_df = drugs_df[~drugs_df['name'].isin(remove)].copy().reset_index(drop=True)

In [18]:
drugs_df.shape

(360, 6)

# Process Reddit data

In [19]:
REDDIT_DATA = './data/usnewspaper-Jun-Aug_en_cleaned_relevantNews_v2.csv'

In [20]:
# reddit_df = pd.read_excel(REDDIT_DATA)
reddit_df = pd.read_csv(REDDIT_DATA)

In [21]:
reddit_df.head()

Unnamed: 0,publishdate,src,title,news,src_name
0,2020-06-01,https://www.prnewswire.com/news-releases/,a natural partnership: popsockets & burt`s bees collaborate on a new product,popgrip lips x burt`s bees will provide consumers with a two-in-one product that features the cl...,prnewswire
1,2020-06-01,http://www.reuters.com/,keep your distance: people queue for school and ikea in england,"warrington, england (reuters) - thousands of people across england queued up for school and ikea...",reuters
2,2020-06-01,http://www.aljazeera.com/,mapping us cities where george floyd protests have erupted,"demonstrations have erupted in dozens of us cities after george floyd, an unarmed black man, die...",aljazeera
3,2020-06-01,https://www.dallasnews.com/,enjoy the dallas museum of art's exploration of home from the comfort of yours,"what does it mean for a house to be a home? for one thing, it means that a structure built of in...",dallasnews
4,2020-06-01,https://www.dallasnews.com/,"target opens some dallas stores as protesters move overnight, causing damage to west end and upt...",the protests that have spread around the country are closing stores at a time when retailers are...,dallasnews


In [22]:
reddit_df = reddit_df.rename(columns={'id': 'reddit_id'})

In [23]:
reddit_df.shape

(293627, 5)

In [24]:
# Remove unpopular posts
#reddit_df = reddit_df[reddit_df['score'] > 0].copy().reset_index(drop=True)
#reddit_df.shape

## Stop words

In [25]:
def process_text(text, tok, stop_words, join=False):
    text = text.strip().lower().replace('\\n\\n', ' ')
    result = [w for w in tok.tokenize(text) if w not in stop_words]
    if join:
        result = ' '.join(result)
    return result

In [26]:
# Tokenize and remove stopwords from selftext and titles
reddit_df['tokTitle'] = reddit_df.apply(lambda x: process_text(x['title'], tok, stop_words), axis=1)
reddit_df['tokNews'] = reddit_df.apply(lambda x: process_text(x['news'], tok, stop_words), axis=1)

In [27]:
# Just to check the frequency of words in reddit text
reddit_counter = Counter()
reddit_df['tokTitle'].apply(reddit_counter.update)
reddit_df['tokNews'].apply(reddit_counter.update)
#print(reddit_counter)

0         None
1         None
2         None
3         None
4         None
          ... 
293622    None
293623    None
293624    None
293625    None
293626    None
Name: tokNews, Length: 293627, dtype: object

In [28]:
# Join text
reddit_df['tokTitle'] = reddit_df.apply(lambda x: process_text(x['title'], tok, stop_words, join=True), axis=1)
reddit_df['tokNews'] = reddit_df.apply(lambda x: process_text(x['news'], tok, stop_words, join=True), axis=1)

# Entity Matching - drugs_df

name     name_type	full_generic_name	category	fda_schedule

In [29]:
l_output_attrs = ['name', 'category', 'fda_schedule']
r_output_attrs = ['tokTitle', 'tokNews']

In [30]:
A = drugs_df
B = reddit_df

In [31]:
A['id'] = A.index
B['id'] = B.index

In [32]:
em.set_key(A, 'id')
em.set_key(B, 'id')

True

In [33]:
#A['ALTNAMES'] = A['ALTNAMES'].astype(str)

In [34]:
atypes_A = em.get_attr_types(A)
atypes_B = em.get_attr_types(B)

  if returned_type == bool or returned_type == pd.np.bool_:


In [35]:
[(col, atypes_A[col]) for col in A.columns]

[('id', 'numeric'),
 ('name', 'str_bt_1w_5w'),
 ('name_type', 'str_eq_1w'),
 ('full_generic_name', 'str_bt_1w_5w'),
 ('category', 'str_bt_1w_5w'),
 ('fda_schedule', 'str_bt_1w_5w')]

In [36]:
[(col, atypes_B[col]) for col in B.columns]

[('publishdate', 'str_eq_1w'),
 ('src', 'str_eq_1w'),
 ('title', 'str_gt_10w'),
 ('news', 'str_gt_10w'),
 ('src_name', 'str_eq_1w'),
 ('tokTitle', 'str_bt_5w_10w'),
 ('tokNews', 'str_gt_10w'),
 ('id', 'numeric')]

In [37]:
em.get_attr_corres(A, B)['corres']

[('id', 'id')]

In [38]:
atypes_A['name'] = atypes_B['tokNews']
#atypes_A['GENERIC_NAME'] = atypes_B['tokNews']
atypes_B['tokTitle'] = atypes_B['tokNews']

In [39]:
block_c = em.get_attr_corres(A, B)
block_c['corres'] = [('name', 'tokTitle'), ('name', 'tokNews')]

In [40]:
toks = em.get_tokenizers_for_blocking()
toks

{'qgm_2': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'qgm_3': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'wspace': <function py_entitymatching.feature.tokenizers.tok_wspace>,
 'alphabetic': <function py_entitymatching.feature.tokenizers.tok_alphabetic>,
 'alphanumeric': <function py_entitymatching.feature.tokenizers.tok_alphanumeric>,
 'dlm_dc0': <function py_entitymatching.feature.tokenizers._make_tok_delim.<locals>.tok_delim>}

In [41]:
sim = em.get_sim_funs_for_blocking()
sim

{'affine': <function py_entitymatching.feature.simfunctions.affine>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,
 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,
 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>,
 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,
 'dice': <function py_entitymatching.feature.simfunctions.dice>,
 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,
 'cosine'

In [42]:
block_f = em.get_features(A, B, atypes_A, atypes_B, block_c, toks, sim)

In [43]:
block_f

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,name_tokTitle_jac_qgm_3_qgm_3,name,tokTitle,qgm_3,qgm_3,jaccard,<function name_tokTitle_jac_qgm_3_qgm_3 at 0x000002280FE12828>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,name_tokTitle_cos_dlm_dc0_dlm_dc0,name,tokTitle,dlm_dc0,dlm_dc0,cosine,<function name_tokTitle_cos_dlm_dc0_dlm_dc0 at 0x000002280FE12B88>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,name_tokNews_jac_qgm_3_qgm_3,name,tokNews,qgm_3,qgm_3,jaccard,<function name_tokNews_jac_qgm_3_qgm_3 at 0x000002280FE12CA8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,name_tokNews_cos_dlm_dc0_dlm_dc0,name,tokNews,dlm_dc0,dlm_dc0,cosine,<function name_tokNews_cos_dlm_dc0_dlm_dc0 at 0x000002280FE12DC8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [44]:
# Black box function that checks if NAME is exactly in the reddit text
def street_name_titleselftext_exact(x, y):
    x = x['name']
    y = y['tokTitle'] + ' ' + y['tokNews']
    return not x in y

In [45]:
# Blocker to match NAMES and reddit text
# This blocker gives us all of the candidate relationships, which are a lot since there may be multiple 
# relationships with a single Reddit post
bb = em.BlackBoxBlocker()
bb.set_black_box_function(street_name_titleselftext_exact)

In [46]:
C1 = bb.block_tables(A, B, l_output_attrs=l_output_attrs, r_output_attrs=r_output_attrs, show_progress=True,
                     n_jobs=-1)

  l_splits = pd.np.array_split(l_df, m)
  r_splits = pd.np.array_split(r_df, n)


In [47]:
C1

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_category,ltable_fda_schedule,rtable_tokTitle,rtable_tokNews
0,0,0,4956,fentanyl,opioid analgesics,schedule ii,cpj 100 press freedom violations reported us protests,media watchdog called us authorities stop targeting journalists covering protests death unarmed ...
1,1,0,5112,fentanyl,opioid analgesics,schedule ii,medical examiner private autopsy find george floyd death homicide,minneapolis medical examiner monday classified george floyd death homicide saying heart stopped ...
2,2,0,5604,fentanyl,opioid analgesics,schedule ii,george floyd died asphyxia family medical examiner autopsy,george floyd killed asphyxia due neck back compression according independent medical examiner re...
3,3,0,5862,fentanyl,opioid analgesics,schedule ii,ex boxer floyd mayweather paying george funeral services per reports,tmz sports published photo 88 500 check mayweather houston area funeral provider fort bend memor...
4,4,0,6064,fentanyl,opioid analgesics,schedule ii,medical examiner floyd heart stopped restrained,significant conditions said floyd suffered heart disease hypertension listed fentanyl intoxicati...
...,...,...,...,...,...,...,...,...
25146,25146,358,253735,methadone,opioid analgesics,schedule ii,covid 19 pandemic could worsening addictions overdoses,covid 19 pandemic disrupted routines vast majority americans left many isolated alone battling a...
25147,25147,358,261982,methadone,opioid analgesics,schedule ii,former doctor pleads guilty illegally prescribing pain pills officials say,former doctor amityville pleaded guilty monday illegally prescribing thousands highly addictive ...
25148,25148,358,262788,methadone,opioid analgesics,schedule ii,hotel lucerne upper west side converts temporary homeless shelter,sign special edition newsletter get daily update coronavirus pandemic normally live let upper we...
25149,25149,358,289991,methadone,opioid analgesics,schedule ii,people color cited often skipping fares mbta,response globe public records request transit police provided two months worth fare evasion cita...


 # Save C1 to csv

In [48]:
C1.to_csv('C1_output.csv',index=False)
#C1 = pd.read_csv('C1_output.csv')

In [49]:
# Check to see what didn't get matched
missing_reddit = ~reddit_df['id'].isin(C1['rtable_id'])

In [50]:
reddit_df[missing_reddit]['tokNews']

0         popgrip lips x burt bees provide consumers two one product features classic functionality combin...
1         warrington england reuters thousands people across queued school ikea monday british government ...
2         demonstrations erupted dozens us cities george floyd unarmed black man died police custody thous...
5         canyon ranch bills place guests get away stress modern life days includes global pandemic shutte...
6         past week protesters converged city streets across country including dallas voice anger death ge...
                                                         ...                                                 
293621    oakdale minn aug 3 2020 prnewswire st croix hospice announced today mandy cogswell rn chief clin...
293623    fifth year wards 10 best ux competition focuses user experience driver interacts vehicle well ea...
293624    atlanta aug 2 2020 prnewswire tropical storm isaias tracks along southeastern coastline includin...
293625    

In [68]:
# Blocker to get a subset of the above for manual labeling
rb = em.RuleBasedBlocker()
rb.add_rule(['name_tokNews_cos_dlm_dc0_dlm_dc0 	(ltuple, rtuple) < 0.1'], block_f)

'_rule_0'

In [69]:
C2 = rb.block_candset(C1, show_progress=True, n_jobs=-1)

  c_splits = pd.np.array_split(c_df, n_procs)


In [70]:
# C2.to_csv('C2_output.csv', index=False)
# C2 = pd.read_csv_metadata('C2_output.csv')

# Label Block

In [71]:
# # Generate manually labeled data set

# G = em.label_table(C2, 'gold_label')
# G.to_csv('./C2_labeled_news.csv', index=False)

Column name (gold_label) is not present in dataframe


In [68]:
# !pip install PyQt5

In [None]:
#G = em.read_csv_metadata('./C2_labeled.csv', key='_id',
#                      fk_ltable='ltable_id', fk_rtable='rtable_id',
#                      ltable=A, rtable=B)

In [None]:
# I needed to add features that weren't included in the labeled data set

#G.merge(A[['id', 'ALTNAMES']], how='left', left_on='ltable_id', right_on='id').drop(columns=['id']).to_csv('./C2_labeled_2.csv', index=False)
#G.merge(A[['id', 'NAMETYPE']], how='left', left_on='ltable_id', right_on='id').drop(columns=['id']).to_csv('./C2_labeled_3.csv', index=False)

In [72]:
# Final labeled set with added features
G = em.read_csv_metadata('./C2_labeled_news.csv', key='_id',
                      fk_ltable='ltable_id', fk_rtable='rtable_id',
                      ltable=A, rtable=B)

Metadata file is not present in the given path; proceeding to read the csv file.


# Matching

In [73]:
# Split data into train and test
IJ = em.split_train_test(G, train_proportion=0.6, random_state=0)
I = IJ['train']
J = IJ['test']

  idx_values = pd.np.array(labeled_data.index.values)


In [74]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

In [53]:
# A = A.rename(columns={'NAME_TYPE': 'NAMETYPE'})

In [76]:
A.columns

Index(['id', 'name', 'name_type', 'full_generic_name', 'category',
       'fda_schedule'],
      dtype='object')

In [77]:
B.columns

Index(['publishdate', 'src', 'title', 'news', 'src_name', 'tokTitle',
       'tokNews', 'id'],
      dtype='object')

In [78]:
A = A.drop(columns=['full_generic_name', 'category','fda_schedule'])
B = B.drop(columns=['src', 'title', 'news', 'src_name'])

In [79]:
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

  if returned_type == bool or returned_type == pd.np.bool_:


In [80]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x000002281B231CA8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,id_id_anm,id,id,,,abs_norm,<function id_id_anm at 0x0000022815F13708>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,id_id_lev_dist,id,id,,,lev_dist,<function id_id_lev_dist at 0x0000022815F131F8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,id_id_lev_sim,id,id,,,lev_sim,<function id_id_lev_sim at 0x0000022815F13828>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


## Generate Features

In [59]:
# Matching NAME to the Reddit text using string and token similarity doesn't really make sense since the
# Reddit text is long while the names are 1-2 words. There may be some value in similarity scores between all
# of the ALTNAMES and the Reddit text, especially in cases where multiple alt words are used in the text. However,
# the length of a post or a short list of alt names may arbitrarily lower these scores.

In [81]:
feature_string = '''dice(wspace(ltuple['name']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokNews'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'jac_ws_name_tokTitleNews', feature)

True

In [82]:
feature_string = '''dice(wspace(ltuple['name']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokNews'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'dc_ws_name_tokTitleNews', feature)

True

In [83]:
feature_string = '''cosine(wspace(ltuple['name']),
                                wspace((rtuple['tokTitle'] + " " + rtuple['tokNews'])))'''
feature = em.get_feature_fn(feature_string, sim, toks)
em.add_feature(F, 'cos_ws_name_tokTitleNews', feature)

True

In [84]:
# NAME in Reddit text
def street_name_titleselftext_exact_feature(x, y):
    x = x['name']
    y = y['tokTitle'] + ' ' + y['tokNews']
    return int(x in y)

In [85]:
em.add_blackbox_feature(F, 'street_name_present', street_name_titleselftext_exact_feature)

True

In [86]:
# Number of alt names in Reddit text
def num_alt_matches(x, y):
    x = set(x['name'].split(' '))
    y = set(y['tokTitle'].split(' ')).union(set(y['tokNews'].split(' ')))
    return len(x.intersection(y))

In [87]:
em.add_blackbox_feature(F, 'num_alt_matches', num_alt_matches)

True

In [67]:
# # Name type is not street name (to give weight to brand and generic names)
# def not_street_name(x, y):
#     return int(not x['NAMETYPE'] == 'STREET_NAME')

In [68]:
# em.add_blackbox_feature(F, 'not_street_name', not_street_name)

True

In [69]:
# # Generic name is in Reddit text
# def generic_in_text(x, y):
#     x = x['GENERIC_NAME']
#     y = y['tokTitle'] + ' ' + y['tokSelftext']
#     return int(x in y)

In [70]:
# em.add_blackbox_feature(F, 'generic_in_text', generic_in_text)

True

In [88]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,id_id_exm,id,id,,,exact_match,<function id_id_exm at 0x000002281B231CA8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,id_id_anm,id,id,,,abs_norm,<function id_id_anm at 0x0000022815F13708>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,id_id_lev_dist,id,id,,,lev_dist,<function id_id_lev_dist at 0x0000022815F131F8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,id_id_lev_sim,id,id,,,lev_sim,<function id_id_lev_sim at 0x0000022815F13828>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,jac_ws_name_tokTitleNews,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x000002281B231168>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['name']),\n ...",False
5,dc_ws_name_tokTitleNews,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x00000228179C25E8>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['name']),\n ...",False
6,cos_ws_name_tokTitleNews,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x00000228190A40D8>,"def fn(ltuple, rtuple):\n return cosine(wspace(ltuple['name']),\n ...",False
7,street_name_present,,,,,,<function street_name_titleselftext_exact_feature at 0x000002281908A9D8>,,False
8,num_alt_matches,,,,,,<function num_alt_matches at 0x00000228147DF1F8>,,False


In [89]:
# Remove id based features
F = F[~F['feature_name'].str.contains('id')].copy().reset_index(drop=True)

In [90]:
F

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,jac_ws_name_tokTitleNews,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x000002281B231168>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['name']),\n ...",False
1,dc_ws_name_tokTitleNews,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x00000228179C25E8>,"def fn(ltuple, rtuple):\n return dice(wspace(ltuple['name']),\n ...",False
2,cos_ws_name_tokTitleNews,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,PARSE_EXP,<function fn at 0x00000228190A40D8>,"def fn(ltuple, rtuple):\n return cosine(wspace(ltuple['name']),\n ...",False
3,street_name_present,,,,,,<function street_name_titleselftext_exact_feature at 0x000002281908A9D8>,,False
4,num_alt_matches,,,,,,<function num_alt_matches at 0x00000228147DF1F8>,,False


## Select Matcher

In [91]:
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_label',
                            show_progress=False)

  c_splits = pd.np.array_split(candset, n_procs)


In [92]:
H.head()

Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_name_tokTitleNews,dc_ws_name_tokTitleNews,cos_ws_name_tokTitleNews,street_name_present,num_alt_matches,gold_label
310,9915,315,119760,0.020619,0.020619,0.102062,1,1,0
176,2936,13,254800,0.022727,0.022727,0.107211,1,1,1
390,20194,340,203624,0.042105,0.042105,0.146647,1,2,1
116,2082,13,208380,0.019417,0.019417,0.099015,1,1,1
168,2738,1,251089,0.022727,0.022727,0.107211,1,1,0


In [93]:
# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
        k=5,
        target_attr='gold_label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']

  val_list.append(pd.np.mean(scores))
  mean_score_list.append(pd.np.mean(scores))
  if met == metric_to_select_matcher and pd.np.mean(scores) > max_score:
  max_score = pd.np.mean(scores)


Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.833811,0.832909,0.82841
1,RF,0.817599,0.880388,0.843995
2,SVM,0.763636,1.0,0.863998
3,LinReg,0.769032,0.985899,0.862365
4,LogReg,0.763636,1.0,0.863998


In [94]:
result['drill_down_cv_stats']['precision']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002293A5249C8>,5,0.883721,0.864865,0.72549,0.857143,0.837838,0.833811
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002293A524B08>,5,0.888889,0.822222,0.711538,0.860465,0.804878,0.817599
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002293A524488>,5,0.854545,0.763636,0.690909,0.836364,0.672727,0.763636
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002293A5244C8>,5,0.854545,0.763636,0.690909,0.830189,0.705882,0.769032
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002293A524A88>,5,0.854545,0.763636,0.690909,0.836364,0.672727,0.763636


In [95]:
result['drill_down_cv_stats']['recall']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002293A5249C8>,5,0.808511,0.761905,0.973684,0.782609,0.837838,0.832909
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002293A524B08>,5,0.851064,0.880952,0.973684,0.804348,0.891892,0.880388
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002293A524488>,5,1.0,1.0,1.0,1.0,1.0,1.0
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002293A5244C8>,5,1.0,1.0,1.0,0.956522,0.972973,0.985899
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002293A524A88>,5,1.0,1.0,1.0,1.0,1.0,1.0


In [96]:
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002293A5249C8>,5,0.844444,0.810127,0.831461,0.818182,0.837838,0.82841
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002293A524B08>,5,0.869565,0.850575,0.822222,0.831461,0.846154,0.843995
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002293A524488>,5,0.921569,0.865979,0.817204,0.910891,0.804348,0.863998
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002293A5244C8>,5,0.921569,0.865979,0.817204,0.888889,0.818182,0.862365
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002293A524A88>,5,0.921569,0.865979,0.817204,0.910891,0.804348,0.863998


## Train and Test Matcher

In [80]:
# RandomForest did slightly better with Precision, and I think we're more interested in removing FPs than
# maximizing TPs

In [99]:
rf.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
        target_attr='gold_label')



In [100]:
#import pickle

#with open('./rf_matcher.pkl', 'wb') as f:
#    pickle.dump(rf, f)

In [101]:
# Test
test = em.extract_feature_vecs(J, 
                               feature_table=F, 
                               attrs_after='gold_label',
                               show_progress=False)

  c_splits = pd.np.array_split(candset, n_procs)


In [102]:
# Predict on Test
rf.predict(table=test, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_label'],
            target_attr='prediction', append=True, inplace=True)


Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_name_tokTitleNews,dc_ws_name_tokTitleNews,cos_ws_name_tokTitleNews,street_name_present,num_alt_matches,gold_label,prediction
164,2675,1,236477,0.052632,0.052632,0.164399,1,1,0,1
229,3432,248,31570,0.020000,0.020000,0.100504,1,1,1,0
311,9917,315,120031,0.025974,0.025974,0.114708,1,1,0,1
221,3338,199,38406,0.020305,0.020305,0.101274,1,2,1,1
122,2257,108,176005,0.032787,0.032787,0.129099,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
30,633,117,44784,0.019802,0.019802,0.100000,1,1,1,1
114,2025,13,194724,0.023810,0.023810,0.109764,1,1,1,1
191,3168,108,255866,0.040000,0.040000,0.142857,1,1,1,0
103,1822,1,194351,0.026316,0.026316,0.115470,1,1,0,1


In [103]:
TP = ((test['gold_label'] == 1) & (test['prediction'] == 1)).sum()
FP = ((test['gold_label'] == 0) & (test['prediction'] == 1)).sum()
TN = ((test['gold_label'] == 0) & (test['prediction'] == 0)).sum()
FN = ((test['gold_label'] == 1) & (test['prediction'] == 0)).sum()
P = (test['gold_label'] == 1).sum()
N = (test['gold_label'] == 0).sum()

In [104]:
# Test Accuracy
(TP+TN)/(P+N)

0.6521739130434783

In [105]:
# Check what the matcher misclassified
G.set_index('_id').loc[test[test['gold_label'] != test['prediction']]['_id']]

Unnamed: 0_level_0,ltable_id,rtable_id,ltable_name,ltable_category,ltable_fda_schedule,rtable_tokTitle,rtable_tokNews,gold_label
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2675,1,236477,coca,opioid analgesics,schedule ii,always coca cola right,coca cola slammed coronavirus pandemic remains far clear quickly get back track beverage giant s...,0
3432,248,31570,methamphetamine,amphetamines,schedule ii,feds link 2018 bay area meth heroin bust sinaloa cartel,san francisco first time federal authorities publicly linked 2018 investigation yielded 37 pound...,1
9917,315,120031,cannabis,unknown,schedule i,3 reasons dump shares hexo today,company coming third quarter performance impressive canadian cannabis hexo nyse performing well ...,0
3579,315,29159,cannabis,unknown,schedule i,5 pot stocks posted profits recent earnings,every company cannabis industry struggles stay black many investors come expect losses companies...,0
1821,1,194345,coca,opioid analgesics,schedule ii,devious fossil fuel propaganda use,moving forward requires focus mashable social good series dedicated exploring pathways greater s...,0
...,...,...,...,...,...,...,...,...
20379,313,246472,opium,antidiarrheal agents,schedule ii,birmingham man held 570 000 bond lengthy probe shelby county,lengthy investigation shelby county landed birmingham man behind bars enforcement task force thu...,1
15397,315,176465,cannabis,unknown,schedule i,facing litigation state temporarily halts enforcement marijuana business rules,pending litigation oklahoma attorney general office temporarily agreed state enforce medical mar...,1
3466,248,55937,methamphetamine,amphetamines,schedule ii,pomona woman arrested claremont loaded gun ammo methamphetamine,claremont 30 year old pomona woman bail felony evading illegal firearms possession custody frida...,1
3168,108,255866,buprenorphine,opioid analgesics,schedule iii,teva benign opioid settlement bag yet,investors surprised familiar risk teva pharmaceutical industries soon resurfaces last october ag...,1


## Match Full Blocked Dataset

In [106]:
real = em.extract_feature_vecs(C1, 
                               feature_table=F, 
                               show_progress=False)

  c_splits = pd.np.array_split(candset, n_procs)


In [107]:
rf.predict(table=real, exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
            target_attr='prediction', return_probs=True, probs_attr='proba', append=True, inplace=True)
real.head()

Unnamed: 0,_id,ltable_id,rtable_id,jac_ws_name_tokTitleNews,dc_ws_name_tokTitleNews,cos_ws_name_tokTitleNews,street_name_present,num_alt_matches,prediction,proba
0,0,0,4956,0.007937,0.007937,0.063119,1,1,1,1.0
1,1,0,5112,0.006349,0.006349,0.056433,1,1,1,1.0
2,2,0,5604,0.00995,0.00995,0.070711,1,1,1,1.0
3,3,0,5862,0.008403,0.008403,0.064957,1,1,1,1.0
4,4,0,6064,0.006369,0.006369,0.056523,1,1,1,1.0


# Create Datasets for Cypher

## Generic-Reddit Relationship Table

In [113]:
# Create relationship table
rel_table = C1.merge(real[['_id', 'proba', 'prediction']], how='left', left_on='_id', right_on='_id')

In [114]:
final_rel_table = rel_table[rel_table['prediction'] == 1]
final_rel_table = final_rel_table.rename(columns={'ltable_id': 'id_names', 'rtable_id': 'id_news'})
final_rel_table = final_rel_table[['id_news', 'id_names', 'proba']]

In [116]:
final_rel_table.to_csv('./import_cypher/news_relations.csv', index=False)

## Final drug names

In [93]:
final_drugs_df = drugs_df.copy()
final_drugs_df = final_drugs_df.rename(columns={'NAMETYPE': 'name_type', 'NAME': 'name', 
                                                'FULL_GENERIC_NAME': 'full_generic_name',
                                                'CATEGORY': 'category', 'FDA_SCHEDULE': 'fda_schedule'})
final_drugs_df = final_drugs_df[['id', 'name', 'name_type', 'full_generic_name', 'category', 'fda_schedule']]
final_drugs_df['name_type'] = final_drugs_df['name_type'].str.lower()
final_drugs_df['full_generic_name'] = final_drugs_df['full_generic_name'].str.lower()
final_drugs_df['category'] = final_drugs_df['category'].str.lower()
final_drugs_df['category'] = final_drugs_df['category'].fillna('unknown')
final_drugs_df['fda_schedule'] = final_drugs_df['fda_schedule'].str.lower()
final_drugs_df['fda_schedule'] = final_drugs_df['fda_schedule'].fillna('unknown')

In [94]:
final_drugs_df.to_csv('./import_cypher/drug_names.csv', index=False)

In [95]:
final_drugs_df

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,apache,street_name,fentanyl,opioid analgesics,schedule ii
2,2,birria,street_name,fentanyl,opioid analgesics,schedule ii
3,3,coca,generic_name,coca,opioid analgesics,schedule ii
4,4,blue diamond,street_name,fentanyl,opioid analgesics,schedule ii
...,...,...,...,...,...,...
2370,2370,steroids,street_name,testosterone,"hormones/synthetics/modifiers,other",schedule iii
2371,2371,hydromorphone,generic_name,hydromorphone,opioid analgesics,schedule ii
2372,2372,oxymorphone,generic_name,oxymorphone hydrochloride extended-release tablets,opioid analgesics,schedule ii
2373,2373,methadone,generic_name,methadone hydrochloride tablets,opioid analgesics,schedule ii


## Generic Names

In [96]:
generic_names_df = final_drugs_df[final_drugs_df['name_type'] == 'generic_name'].drop_duplicates(subset=['name']).copy()
generic_names_df = generic_names_df.reset_index(drop=True)
generic_names_df['id'] = generic_names_df.index

In [97]:
generic_names_df.to_csv('./import_cypher/generic_names.csv', index=False)

In [98]:
generic_names_df

Unnamed: 0,id,name,name_type,full_generic_name,category,fda_schedule
0,0,fentanyl,generic_name,fentanyl,opioid analgesics,schedule ii
1,1,coca,generic_name,coca,opioid analgesics,schedule ii
2,2,laudanum,generic_name,laudanum,opioid analgesics,schedule ii
3,3,sodium thiopental,generic_name,sodium thiopental,opioid analgesics,schedule iii
4,4,cathine,generic_name,cathine,opioid analgesics,schedule iv
...,...,...,...,...,...,...
355,355,testosterone,generic_name,testosterone,"hormones/synthetics/modifiers,other",schedule iii
356,356,hydromorphone,generic_name,hydromorphone,opioid analgesics,schedule ii
357,357,oxymorphone,generic_name,oxymorphone hydrochloride extended-release tablets,opioid analgesics,schedule ii
358,358,methadone,generic_name,methadone hydrochloride tablets,opioid analgesics,schedule ii


## Names

In [99]:
alt_names_df = final_drugs_df[['id', 'name']].copy()

In [100]:
alt_names_df.to_csv('./import_cypher/alt_names.csv', index=False)

## Generic Names-Names Relationship Table

In [101]:
gen_alt_rel_table = drugs_df.merge(generic_names_df[['id','name']].rename(columns={'id': 'id_generic'}), 
                                     how='left', left_on='GENERIC_NAME', right_on='name')
gen_alt_rel_table = gen_alt_rel_table.rename(columns={'id': 'id_alt'})
gen_alt_rel_table = gen_alt_rel_table[['id_alt', 'id_generic']]
gen_alt_rel_table['id_generic'] = gen_alt_rel_table['id_generic'].astype(int)

In [109]:
gen_alt_rel_table.to_csv('./import_cypher/gen_alt_rel_table.csv', index=False)

# Final Reddit

In [104]:
from datetime import datetime

In [115]:
final_reddit = reddit_df.copy()

In [117]:
final_reddit.head()

Unnamed: 0,publishdate,src,title,news,src_name,tokTitle,tokNews,id
0,2020-06-01,https://www.prnewswire.com/news-releases/,a natural partnership: popsockets & burt`s bees collaborate on a new product,popgrip lips x burt`s bees will provide consumers with a two-in-one product that features the cl...,prnewswire,natural partnership popsockets burt bees collaborate new product,popgrip lips x burt bees provide consumers two one product features classic functionality combin...,0
1,2020-06-01,http://www.reuters.com/,keep your distance: people queue for school and ikea in england,"warrington, england (reuters) - thousands of people across england queued up for school and ikea...",reuters,keep distance people queue school ikea england,warrington england reuters thousands people across queued school ikea monday british government ...,1
2,2020-06-01,http://www.aljazeera.com/,mapping us cities where george floyd protests have erupted,"demonstrations have erupted in dozens of us cities after george floyd, an unarmed black man, die...",aljazeera,mapping us cities george floyd protests erupted,demonstrations erupted dozens us cities george floyd unarmed black man died police custody thous...,2
3,2020-06-01,https://www.dallasnews.com/,enjoy the dallas museum of art's exploration of home from the comfort of yours,"what does it mean for a house to be a home? for one thing, it means that a structure built of in...",dallasnews,enjoy dallas museum art exploration home comfort,mean house home one thing means structure built inanimate wood glass somehow become anchor power...,3
4,2020-06-01,https://www.dallasnews.com/,"target opens some dallas stores as protesters move overnight, causing damage to west end and upt...",the protests that have spread around the country are closing stores at a time when retailers are...,dallasnews,target opens dallas stores protesters move overnight causing damage west end uptown businesses,protests spread around country closing stores time retailers still trying operate post pandemic ...,4


In [125]:
from datetime import datetime
final_reddit = reddit_df.copy()
# final_reddit['publishdate'] = final_reddit.apply(lambda x: datetime.(x['publishdate']).strftime('%Y-%m-%d'),axis=1)
# final_reddit['link_flair_text'] = final_reddit['link_flair_text'].fillna('none')

final_reddit['id'] = final_reddit.index
final_reddit = final_reddit[['id', 'title', 'publishdate', 'src_name']]

In [126]:
final_reddit

Unnamed: 0,id,title,publishdate,src_name
0,0,a natural partnership: popsockets & burt`s bees collaborate on a new product,2020-06-01,prnewswire
1,1,keep your distance: people queue for school and ikea in england,2020-06-01,reuters
2,2,mapping us cities where george floyd protests have erupted,2020-06-01,aljazeera
3,3,enjoy the dallas museum of art's exploration of home from the comfort of yours,2020-06-01,dallasnews
4,4,"target opens some dallas stores as protesters move overnight, causing damage to west end and upt...",2020-06-01,dallasnews
...,...,...,...,...
293622,293622,hurricane isaias: u-haul offers 30 days free storage in 3 states,2020-08-03,prnewswire
293623,293623,2021 kia seltos named to wards 10 best user experience (ux) list,2020-08-03,prnewswire
293624,293624,georgia power prepares for tropical storm isaias,2020-08-03,prnewswire
293625,293625,inventhelp inventor presents a fencing component (roh-716),2020-08-03,prnewswire


In [131]:
final_reddit.isnull().sum()

id             0
title          0
publishdate    0
src_name       0
dtype: int64

In [127]:
final_reddit.to_csv('./import_cypher/usnews.csv', index=False)