# Import tools and data

### Import packages

In [1]:
import pandas as pd
import glob
import numpy as np
import json
import sqlite3
import re
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

### Import utilities

In [2]:
import sys
sys.path.append('../src/')
import utils

z_recode_stems = utils.z_recode_stems
Recode_z = utils.Recode_z
most_common = utils.most_common

### Import data

In [3]:
#master data for reference
conn = sqlite3.connect("../data/db/gfm.db")
master = pd.read_sql_query("SELECT url,fund_description FROM master", conn)

In [4]:
#disease hierarchy
ccsr = pd.read_csv('../data/ccsr/disease_hierarchy.csv')

In [5]:
#categorization outout
path = '../data/disease_categorization/output_new' 
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename)
    li.append(df)

er = pd.concat(li, axis=0, ignore_index=True)

In [6]:
#categorization output deidentification map
deid = pd.read_csv('../data/disease_categorization/url_deid_map.csv')

In [7]:
#categorization output, additional campaigns
more = pd.read_csv('../data/disease_categorization/more_for_spark_output.csv')

In [8]:
#manually labeled reference
#ref = pd.read_csv('../data/disease_categorization/evaluation/reference_set_any_pivot.csv')
ref = pd.read_csv('../data/disease_categorization/evaluation/final_reference.csv')

# Clean data

### Clean categorization output

#### Re-identify campaigns

In [9]:
deid['url_deid'] = deid['url_deid'].astype(int)
er['url'] = er['url'].astype(int)
er = er.merge(deid, left_on='url', right_on='url_deid', how='left')
er = er[['url_y','chunks','begin','end','sent','code','results','resolutions','res_distances']]
er = er.rename(columns = {'url_y':'url', 'code':'entity_code'})

#### Add additional campaigns

In [10]:
more.rename(columns={'code':'entity_code'},inplace=True)

In [11]:
er = pd.concat([er, more], axis=0)

### Were any campaigns from master data not subjected to disease categorization?

In [12]:
more_input = pd.read_csv('../data/disease_categorization/more_for_spark.csv')

In [13]:
urls = deid['url'].tolist() + more_input['url'].tolist()

In [15]:
#takes a little while
for url in master['url']:
    if url in urls:
        continue
    else:
        print(url)
print('all done')

all done


All campaigns from master data were subjected to disease categorization

### Were any campaigns from reference data not subjected to disease categorization

In [16]:
for url in ref['url']:
    if url in urls:
        continue
    else:
        print(url)
print('all done')

https://www.gofundme.com/f/evp227-fernando-diaz
all done


One campaign is not included in reference data.

### Subset ER data

In [17]:
er = er[er['url'].isin(master['url'].tolist())]

### Inspect data

In [18]:
er.head()

Unnamed: 0,url,chunks,begin,end,sent,entity_code,results,resolutions,res_distances
0,https://www.gofundme.com/f/sandi-rustad,Metaplastic breast cancer,33.0,57.0,0,C5091,C5091:::C5092:::C509:::C5061:::C4352:::C5041::...,metaplastic carcinoma of breast:::metaplastic ...,5.5764:::6.6196:::7.2053:::7.4696:::7.4981:::7...
1,https://www.gofundme.com/f/sandi-rustad,cancer,103.0,108.0,1,C801,C801:::C569:::Z809:::C809:::C800:::D099:::Z859...,cancer:::generalized cancer:::fh: cancer - *::...,0.0000:::7.5123:::7.7614:::8.1281:::8.1928:::8...
2,https://www.gofundme.com/f/lktjl0,brain cancer,44.0,55.0,0,D496,D496:::C710:::C793:::D432:::C718:::C719:::C717...,brain tumor:::carcinoma of brain:::secondary c...,7.1362:::7.1587:::7.8662:::7.9386:::8.8884:::9...
3,https://www.gofundme.com/f/lktjl0,Cancer,349.0,354.0,5,C801,C801:::Z809:::Z859:::C800:::C569:::Z129:::C809...,cancer:::fh: cancer - *:::h/o: cancer:::ca - m...,5.4193:::7.1018:::8.0088:::8.3601:::8.4044:::9...
4,https://www.gofundme.com/f/lktjl0,cancer,479.0,484.0,7,C801,C801:::C569:::Z809:::C809:::C800:::D099:::Z859...,cancer:::generalized cancer:::fh: cancer - *::...,0.0000:::7.5123:::7.7614:::8.1281:::8.1928:::8...


In [19]:
er.shape

(272722, 9)

In [20]:
unique_codes_n = er['entity_code'].nunique()
print(unique_codes_n)

6555


In [21]:
unique_urls_n = er['url'].nunique()
print(unique_urls_n)

74256


### Recode "family/personal history of" codes (start with Z80 to Z87) and recode to second code in results

In [22]:
# #takes a little while
# recode_fh_map = Recode_z(er, z_recode_stems)

In [23]:
# #find the most common remapping and use that
# recode_fh_map_lofd = pd.DataFrame(recode_fh_map).groupby('code').agg(lambda x: most_common(list(x))).reset_index().to_dict(orient='records')

# recode_fh_map_dict = {}
# for d in recode_fh_map_lofd:
#     recode_fh_map_dict[d['code']] = d['res']

In [24]:
# z_to_recode = [k for k,v in recode_fh_map_dict.items()]
# unique_z_recoded_n = pd.Series(er['entity_code'].unique()).isin(z_to_recode).sum()
# print(unique_z_recoded_n)
# print(unique_z_recoded_n/unique_codes_n)

In [25]:
# #replace values
# er['entity_code'].replace(recode_fh_map_dict, inplace=True)

### Recode autism

"resolutions" column is correct, but the ICD code does not seem correct

In [26]:
er[er['chunks'].str.contains('autism')].head()

Unnamed: 0,url,chunks,begin,end,sent,entity_code,results,resolutions,res_distances
1308,https://www.gofundme.com/f/6u7abs,autism,704.0,709.0,7,G4090,G4090:::Z134:::Q9389:::F849:::R4189:::E7119:::...,autism:::suspected autism:::autism disorder:::...,0.0000:::6.6234:::6.8922:::8.0960:::8.4410:::8...
4524,https://www.gofundme.com/f/JPHaigler,autism,583.0,588.0,7,G4090,G4090:::Z134:::Q9389:::F849:::R4189:::E7119:::...,autism:::suspected autism:::autism disorder:::...,0.0000:::6.6234:::6.8922:::8.0960:::8.4410:::8...
6833,https://www.gofundme.com/f/jane-brogan,autism,1056.0,1061.0,11,G4090,G4090:::Z134:::Q9389:::F849:::R4189:::E7119:::...,autism:::suspected autism:::autism disorder:::...,0.0000:::6.6234:::6.8922:::8.0960:::8.4410:::8...
7020,https://www.gofundme.com/f/BrinleysBuddy,autism,775.0,780.0,11,G4090,G4090:::Z134:::Q9389:::F849:::R4189:::E7119:::...,autism:::suspected autism:::autism disorder:::...,0.0000:::6.6234:::6.8922:::8.0960:::8.4410:::8...
7869,https://www.gofundme.com/f/z44yr2ms,autism,3094.0,3099.0,32,G4090,G4090:::Z134:::Q9389:::F849:::R4189:::E7119:::...,autism:::suspected autism:::autism disorder:::...,0.0000:::6.6234:::6.8922:::8.0960:::8.4410:::8...


In [27]:
er.loc[er['chunks'].str.lower().str.contains('autism'),'entity_code'] = 'F840'
er.loc[er['chunks'].str.lower().str.contains('autistic'),'entity_code'] = 'F840'

### Match ICD codes from entity resolution with categories from CCSR

In [28]:
raw_codes = er['entity_code'].unique().tolist()

In [29]:
ccsr_codes = ccsr['icd_10_cm_code'].tolist()

In [30]:
def StemCode(code, ccsr):
    res = 0
    while res == 0:
        sub = ccsr[ccsr['icd_10_cm_code'].str.startswith(code)]
        res = sub.shape[0]
        if res > 0:
            cats = sub['ccsr_category'].value_counts().index[0]
            return code, cats
        else:
            code = code[:-1]
            if len(code) == 0:
                print('error - no code stem')
                return np.nan, np.nan

In [31]:
def GetCodeMatch(raw_codes, ccsr_codes, ccsr):

    #define results container for best match
    best_match = []

    #loop through raw codes
    for raw in raw_codes:
        #if raw code exists in CCSR as-is
        if raw in ccsr_codes:
            #no need to define best match as exact match already exists
            continue
        else:
            #see if there are any codes that start with code
            #if not, remove final character, try again
            #once codes are found, select the ccsr_cateogry that is most common
            matched_code, ccsr_cat = StemCode(raw, ccsr)
            best_match.append({'raw_code':raw, 'matched_code_stem':matched_code,'matched_ccsr_category':ccsr_cat})

    return best_match

In [32]:
#takes a little while
best_match_map = GetCodeMatch(raw_codes, ccsr_codes, ccsr)

error - no code stem
error - no code stem


In [33]:
best_match_map = pd.DataFrame(best_match_map).merge(ccsr[['ccsr_category','disease_category']].drop_duplicates('ccsr_category'), 
                                                    left_on='matched_ccsr_category', 
                                                    right_on='ccsr_category', 
                                                    how='left')

In [34]:
#inspect code n=2 stem errors
best_match_map[best_match_map['matched_code_stem'].isna()]

Unnamed: 0,raw_code,matched_code_stem,matched_ccsr_category,ccsr_category,disease_category
1668,d430,,,,
2285,q878,,,,


In [35]:
er[(er['entity_code'] == 'q878')|(er['entity_code'] == 'd430')]

Unnamed: 0,url,chunks,begin,end,sent,entity_code,results,resolutions,res_distances
77811,https://www.gofundme.com/f/2fubvk4,tumors on Linda's brain,2099.0,2121.0,34,d430,d430:::D331:::D432:::D497:::C710:::C719:::D239...,tumor of hypothalamus:::cerebellopontine angle...,9.7123:::9.8830:::9.9242:::9.9815:::10.0045:::...
108687,https://www.gofundme.com/f/qggaeey8,tumors of the hypothalamus,152.0,177.0,1,d430,d430:::C710:::D443:::D430:::D496:::C751:::D330...,tumor of hypothalamus:::malignant tumor of hyp...,3.0166:::4.8998:::5.0649:::5.2426:::5.5834:::6...
157591,https://www.gofundme.com/f/baby-delaynie,Zellweger spectrum disorders,709.0,736.0,5,q878,q878:::Q878:::E7151:::E7154:::E803:::Q777:::Q1...,zellweger syndrome (disorder):::zellweger's sy...,7.2023:::7.3898:::7.4975:::8.0355:::8.4334:::9...


In [36]:
#both appear to be a capitalization issues
er.loc[er['entity_code'] == 'q878','entity_code'] = 'Q878'
er.loc[er['entity_code'] == 'd430','entity_code'] = 'D430'

In [37]:
#rerun after correcting capitalization issue from Spark output
raw_codes = er['entity_code'].unique().tolist()
best_match_map = GetCodeMatch(raw_codes, ccsr_codes, ccsr)

No errors after running above cell

In [38]:
# best_match_map = pd.DataFrame(best_match_map).merge(ccsr[['ccsr_category','int_category','disease_category']].drop_duplicates('ccsr_category'), 
#                                                     left_on='matched_ccsr_category', 
#                                                     right_on='ccsr_category', 
#                                                     how='left')

best_match_map = pd.DataFrame(best_match_map).merge(ccsr[['ccsr_category','disease_category']].drop_duplicates('ccsr_category'), 
                                                    left_on='matched_ccsr_category', 
                                                    right_on='ccsr_category', 
                                                    how='left')

In [39]:
best_match_map.to_csv('../data/disease_categorization/best_match_map.csv', index=False)

In [40]:
unique_best_match_n = best_match_map.shape[0]
print(unique_best_match_n)
print(unique_best_match_n/unique_codes_n)

2864
0.43691838291380625


In [41]:
pd.Series([len(x) for x in best_match_map['matched_code_stem']]).value_counts()

5    1313
4    1169
3     290
2      75
1      17
dtype: int64

In [42]:
17/unique_best_match_n

0.005935754189944134

### Replace best-matched entity codes

In [43]:
#subset dataframe for those that need best matched data
er_for_best_match = er[er['entity_code'].isin(best_match_map['raw_code'].tolist())]
er_rest = er[~er['entity_code'].isin(er_for_best_match['entity_code'].tolist())]

### Merge ER data with appropriate disease categories

In [44]:
#best match
er_for_best_match = er_for_best_match.merge(best_match_map, left_on='entity_code', right_on='raw_code', how='left')
del er_for_best_match['matched_code_stem']
del er_for_best_match['matched_ccsr_category']
er_for_best_match.loc[:,'icd_10_cm_code_desc'] = ['best_match' for x in range(len(er_for_best_match))]
er_for_best_match.rename(columns={'raw_code':'icd_10_cm_code'}, inplace=True)

#exact match
er_rest = er_rest.merge(ccsr, left_on='entity_code', right_on='icd_10_cm_code', how='left')

In [45]:
#concatenate dataframes
er_final = pd.concat([er_for_best_match, er_rest])

In [46]:
print(er_final.shape)
print(er.shape)
print('\n')
print(er_final['url'].nunique())
print(er['url'].nunique())

(272722, 13)
(272722, 9)


74256
74256


### Find additional clinical entities to infer disease category

In [47]:
def SearchProcedures(df):

    r = {}

    terms = ['dialysis',
             'renal transplant',
             'kidney transplant',
             'lung transplant',
             'heart transplant',
             'heart surgery',
             'chemo',
             'chemotherapy',
             'radiation',
             'radiotherapy',
             'bone marrow transplant',
             'liver transplant',
             "accident","injury","injuries","injured","crash","collision","injured","burn","burned","burns"]

    for term in terms:

        #define regex
        regex = r"\b{}\b".format(term)

        #apply regex search
        search = df['fund_description'].apply(lambda x: re.search(regex, x))

        #get start and end
        start = [x.start() if x != None else np.nan for x in search]
        end = [x.end() if x != None else np.nan for x in search]

        #add to results
        r[term] = {'start':start,'end':end}
    
    return r


In [48]:
search_results = SearchProcedures(master) #takes about 30 seconds

In [49]:
def SearchToDF(search_results):

    r = []

    for k,v in search_results.items():

        new_df = pd.DataFrame({'url':master['url'].tolist(),'begin':v['start'],'end':v['end']})

        new_df = new_df.dropna()

        new_df['chunks'] = [k for x in range(len(new_df))]
        new_df['sent'] = [np.nan for x in range(len(new_df))]
        new_df['entity_code'] = ['regex_search' for x in range(len(new_df))]
        new_df['results'] = [np.nan for x in range(len(new_df))]
        new_df['resolutions'] = [np.nan for x in range(len(new_df))]
        new_df['res_distances'] = [np.nan for x in range(len(new_df))]
        new_df['icd_10_cm_code'] = ['regex_search' for x in range(len(new_df))]
        new_df['icd_10_cm_code_desc'] = [k for x in range(len(new_df))]


        if k in ['dialysis','renal transplant','kidney transplant']:
            new_df['ccsr_category'] = ['GU_from_proc' for x in range(len(new_df))]
            new_df['int_category'] = ['CKD and renal failure' for x in range(len(new_df))]
            new_df['disease_category'] = ['Genitourinary diseases' for x in range(len(new_df))]

        elif k in ['lung transplant']:
            new_df['ccsr_category'] = ['RESP_from_proc' for x in range(len(new_df))]
            new_df['int_category'] = ['Other respiratory disorders' for x in range(len(new_df))]
            new_df['disease_category'] = ['Respiratory diseases' for x in range(len(new_df))]

        elif k in ['heart transplant','heart surgery']:
            new_df['ccsr_category'] = ['CV_from_proc' for x in range(len(new_df))]
            new_df['int_category'] = ['Other cardiovascular disorders' for x in range(len(new_df))]
            new_df['disease_category'] = ['Cardiovascular diseases' for x in range(len(new_df))]

        elif k in ['chemo','chemotherapy','radiation','radiotherapy','bone marrow transplant']:
            new_df['ccsr_category'] = ['NEO_from_proc' for x in range(len(new_df))]
            new_df['int_category'] = ['Other neoplasms' for x in range(len(new_df))]
            new_df['disease_category'] = ['Neoplasms' for x in range(len(new_df))]

        elif k in ['liver transplant']:
            new_df['ccsr_category'] = ['GI_from_proc' for x in range(len(new_df))]
            new_df['int_category'] = ['Liver diseases' for x in range(len(new_df))]
            new_df['disease_category'] = ['Gastrointestinal diseases' for x in range(len(new_df))]
        
        elif k in ["accident","injury","injuries","injured","crash","collision","injured","burn","burned","burns"]:
            new_df['ccsr_category'] = ['INJ_from_proc' for x in range(len(new_df))]
            new_df['int_category'] = ['Other injuries' for x in range(len(new_df))]
            new_df['disease_category'] = ['Injuries and external causes' for x in range(len(new_df))]

        r.append(new_df)

    return pd.concat(r)

In [50]:
search_results_df = SearchToDF(search_results)

#### Among newly-added campaigns, which search terms were responsible?

In [51]:
search_results_df['url'].nunique()

34880

In [52]:
search_sub = search_results_df[~search_results_df['url'].isin(er_final['url'].unique())]

In [53]:
search_sub['url'].nunique()

1255

In [54]:
search_sub[['url','disease_category']].drop_duplicates()['disease_category'].value_counts()

Cardiovascular diseases         438
Genitourinary diseases          346
Neoplasms                       229
Gastrointestinal diseases       116
Injuries and external causes     88
Respiratory diseases             60
Name: disease_category, dtype: int64

In [55]:
search_sub[['url','disease_category']].drop_duplicates()['disease_category'].value_counts()/1255

Cardiovascular diseases         0.349004
Genitourinary diseases          0.275697
Neoplasms                       0.182470
Gastrointestinal diseases       0.092430
Injuries and external causes    0.070120
Respiratory diseases            0.047809
Name: disease_category, dtype: float64

In [56]:
er_final = pd.concat([er_final, search_results_df])

### Exclude named entities that map to certain disease categories

These categories are unable to be classified with adequate precision and recall.

In [57]:
starting_n = er_final['entity_code'].nunique()
er_final = er_final[~er_final['disease_category'].isin(['Exclude'])]
exclude_cat_n = starting_n - er_final['entity_code'].nunique()
print(exclude_cat_n)

2038


In [58]:
print(exclude_cat_n/starting_n)

0.3111450381679389


In [59]:
er_final['url'].nunique()

72860

# Evaluate classification performance

### Get list of disease categories by url

In [60]:
counts_list = er_final[['url','disease_category']].groupby('url').agg(lambda x: list(x))

In [61]:
counts_list = counts_list.reset_index()

### Format for comparison with reference data

In [62]:
def list2lofd(df):
    '''convert cat column from comma delimited to list of dictionaries with 1 or 0 for each category'''
    
    disease_categories = [
        'Cardiovascular diseases',
        'Endocrine diseases',
        'Gastrointestinal diseases',
        'Genitourinary diseases',
        'Infections',
        'Injuries and external causes',
        'Mental and substance use disorders',
        'Musculoskeletal diseases',
        'Neoplasms',
        'Nervous system diseases',
        'Respiratory diseases'
    ]
    
    r = []
    
    for index,row in df.iterrows():
        lofd = []
        as_list = row['disease_category']
        for disease in disease_categories:
            if disease in as_list:
                lofd.append({disease:1})
            else:
                lofd.append({disease:0})
        r.append(lofd)
    return r

In [63]:
counts_list['lofd'] = list2lofd(counts_list)

In [64]:
#convert to long format
long = counts_list.explode('lofd')

In [65]:
#extract key, value from dictionary
long.insert(loc=2, column='cat', value=[list(x.items())[0][0] for x in long['lofd']])
long.insert(loc=3, column='count', value=[list(x.items())[0][1] for x in long['lofd']])

In [66]:
long = long[['url','cat','count']]

In [67]:
#create pivot table
#takes a little while
pivot = long.pivot_table(index=['url'], columns='cat', values='count', aggfunc=(pd.Series.sum), fill_value=0)
pivot.columns.name = ''
pivot = pivot.reset_index()

In [68]:
pivot.head()

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Respiratory diseases
0,https://www.gofundme.com/f/-AMYSTRONG-,0,0,0,0,0,0,0,0,1,0,0
1,https://www.gofundme.com/f/-MasonStrong-,0,0,0,1,0,0,0,0,0,0,0
2,https://www.gofundme.com/f/-Shelby-Slaughter,0,0,0,0,0,0,0,0,1,0,0
3,https://www.gofundme.com/f/-help-us-help-emily,0,0,0,0,1,0,0,0,0,0,0
4,https://www.gofundme.com/f/-helpmario,0,0,0,0,0,1,0,0,0,0,0


### Prepare reference data

In [69]:
ref.rename(columns={'Genitourinary disorders':'Genitourinary diseases'}, inplace=True)
del ref['Nonspecific']
del ref['Other noncommunicable diseases']
del ref['Pregnancy and childbirth']

In [70]:
ref.head()

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Respiratory diseases
0,https://www.gofundme.com/f/1v3emdk740,0,0,0,0,0,0,0,0,0,0,0
1,https://www.gofundme.com/f/1xizs9mpo0,0,0,0,0,0,0,0,0,0,0,1
2,https://www.gofundme.com/f/1zk33nyftc,0,1,0,0,0,0,1,0,0,0,0
3,https://www.gofundme.com/f/22efh5hg,0,0,0,0,0,0,0,1,0,0,0
4,https://www.gofundme.com/f/22kcir7teo,1,0,0,1,1,0,0,1,0,0,1


In [71]:
ref.shape

(206, 12)

### Add blank rows for urls with no identified NER/ER terms

In [72]:
master_urls = master['url']

In [73]:
urls_to_add = master_urls[~master_urls.isin(pivot['url'].tolist())].tolist()

In [74]:
len(urls_to_add)

14589

In [75]:
pivot.shape[0] + len(urls_to_add) == master.shape[0]

True

In [76]:
def AddBlank(urls, df):
    r = []
    cols = df.columns[1:].values.tolist()
    for url in urls:
        line = {}
        line['url'] = url
        for col in cols:
            line[col] = 0
        r.append(line)
    return pd.DataFrame(r)

In [77]:
blank_to_add = AddBlank(urls_to_add, pivot)

In [78]:
pivot = pd.concat([pivot, blank_to_add])

In [79]:
pivot.shape[0] == master.shape[0]

True

#### Export sample of 50 campaigns that did not have any diseases identified

In [80]:
#blank_sub = master[master['url'].isin(blank_to_add['url'].sample(50))]

In [81]:
#blank_sub.to_csv('../data/disease_categorization/blank_for_review.csv', index=False)

### Create evaluation subset

In [82]:
for url in ref['url']:
    if url in master['url'].tolist():
        continue
    else:
        print(url)
print('all done')

https://www.gofundme.com/f/evp227-fernando-diaz
https://www.gofundme.com/f/help-my-mom-get-her-memories-back
https://www.gofundme.com/f/help-support-helen-iaconelli-through-her-fight
https://www.gofundme.com/f/helpbeckettkickcancersbutt
https://www.gofundme.com/f/marvin-white
https://www.gofundme.com/f/project-quotit039s-alrightquot
all done


In [83]:
sub = pivot[pivot['url'].isin(ref['url'].tolist())]

In [84]:
sub.shape

(200, 12)

In [85]:
ref[~ref['url'].isin(sub['url'])]

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Respiratory diseases
72,https://www.gofundme.com/f/evp227-fernando-diaz,0,0,0,0,0,0,0,0,1,0,0
97,https://www.gofundme.com/f/help-my-mom-get-her...,0,0,0,0,0,0,0,0,1,0,0
101,https://www.gofundme.com/f/help-support-helen-...,0,0,0,0,0,0,0,0,1,0,0
104,https://www.gofundme.com/f/helpbeckettkickcanc...,0,0,0,0,0,0,0,0,1,0,1
129,https://www.gofundme.com/f/marvin-white,0,0,0,0,0,0,0,0,1,0,0
145,https://www.gofundme.com/f/project-quotit039s-...,0,0,0,0,0,0,0,0,1,0,0


In [86]:
sub[~sub['url'].isin(ref['url'])]

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Respiratory diseases


In [87]:
ref = ref[ref['url'].isin(sub['url'])]
sub = sub[sub['url'].isin(ref['url'])]

In [88]:
ref.shape

(200, 12)

In [89]:
sub.shape

(200, 12)

In [90]:
sub = sub.sort_values('url')

In [91]:
ref = ref.sort_values('url')

In [92]:
ref = ref.reset_index()
sub = sub.reset_index()

In [93]:
ref['url'].equals(sub['url'])

True

### Compute metrics

In [94]:
def ComputeMetrics(ref, sub):
    cat = []
    precision = []
    recall = []
    f1 = []
    acc = []
    refe = []
    piv = []
    
    for col in sub.columns[2:]:
        cat.append(col)
        precision.append(round(precision_score(ref[col], sub[col]),2))
        recall.append(round(recall_score(ref[col], sub[col]),2))
        f1.append(round(f1_score(ref[col], sub[col]),2))
        acc.append(round(accuracy_score(ref[col], sub[col]),2))
        refe.append(sum(ref[col]))
        piv.append(sum(sub[col]))
    
    df = pd.DataFrame({'category':cat, 
                        'ref_count':refe,
                        'test_count':piv,
                        'precision':precision,
                        'recall':recall,
                        'f1':f1,
                        'acc':acc})
    return df

In [95]:
metrics = ComputeMetrics(ref, sub)

In [96]:
metrics

Unnamed: 0,category,ref_count,test_count,precision,recall,f1,acc
0,Cardiovascular diseases,33,28,0.89,0.76,0.82,0.94
1,Endocrine diseases,12,9,0.78,0.58,0.67,0.96
2,Gastrointestinal diseases,7,6,0.5,0.43,0.46,0.96
3,Genitourinary diseases,21,18,0.94,0.81,0.87,0.98
4,Infections,16,24,0.62,0.94,0.75,0.95
5,Injuries and external causes,31,39,0.72,0.9,0.8,0.93
6,Mental and substance use disorders,16,19,0.58,0.69,0.63,0.94
7,Musculoskeletal diseases,20,20,0.65,0.65,0.65,0.93
8,Neoplasms,74,76,0.95,0.97,0.96,0.97
9,Nervous system diseases,37,20,0.9,0.49,0.63,0.9


In [97]:
print('Weighted precision:', np.average(metrics['precision'], weights=metrics['ref_count']))
print('Weighted recall:', np.average(metrics['recall'], weights=metrics['ref_count']))
print('Weighted f1:', np.average(metrics['f1'], weights=metrics['ref_count']))
print('Weighted accuracy:', np.average(metrics['acc'], weights=metrics['ref_count']))

Weighted precision: 0.8256737588652483
Weighted recall: 0.7797872340425531
Weighted f1: 0.7896099290780142
Weighted accuracy: 0.947304964539007


# Prepare data for export

### Get primary category

Summarize by disease category and location of chunk in text

In [98]:
#er_final = er_final.reset_index()

In [99]:
%%time
er_cat = er_final[['url','begin','disease_category']].groupby(['url']).agg(list)

CPU times: user 10.3 s, sys: 46 ms, total: 10.3 s
Wall time: 10.5 s


In [100]:
from collections import Counter

In [101]:
#get counts for each category
er_cat['disease_category_count'] = er_cat['disease_category'].apply(lambda x: Counter(x).most_common())

In [102]:
#separate counts into separate column
er_cat['disease_category_count_max'] = [x[0][1] for x in er_cat['disease_category_count']]

In [103]:
#explode
exl_d = er_cat.explode('disease_category_count')

In [104]:
#separate tuples
exl_d['disease_cat'] = [x[0] for x in exl_d['disease_category_count']]
exl_d['disease_cat_val'] = [x[1] for x in exl_d['disease_category_count']]

In [105]:
#remove categories that do not have max amount
exl_d = exl_d[exl_d['disease_cat_val'] == exl_d['disease_category_count_max']]

In [106]:
#reset indices
exl_d = exl_d.reset_index()

In [107]:
#separate urls that are duplicated (those who have multiple categories equal to max count)
exl_d_dup = exl_d[exl_d['url'].duplicated(keep=False)]

#separate urls that are not duplicated
exl_d_no_dup = exl_d[~exl_d['url'].duplicated(keep=False)]

print(exl_d_dup.shape)

print(exl_d_no_dup.shape)

(15489, 7)
(66003, 7)


In [108]:
def GetPrimaryCat(df, first_col, second_col):

    r = {}

    for url in df['url']:
        sub = df[df['url'] == url]
        candidates = sub[sub['url'] == url][first_col].tolist()
        pos = sub[sub['url'] == url].drop_duplicates('url')['begin'].tolist()[0]
        cats = sub[sub['url'] == url].drop_duplicates('url')[second_col].tolist()[0]
        new_df = pd.DataFrame({'cats':cats, 'pos':pos})
        new_df = new_df[new_df['cats'].isin(candidates)]
        final_cat = new_df.groupby('cats').agg('mean').reset_index().sort_values('pos', ascending=True)['cats'].tolist()[0]

        r[url] = final_cat
    
    return r

In [109]:
%%time
primary_disease_map = GetPrimaryCat(exl_d_dup,'disease_cat','disease_category') 

CPU times: user 2min 12s, sys: 322 ms, total: 2min 12s
Wall time: 2min 13s


In [110]:
def SubsetPrimaryCat(df, subset_map, col):
    
    r = []
    
    for k,v in subset_map.items():
        sub = df[df['url'] == k]
        sub = sub[sub[col] == v]
        r.append(sub)
    
    return pd.concat(r)    

In [111]:
%%time
exl_d_dup_primary = SubsetPrimaryCat(exl_d_dup, primary_disease_map, 'disease_cat')

CPU times: user 14.7 s, sys: 107 ms, total: 14.8 s
Wall time: 15.1 s


In [112]:
#concatenate primary results from duplicated
primary_disease = pd.concat([exl_d_no_dup,exl_d_dup_primary])

In [113]:
print(primary_disease.shape)

(72860, 7)


In [114]:
#primary_category = pd.merge(primary_disease[['url','disease_cat']], primary_int[['url','int_cat']], on='url', how='left')

# Export data

In [115]:
er_final.to_sql('er_final', con=conn,if_exists='replace')

In [116]:
pivot = pivot.reset_index()
pivot.columns.name = ''
cols = pivot.columns
cols = [x.lower().replace(',','').replace(' ','_') for x in cols]
pivot.columns = cols

pivot.to_sql('pivot', con=conn, if_exists='replace')

In [117]:
# int_pivot.reset_index()
# int_pivot.columns.name = ''
# cols = int_pivot.columns
# cols = [x.lower().replace(',','').replace(' ','_') for x in cols]
# int_pivot.columns = cols

# int_pivot.to_sql('int_pivot', con=conn, if_exists='replace')

In [118]:
primary_disease = primary_disease[['url','disease_cat']]
primary_disease.to_sql('primary_disease', con=conn, if_exists='replace')

# Review 100 campaigns with no disease category identified

In [119]:
no_disease = master[~master['url'].isin(primary_disease['url'].tolist())].sample(100)

In [123]:
no_disease.to_csv('../data/disease_categorization/no_disease_found_100.csv', index=False)