### Import dependencies

In [1]:
import os
import json
import pandas as pd
import numpy as np
import sqlite3
import json
import copy
import re
import operator
import math

### Import data

In [2]:
# Load the results table that the evaluation below filters and summarises
er = pd.read_csv(filepath_or_buffer='Comparing algorithms/er_for_eval.csv')

In [4]:
# Load the table of model combinations (one combination per row)
combinations = pd.read_csv(filepath_or_buffer='Comparing algorithms/model_combinations.csv')

### Clean data

In [3]:
# Flag rows that carry an extracted term: 1 when target_text is present, 0 when it is missing.
# notna() replaces the previous `x is not np.nan` identity check, which only recognises
# the np.nan singleton and would score None or other NaN float objects as 1.
er['score'] = er['target_text'].notna().astype('int64')

### Run model evaluation

In [5]:
def GetCombos(combinations):
    '''
    Turn each row of the combinations dataframe into a plain list.

    input: pandas dataframe of model combinations
    output: list of lists, one sub-list per dataframe row, holding that
          row's values in column order
    '''

    #one list per row, in positional row order
    return [combinations.iloc[pos].tolist() for pos in range(len(combinations))]

In [6]:
def ModelEval(er, combinations, n_input_docs=50):
    '''
    Summarise NER and entity-resolution results for each model combination.

    input: er - pandas dataframe of results to evaluate; must contain the
                columns 'url', 'ner_model', 'doc_reconstruction',
                'sent_embeddings', 'er_model', 'target_text', 'score',
                'confidence' and 'distance'
           combinations - pandas dataframe whose first four columns hold, in
                order, the values to match against 'ner_model',
                'doc_reconstruction', 'sent_embeddings' and 'er_model'
           n_input_docs - value reported in the 'input_docs' column
                (default 50, the figure that used to be hard-coded here)
    output: pandas dataframe with the columns of combinations followed by
            one column per metric, one row per combination
    '''

    metric_cols = ['input_docs',
                   'ner_terms_count',
                   'docs_with_no_ner',
                   'ner_terms_per_doc_mean',
                   'ner_terms_per_doc_sd',
                   'er_confidence_mean',
                   'er_distance_mean']

    records = []

    #walk the combinations row by row; values are read by position
    for combo in combinations.itertuples(index=False, name=None):
        mask = ((er['ner_model'] == combo[0])
                & (er['doc_reconstruction'] == combo[1])
                & (er['sent_embeddings'] == combo[2])
                & (er['er_model'] == combo[3]))
        df = er.loc[mask]

        #NER terms by document
        ner_by_url = df.groupby('url')['score'].sum()

        records.append({
            #number of input documents
            'input_docs': n_input_docs,
            #total number of NER terms (rows whose target_text is not NA)
            'ner_terms_count': df['target_text'].notna().sum(),
            'docs_with_no_ner': (ner_by_url == 0).sum(),
            'ner_terms_per_doc_mean': ner_by_url.mean(),
            'ner_terms_per_doc_sd': ner_by_url.std(),
            #ER metrics
            'er_confidence_mean': df['confidence'].astype(float).mean(),
            'er_distance_mean': df['distance'].astype(float).mean(),
        })

    #reuse the index of combinations so the column-wise concat pairs each
    #metrics row with its own combination even when that index is not 0..n-1
    r = pd.DataFrame(records, columns=metric_cols, index=combinations.index)
    return pd.concat([combinations, r], axis=1)

In [7]:
# Evaluate every model combination and display the resulting summary table
ModelEval(er=er, combinations=combinations)

Unnamed: 0,ner_model,doc_reconstruction,embeddings,entity_resolution_model,input_docs,ner_terms_count,docs_with_no_ner,ner_terms_per_doc_mean,ner_terms_per_doc_sd,er_confidence_mean,er_distance_mean
0,ner_clinical,none,none,chunkresolve_icd10cm_clinical,50,437,3,8.74,10.86618,0.062511,1.01556
1,ner_clinical,concat_and,none,chunkresolve_icd10cm_clinical,50,413,3,8.26,10.538191,0.065614,0.963053
2,ner_clinical,sent_with_ner,none,chunkresolve_icd10cm_clinical,50,923,3,18.46,31.883212,0.063406,0.972786
3,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,437,3,8.74,10.86618,0.375657,6.381022
4,ner_clinical,concat_and,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,413,3,8.26,10.538191,0.436572,5.756073
5,ner_clinical,sent_with_ner,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,923,3,18.46,31.883212,0.404267,6.015442
6,ner_jsl,none,none,chunkresolve_icd10cm_clinical,50,113,16,2.26,3.983691,0.066368,0.580966
7,ner_jsl,concat_and,none,chunkresolve_icd10cm_clinical,50,103,16,2.06,3.924855,0.066749,0.566989
8,ner_jsl,sent_with_ner,none,chunkresolve_icd10cm_clinical,50,204,16,4.08,10.703499,0.066055,0.596892
9,ner_jsl,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,50,113,16,2.26,3.983691,0.724112,3.072292


### Which campaigns have >2x NER terms with sent_with_ner

In [21]:
# Rows for ner_clinical with no document reconstruction and the sbiobert embeddings / resolver
df = er.loc[
    (er['ner_model'] == 'ner_clinical')
    & (er['doc_reconstruction'] == 'none')
    & (er['sent_embeddings'] == 'sbiobert_base_cased_mli')
    & (er['er_model'] == 'sbiobertresolve_icd10cm_augmented'),
]




In [22]:
# Total score per campaign URL for the ner_clinical subset
df[['url', 'score']].groupby('url').sum()

Unnamed: 0_level_0,score
url,Unnamed: 1_level_1
https://www.gofundme.com/f/22iq896m5c,0
https://www.gofundme.com/f/2bkkgaq4,6
https://www.gofundme.com/f/2hp65afg,10
https://www.gofundme.com/f/2phb83rg,2
https://www.gofundme.com/f/3s01f0o,3
https://www.gofundme.com/f/5cv535-cancer-treatment-help,7
https://www.gofundme.com/f/5q5oals,4
https://www.gofundme.com/f/FergusonK9Health,2
https://www.gofundme.com/f/Jacobseye,5
https://www.gofundme.com/f/JosiahAnakinLopez,0


In [25]:
# Show the individual rows for one campaign URL
df.loc[df['url'] == 'https://www.gofundme.com/f/2bkkgaq4']

Unnamed: 0,url,ner_model,doc_reconstruction,sent_embeddings,er_model,target_text,sentence,start_char,end_char,resolved_text,entity_code,confidence,distance,score
1855,https://www.gofundme.com/f/2bkkgaq4,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,infantile Scoliosis,0.0,152.0,170.0,"spondylodysplasia, torrance type (disorder)",Q788,0.1253,8.8018,1
1856,https://www.gofundme.com/f/2bkkgaq4,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,a recent growth spurt,1.0,238.0,258.0,recent weight gain,R635,0.7366,8.6517,1
1857,https://www.gofundme.com/f/2bkkgaq4,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,derasticaly more serious,2.0,279.0,302.0,excessive smegma (finding),N488,0.0861,9.1094,1
1858,https://www.gofundme.com/f/2bkkgaq4,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,the curvature,3.0,308.0,320.0,curvature of spine,M4399,0.2996,7.3941,1
1859,https://www.gofundme.com/f/2bkkgaq4,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,extreme,4.0,333.0,339.0,exuberance,R468,0.0813,8.2304,1
1860,https://www.gofundme.com/f/2bkkgaq4,ner_clinical,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,loose faith,5.0,1578.0,1588.0,negative attitude,R4589,0.1606,5.7164,1


In [77]:
# Rows for ner_diseases with no document reconstruction and the sbiobert embeddings / resolver
df2 = er.loc[
    (er['ner_model'] == 'ner_diseases')
    & (er['doc_reconstruction'] == 'none')
    & (er['sent_embeddings'] == 'sbiobert_base_cased_mli')
    & (er['er_model'] == 'sbiobertresolve_icd10cm_augmented'),
]



In [78]:
# Total score per campaign URL for the ner_diseases subset
df2[['url', 'score']].groupby('url').sum()

Unnamed: 0_level_0,score
url,Unnamed: 1_level_1
https://www.gofundme.com/f/22iq896m5c,0
https://www.gofundme.com/f/2bkkgaq4,3
https://www.gofundme.com/f/2hp65afg,7
https://www.gofundme.com/f/2phb83rg,1
https://www.gofundme.com/f/3s01f0o,3
https://www.gofundme.com/f/5cv535-cancer-treatment-help,8
https://www.gofundme.com/f/5q5oals,3
https://www.gofundme.com/f/FergusonK9Health,1
https://www.gofundme.com/f/Jacobseye,2
https://www.gofundme.com/f/JosiahAnakinLopez,0


In [79]:
# Show the individual rows for one campaign URL
df2.loc[df2['url'] == 'https://www.gofundme.com/f/3s01f0o']

Unnamed: 0,url,ner_model,doc_reconstruction,sent_embeddings,er_model,target_text,sentence,start_char,end_char,resolved_text,entity_code,confidence,distance,score
5928,https://www.gofundme.com/f/3s01f0o,ner_diseases,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,a swollen arm,0.0,240.0,252.0,swollen legs,M7989,0.2274,5.7004,1
5929,https://www.gofundme.com/f/3s01f0o,ner_diseases,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,a blood clot near his heart,1.0,283.0,309.0,exposure to blood and/or body fluid,X58,0.1235,10.5322,1
5930,https://www.gofundme.com/f/3s01f0o,ner_diseases,none,sbiobert_base_cased_mli,sbiobertresolve_icd10cm_augmented,the blood clot,2.0,470.0,483.0,blood clot in eye (finding),H578,0.2178,8.0489,1


# Conclusions
- The "sent_with_ner" document reconstruction was coded incorrectly, so the >2x NER term counts it shows in the evaluation table are an artifact of that error rather than a real increase in NER terms.
- However, document reconstructions do not seem to meaningfully impact finding NER terms or entity resolution, so we will use the original.
- "ner_clinical" identifies too many superfluous terms.
- "ner_diseases" seems to pick up important information with minimal superfluous terms
- The sbiobert-based entity resolution (`sbiobertresolve_icd10cm_augmented`) will work the best.


### Final approach
- NER: ner_diseases
- no document reconstruction
- sbiobert embeddings and entity resolution