### Import packages

In [118]:
import pandas as pd
import numpy as np

### Import utilities

In [2]:
import sys
sys.path.append('../src/')
import utils

z_recode_stems = utils.z_recode_stems
Recode_z = utils.Recode_z
most_common = utils.most_common

### Import data

In [119]:
ccsr = pd.read_csv('../data/ccsr/disease_hierarchy.csv')

In [120]:
er = pd.read_csv('../data/disease_categorization/evaluation/er_3-6_more_models.csv')

In [121]:
er = er[(er['ner_model'] == 'ner_jsl') & (er['er_model'] == 'sbiobertresolve_icd10cm_augmented')]

In [122]:
# NOTE(review): this reassigns `er`, so the frame read from
# er_3-6_more_models.csv and filtered on ner_model / er_model in the two
# preceding cells is discarded and never used below — confirm which input
# file this evaluation is meant to run on.
er = pd.read_csv('../data/eval_clean_text_er_nosymp.csv')

### Inspect data

In [123]:
ccsr.head()

Unnamed: 0,icd_10_cm_code,icd_10_cm_code_desc,ccsr_category,disease_category
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",Intestinal infection,Gastrointestinal diseases
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",Intestinal infection,Gastrointestinal diseases
2,A009,"Cholera, unspecified",Intestinal infection,Gastrointestinal diseases
3,A0100,"Typhoid fever, unspecified",Intestinal infection,Gastrointestinal diseases
4,A0101,Typhoid meningitis,Meningitis,Nervous system diseases


In [124]:
er.head()

Unnamed: 0,url,chunks,begin,end,sent,code,results,resolutions,res_distances
0,https://www.gofundme.com/f/please-help-karens-...,mental illness,154.0,167.0,0,F99,F99:::F989:::Z818:::F069:::F489:::F818:::R4182...,mental illness:::mental disease:::fh - mental ...,0.0000:::4.2922:::6.1046:::6.4576:::6.6951:::6...
1,https://www.gofundme.com/f/please-help-karens-...,infection,350.0,358.0,2,P399,P399:::B99:::P369:::B999:::H6039:::B889:::K137...,infection:::infectious disease:::clinical infe...,0.0000:::7.7580:::8.0403:::8.0569:::8.2141:::8...
2,https://www.gofundme.com/f/please-help-karens-...,food allergies,372.0,385.0,2,T781,T781:::T781X:::Z9101:::Z9102:::Z910:::K522:::Z...,food allergy:::allergy to food:::food allergy ...,2.6654:::4.4447:::4.5697:::5.2001:::5.5752:::5...
3,https://www.gofundme.com/f/please-help-karens-...,mental illness,412.0,425.0,2,F99,F99:::F989:::Z818:::F069:::F489:::F818:::R4182...,mental illness:::mental disease:::fh - mental ...,0.0000:::4.2922:::6.1046:::6.4576:::6.6951:::6...
4,https://www.gofundme.com/f/please-help-karens-...,infection,566.0,574.0,3,P399,P399:::B99:::P369:::B999:::H6039:::B889:::K137...,infection:::infectious disease:::clinical infe...,0.0000:::7.7580:::8.0403:::8.0569:::8.2141:::8...


In [125]:
er['url'].nunique()

171

In [13]:
# er_urls_input = er['url'].tolist()

In [14]:
# er.dropna(axis = 0, how = 'any', inplace = True)

In [15]:
# er['url'].nunique()

148

In [126]:
er.rename(columns={'code':'entity_code'}, inplace=True)

### Recode "family history of" codes (those starting with Z8) to the second code in results

In [14]:
#takes a little while
recode_fh_map = Recode_z(er, z_recode_stems)

In [15]:
# Collapse the recode records to one entry per code: for each 'code', the
# most frequent value (as chosen by most_common) is kept.
recode_fh_map_lofd = (
    pd.DataFrame(recode_fh_map)
    .groupby('code')
    .agg(lambda values: most_common(list(values)))
    .reset_index()
    .to_dict(orient='records')
)

# Flatten the records into a {code: res} lookup.
recode_fh_map_dict = {record['code']: record['res'] for record in recode_fh_map_lofd}

In [16]:
#replace values
# Assign the result back to the column rather than calling an in-place method
# on the `er['entity_code']` selection: the in-place form is chained
# assignment, which warns on recent pandas and leaves `er` unchanged when
# Copy-on-Write is enabled.
er['entity_code'] = er['entity_code'].replace(recode_fh_map_dict)

### Match ICD codes from entity resolution with categories from CCSR

In [133]:
raw_codes = er['entity_code'].unique().tolist()

In [134]:
ccsr_codes = ccsr['icd_10_cm_code'].tolist()

In [135]:
def StemCode(code, ccsr):
    """Find the longest prefix of ``code`` shared with CCSR codes and its most common category.

    The code is shortened one trailing character at a time until at least one
    value in ``ccsr['icd_10_cm_code']`` starts with it. The most frequent
    ``ccsr_category`` among the matching rows is then returned.

    Parameters
    ----------
    code : str
        Code to look up. Non-string values (e.g. NaN) and the empty string are
        treated as "no match" instead of raising or matching every row.
    ccsr : pandas.DataFrame
        Frame with ``icd_10_cm_code`` and ``ccsr_category`` columns.

    Returns
    -------
    tuple
        ``(matched_stem, ccsr_category)``, or ``(np.nan, np.nan)`` when no
        prefix of ``code`` matches (in which case ``'error'`` is printed).
    """
    # A missing / non-string code has no characters to strip and cannot be
    # passed to str.startswith.
    if not isinstance(code, str):
        print('error')
        return np.nan, np.nan

    known_codes = ccsr['icd_10_cm_code']

    # An empty prefix would match every row, so stop once the code is exhausted.
    while len(code) > 0:
        # na=False: rows with a missing CCSR code never match; otherwise the
        # mask would contain NA and could not be used for boolean indexing.
        sub = ccsr[known_codes.str.startswith(code, na=False)]
        if sub.shape[0] > 0:
            counts = sub['ccsr_category'].value_counts()
            # value_counts skips missing categories; only accept this stem if
            # at least one real category is attached to it.
            if len(counts) > 0:
                return code, counts.index[0]
        code = code[:-1]

    print('error')
    return np.nan, np.nan

In [136]:
def GetCodeMatch(raw_codes, ccsr_codes, ccsr):
    """Find a best-match CCSR stem for every raw code that has no exact CCSR match.

    Parameters
    ----------
    raw_codes : iterable
        Codes to check.
    ccsr_codes : iterable
        Codes that exist in CCSR as-is; raw codes found here are skipped.
    ccsr : pandas.DataFrame
        CCSR table, passed through to ``StemCode``.

    Returns
    -------
    list of dict
        One ``{'raw_code', 'matched_code_stem', 'matched_ccsr_category'}``
        record per raw code that is not an exact match, in input order.
    """
    # Build the lookup once: testing membership against the original list
    # rescans every CCSR code for every raw code.
    exact_codes = set(ccsr_codes)

    #define results container for best match
    best_match = []

    #loop through raw codes
    for raw in raw_codes:
        #if raw code exists in CCSR as-is, the exact match is used downstream
        if raw in exact_codes:
            continue

        #see if there are any codes that start with code
        #if not, remove final character, try again
        #once codes are found, select the ccsr_category that is most common
        matched_code, ccsr_cat = StemCode(raw, ccsr)
        best_match.append({'raw_code': raw,
                           'matched_code_stem': matched_code,
                           'matched_ccsr_category': ccsr_cat})

    return best_match

In [137]:
best_match_map = GetCodeMatch(raw_codes, ccsr_codes, ccsr)

In [138]:
# One row per CCSR category with its broader grouping(s). 'int_category' is
# not among the ccsr columns shown by ccsr.head(), so it is selected only when
# the loaded file actually has it; selecting a missing column raises KeyError.
category_cols = [col for col in ['ccsr_category', 'int_category', 'disease_category']
                 if col in ccsr.columns]
ccsr_category_lookup = ccsr[category_cols].drop_duplicates('ccsr_category')

# Passing explicit columns keeps the merge keys present when there are no
# best matches, and makes the cell re-runnable: on a second run the already
# merged columns are dropped before merging again.
best_match_map = pd.DataFrame(best_match_map,
                              columns=['raw_code', 'matched_code_stem', 'matched_ccsr_category'])
best_match_map = best_match_map.merge(ccsr_category_lookup,
                                      left_on='matched_ccsr_category',
                                      right_on='ccsr_category',
                                      how='left')

In [139]:
best_match_map.sample(5).head()

Unnamed: 0,raw_code,matched_code_stem,matched_ccsr_category,ccsr_category,disease_category
34,C950,C950,Leukemia - all other types,Leukemia - all other types,Neoplasms
39,S3723,S3723,"Internal organ injury, initial encounter","Internal organ injury, initial encounter",Injuries and external causes
73,I70,I70,Peripheral and visceral vascular disease,Peripheral and visceral vascular disease,Cardiovascular diseases
25,R688,R688,Other general signs and symptoms,Other general signs and symptoms,Nonspecific
23,V892,V892,Unacceptable PDX,Unacceptable PDX,Nonspecific


### Replace entity codes

In [140]:
#subset dataframe for those that need best matched data
er_for_best_match = er[er['entity_code'].isin(best_match_map['raw_code'].tolist())]
er_rest = er[~er['entity_code'].isin(er_for_best_match['entity_code'].tolist())]

In [141]:
#merge ER data with appropriate disease categories

#best match: attach the stem-matched categories, drop the helper columns,
#tag the rows with a placeholder description, and rename the key so the
#columns line up with the exact-match frame
er_for_best_match = (
    er_for_best_match
    .merge(best_match_map, left_on='entity_code', right_on='raw_code', how='left')
    .drop(columns=['matched_code_stem', 'matched_ccsr_category'])
    .assign(icd_10_cm_code_desc='best_match')
    .rename(columns={'raw_code': 'icd_10_cm_code'})
)

#exact match: codes present in CCSR as-is pick up their row directly
er_rest = er_rest.merge(ccsr, how='left', left_on='entity_code', right_on='icd_10_cm_code')

In [142]:
#concatenate dataframes
er_final = pd.concat([er_for_best_match, er_rest])

In [143]:
print(er_final.shape)
print(er.shape)
print('\n')
print(er_final['url'].nunique())
print(er['url'].nunique())

(664, 13)
(664, 9)


171
171


### Get disease categories by url

In [144]:
counts_list = er_final[['url','disease_category']].groupby('url').agg(lambda x: list(x))

In [145]:
counts_list = counts_list.reset_index()

### Export data with broader categories for comparison

In [30]:
dh = pd.read_excel('../not_currently_using/disease_hierarchy/disease_hierarchy.xlsx')

In [33]:
er_dh = er_final.merge(dh, left_on='ccsr_category', right_on='Level 3', how='left')

In [37]:
er_dh = er_dh.drop_duplicates('target_text')

In [38]:
er_dh.to_csv('../data/er_dh.csv', index=False)

### Format data for comparison with reference set

In [146]:
def list2lofd(df, aliases=None):
    '''Convert each row's disease categories into per-category 0/1 indicator dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'disease_category' column. Each cell holds the categories
        found for that row, normally as a list; a delimited string is also
        accepted and is searched by substring.
    aliases : dict, optional
        Maps category names as they appear in the data to the names used in
        the fixed category list. Defaults to mapping
        'Mental, behavioral, and substance use disorders' to
        'Mental and substance use disorders'. Without this the mental-health
        category can never be flagged, because the name in the data differs
        from the name in the list.

    Returns
    -------
    list of list of dict
        One inner list per row, holding one single-key dict per category in
        the fixed order below: {category: 1} if present for the row, else
        {category: 0}.
    '''

    disease_categories = [
        'Cardiovascular diseases',
        'Endocrine diseases',
        'Gastrointestinal diseases',
        'Genitourinary diseases',
        'Infections',
        'Injuries and external causes',
        'Mental and substance use disorders',
        'Musculoskeletal diseases',
        'Neoplasms',
        'Nervous system diseases',
        'Nonspecific',
        'Other noncommunicable diseases',
        'Pregnancy and childbirth',
        'Respiratory diseases'
    ]

    if aliases is None:
        aliases = {
            'Mental, behavioral, and substance use disorders': 'Mental and substance use disorders'
        }

    r = []

    for found in df['disease_category']:
        if isinstance(found, str):
            # Keep the substring semantics `in` has for a delimited string.
            for data_name, list_name in aliases.items():
                found = found.replace(data_name, list_name)
        else:
            # Non-string entries (e.g. NaN for unmatched codes) are left as-is.
            found = [aliases.get(name, name) if isinstance(name, str) else name
                     for name in found]

        r.append([{disease: 1 if disease in found else 0} for disease in disease_categories])

    return r

In [147]:
counts_list['lofd'] = list2lofd(counts_list)

In [148]:
counts_list.head()

Unnamed: 0,url,disease_category,lofd
0,https://www.gofundme.com/f/1zk33nyftc,"[Nonspecific, Nonspecific, Mental, behavioral,...","[{'Cardiovascular diseases': 0}, {'Endocrine d..."
1,https://www.gofundme.com/f/22efh5hg,"[Other noncommunicable diseases, Musculoskelet...","[{'Cardiovascular diseases': 0}, {'Endocrine d..."
2,https://www.gofundme.com/f/22kcir7teo,"[Nonspecific, Cardiovascular diseases, Nonspec...","[{'Cardiovascular diseases': 1}, {'Endocrine d..."
3,https://www.gofundme.com/f/24yze8hw,"[Nonspecific, Neoplasms]","[{'Cardiovascular diseases': 0}, {'Endocrine d..."
4,https://www.gofundme.com/f/279cd9g,"[Neoplasms, Neoplasms]","[{'Cardiovascular diseases': 0}, {'Endocrine d..."


In [149]:
#convert to long format
long = counts_list.explode('lofd')

In [150]:
# Each 'lofd' cell is a single-entry dict; pull its first (key, value) pair
# out into separate 'cat' and 'count' columns.
first_items = [list(entry.items())[0] for entry in long['lofd']]
long.insert(loc=2, column='cat', value=[key for key, _ in first_items])
long.insert(loc=3, column='count', value=[value for _, value in first_items])

In [151]:
long = long[['url','cat','count']]

In [152]:
# NOTE(review): at this point `long` only has 'url', 'cat' and 'count', and
# 'cat' holds the keys emitted by list2lofd (names from its fixed category
# list), so this replace appears to match nothing — confirm whether the rename
# was meant to be applied to 'disease_category' before list2lofd runs.
long = long.replace('Mental, behavioral, and substance use disorders', 'Mental and substance use disorders')

In [153]:
#create pivot tables
pivot = long.pivot_table(index=['url'], columns='cat', values='count', aggfunc=(pd.Series.sum), fill_value=0)

In [154]:
pivot.columns.name = ''
pivot = pivot.reset_index()

In [155]:
pivot.head()

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Nonspecific,Other noncommunicable diseases,Pregnancy and childbirth,Respiratory diseases
0,https://www.gofundme.com/f/1zk33nyftc,0,0,0,0,0,0,1,0,0,0,1,0,0,0
1,https://www.gofundme.com/f/22efh5hg,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2,https://www.gofundme.com/f/22kcir7teo,1,0,1,1,1,0,0,0,0,0,1,0,0,0
3,https://www.gofundme.com/f/24yze8hw,0,0,0,0,0,0,0,0,1,0,1,0,0,0
4,https://www.gofundme.com/f/279cd9g,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Import reference data

In [185]:
ref = pd.read_csv('../data/disease_categorization/evaluation/reference_set_any_pivot.csv')

In [186]:
ref.rename(columns={'Genitourinary disorders':'Genitourinary diseases'}, inplace=True)

In [187]:
ref.head()

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Nonspecific,Other noncommunicable diseases,Pregnancy and childbirth,Respiratory diseases
0,https://www.gofundme.com/f/1v3emdk740,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,https://www.gofundme.com/f/1xizs9mpo0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,https://www.gofundme.com/f/1zk33nyftc,0,1,0,0,0,0,1,0,0,0,0,0,0,0
3,https://www.gofundme.com/f/22efh5hg,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,https://www.gofundme.com/f/22kcir7teo,1,0,0,1,1,0,0,1,0,0,0,0,0,1


In [188]:
ref_urls = ref['url']

In [189]:
ref.shape

(200, 15)

### Add blank rows for urls with no identified NER/ER terms

In [190]:
# Reference urls that produced no row in `pivot` (no NER/ER terms found).
# NOTE(review): the recorded KeyError: 'url' is consistent with this cell
# having been re-run after the later cell that deletes pivot['url'] — confirm
# it runs cleanly top to bottom on a fresh kernel.
ref_url_series = pd.Series(ref_urls)
in_pivot = ref_url_series.isin(pivot['url'].tolist())
urls_to_add = ref_url_series[~in_pivot].tolist()

KeyError: 'url'

In [163]:
# Every pivot column after the first gets a 0 for urls that need a blank row.
cols = pivot.columns[1:].values.tolist()
r = [{'url': url, **{col: 0 for col in cols}} for url in urls_to_add]

In [164]:
df_to_add = pd.DataFrame(r)

In [165]:
pivot = pd.concat([pivot, df_to_add])

In [166]:
pivot = pivot.sort_values('url')

### Check that ER and reference urls are aligned

In [167]:
ref['url'].equals(pivot['url'])

False

In [168]:
ref[~ref['url'].isin(pivot['url'])]

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Nonspecific,Other noncommunicable diseases,Pregnancy and childbirth,Respiratory diseases


In [169]:
pivot[~pivot['url'].isin(ref['url'])]

Unnamed: 0,url,Cardiovascular diseases,Endocrine diseases,Gastrointestinal diseases,Genitourinary diseases,Infections,Injuries and external causes,Mental and substance use disorders,Musculoskeletal diseases,Neoplasms,Nervous system diseases,Nonspecific,Other noncommunicable diseases,Pregnancy and childbirth,Respiratory diseases
63,https://www.gofundme.com/f/esperanza-para-mi-v...,0,0,0,0,0,0,0,0,1,0,0,1,0,0


In [170]:
ref = ref[ref['url'].isin(pivot['url'])]
pivot = pivot[pivot['url'].isin(ref['url'])]

In [171]:
ref.shape

(200, 15)

In [172]:
pivot.shape

(200, 15)

In [173]:
ref = ref.reset_index()
pivot = pivot.reset_index()

In [174]:
ref['url'].equals(pivot['url'])

True

In [747]:
more_cats = pd.read_csv('../data/more_cats.csv')

In [748]:
more_cats.head()

Unnamed: 0,url,gu,resp,cv,onc,msk
0,https://www.gofundme.com/f/dawns-breast-cancer...,0,0,0,1,0
1,https://www.gofundme.com/f/dqgsnc24,0,1,0,0,0
2,https://www.gofundme.com/f/NathanaelRodriguezR...,1,0,0,0,0
3,https://www.gofundme.com/f/dxv7w-helping-with-...,0,1,0,0,0
4,https://www.gofundme.com/f/be-one-lolo-rey039s...,1,0,0,0,0


In [749]:
# Short column names in `more_cats` and the `pivot` category column each one
# corresponds to.
more_cat_columns = {
    'gu': 'Genitourinary diseases',
    'resp': 'Respiratory diseases',
    'cv': 'Cardiovascular diseases',
    'onc': 'Neoplasms',
    'msk': 'Musculoskeletal diseases',
}

# Index more_cats by url once, keeping the first row per url, instead of
# rebuilding the url list and filtering the frame on every iteration.
more_cats_by_url = more_cats.drop_duplicates('url').set_index('url')

urls = []
flags = {short_name: [] for short_name in more_cat_columns}

for index, row in pivot.iterrows():
    url = row['url']
    if url not in more_cats_by_url.index:
        continue

    urls.append(url)

    # Flag 1 only where more_cats marks the category (non-zero) but pivot has
    # it as 0; every other combination is recorded as 0.
    for short_name, category in more_cat_columns.items():
        marked_in_more_cats = more_cats_by_url.at[url, short_name] != 0
        absent_in_pivot = row[category] == 0
        flags[short_name].append(1 if (marked_in_more_cats and absent_in_pivot) else 0)

# Keep the per-category list names that the following cell reads.
gus = flags['gu']
resps = flags['resp']
cvs = flags['cv']
oncs = flags['onc']
msks = flags['msk']
            
            

In [750]:
to_add = pd.DataFrame({'url':urls,'gu':gus,'resp':resps,'cv':cvs,'onc':oncs,'msk':msks})

In [751]:
to_add

Unnamed: 0,url,gu,resp,cv,onc,msk
0,https://www.gofundme.com/f/Mizmig,1,0,0,0,0
1,https://www.gofundme.com/f/TeamDanica-Monica,0,0,0,1,0
2,https://www.gofundme.com/f/WestonGernentz,0,0,0,0,0
3,https://www.gofundme.com/f/bailey-finch-medica...,0,0,1,0,0
4,https://www.gofundme.com/f/barry039s-medical-fees,1,0,0,0,0
5,https://www.gofundme.com/f/evp227-fernando-diaz,0,0,0,0,0
6,https://www.gofundme.com/f/javier-montenegros-...,1,0,0,0,0
7,https://www.gofundme.com/f/s7wc7c,1,0,0,0,0
8,https://www.gofundme.com/f/taylor-raye-heart-w...,0,0,0,0,0


In [752]:
# Apply every flag in `to_add` to the matching category column of `pivot`.
# This replaces per-row assignments keyed on hard-coded positions in `to_add`,
# which silently go stale whenever the input data (and therefore the row order
# or the set of flagged urls) changes. For the `to_add` table displayed above
# it sets exactly the same cells.
to_add_columns = {
    'gu': 'Genitourinary diseases',
    'resp': 'Respiratory diseases',
    'cv': 'Cardiovascular diseases',
    'onc': 'Neoplasms',
    'msk': 'Musculoskeletal diseases',
}

for short_name, category in to_add_columns.items():
    flagged_urls = to_add.loc[to_add[short_name] == 1, 'url']
    pivot.loc[pivot['url'].isin(flagged_urls), category] = 1

In [175]:
ref_urls = ref['url']

del ref['url']
del pivot['url']

In [176]:
del ref['index']
del pivot['index']

### Compute metrics

In [191]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report

In [1]:
er_dh = pd.read_csv('../data/er_dh.csv')

NameError: name 'pd' is not defined

In [200]:
y_true = er_dh['level_2_eval'].tolist()
y_pred = er_dh['Level 2'].tolist()

In [201]:
print(classification_report(y_true, y_pred))

                                                                 precision    recall  f1-score   support

                            Abnormal findings without diagnosis       0.00      0.00      0.00         0
                                                  Breast cancer       1.00      0.67      0.80         6
                                                           COPD       1.00      1.00      1.00         3
                   Cardiac and circulatory congenital anomalies       1.00      1.00      1.00         2
                                      Chromosomal abnormalities       0.75      1.00      0.86         3
                                         Chronic kidney disease       1.00      1.00      1.00         1
                                 Circulatory signs and symptoms       0.00      0.00      0.00         0
                                        Coronary artery disease       1.00      1.00      1.00         1
                                                      

In [177]:
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

In [178]:
# Per-category metrics comparing the reference labels (`ref`) with the
# pipeline output (`pivot`), column by column.
cat = []
precision = []
recall = []
f1 = []
acc = []
refe = []
piv = []
for col in pivot.columns:
    ref_col = ref[col]
    pivot_col = pivot[col]

    cat.append(col)
    # zero_division=0 states explicitly what happens when a category has no
    # predicted (or no reference) positives: the score is 0, which is what the
    # default does, but without emitting an undefined-metric warning.
    precision.append(round(precision_score(ref_col, pivot_col, zero_division=0), 2))
    recall.append(round(recall_score(ref_col, pivot_col, zero_division=0), 2))
    f1.append(round(f1_score(ref_col, pivot_col, zero_division=0), 2))
    acc.append(round(accuracy_score(ref_col, pivot_col), 2))
    # Number of urls labelled with this category in each set.
    refe.append(sum(ref_col))
    piv.append(sum(pivot_col))

  _warn_prf(average, modifier, msg_start, len(result))


In [179]:
df = pd.DataFrame({'category':cat, 
                   'ref_count':refe,
                   'test_count':piv,
              'precision':precision,
              'recall':recall,
             'f1':f1,
            'acc':acc})

In [435]:
df

Unnamed: 0,category,ref_count,test_count,precision,recall,f1,acc
0,Cardiovascular diseases,31,17,0.94,0.52,0.67,0.92
1,Endocrine diseases,12,5,1.0,0.42,0.59,0.96
2,Gastrointestinal diseases,9,4,0.5,0.22,0.31,0.95
3,Genitourinary diseases,21,13,1.0,0.62,0.76,0.96
4,Infections,16,15,0.73,0.69,0.71,0.95
5,Injuries and external causes,29,10,1.0,0.34,0.51,0.9
6,Mental and substance use disorders,17,13,0.85,0.65,0.73,0.96
7,Musculoskeletal diseases,19,12,1.0,0.63,0.77,0.96
8,Neoplasms,79,77,0.96,0.94,0.95,0.96
9,Nervous system diseases,36,18,0.83,0.42,0.56,0.88


In [180]:
#remove categories that will not be included in further analysis
df = df[~df['category'].isin(['Nonspecific','Pregnancy and childbirth','Other noncommunicable diseases'])]

In [666]:
print('Weighted precision:', np.average(df['precision'], weights=df['ref_count']))
print('Weighted recall:', np.average(df['recall'], weights=df['ref_count']))
print('Weighted f1:', np.average(df['f1'], weights=df['ref_count']))
print('Weighted accuracy:', np.average(df['acc'], weights=df['ref_count']))

Weighted precision: 0.9081754385964912
Weighted recall: 0.6071228070175438
Weighted f1: 0.7077894736842105
Weighted accuracy: 0.936561403508772


In [309]:
print('Weighted precision:', np.average(df['precision'], weights=df['ref_count']))
print('Weighted recall:', np.average(df['recall'], weights=df['ref_count']))
print('Weighted f1:', np.average(df['f1'], weights=df['ref_count']))
print('Weighted accuracy:', np.average(df['acc'], weights=df['ref_count']))

Weighted precision: 0.9237894736842104
Weighted recall: 0.6110526315789473
Weighted f1: 0.7123859649122807
Weighted accuracy: 0.936561403508772


In [437]:
print('Weighted precision:', np.average(df['precision'], weights=df['ref_count']))
print('Weighted recall:', np.average(df['recall'], weights=df['ref_count']))
print('Weighted f1:', np.average(df['f1'], weights=df['ref_count']))
print('Weighted accuracy:', np.average(df['acc'], weights=df['ref_count']))

Weighted precision: 0.9114736842105262
Weighted recall: 0.6294035087719297
Weighted f1: 0.7240701754385963
Weighted accuracy: 0.9385614035087719


In [647]:
print('Weighted precision:', np.average(df['precision'], weights=df['ref_count']))
print('Weighted recall:', np.average(df['recall'], weights=df['ref_count']))
print('Weighted f1:', np.average(df['f1'], weights=df['ref_count']))
print('Weighted accuracy:', np.average(df['acc'], weights=df['ref_count']))

Weighted precision: 0.8176842105263159
Weighted recall: 0.7190877192982457
Weighted f1: 0.756140350877193
Weighted accuracy: 0.94


In [758]:
print('Weighted precision:', np.average(df['precision'], weights=df['ref_count']))
print('Weighted recall:', np.average(df['recall'], weights=df['ref_count']))
print('Weighted f1:', np.average(df['f1'], weights=df['ref_count']))
print('Weighted accuracy:', np.average(df['acc'], weights=df['ref_count']))

Weighted precision: 0.8171228070175439
Weighted recall: 0.7363508771929825
Weighted f1: 0.7673333333333333
Weighted accuracy: 0.9414736842105262


In [759]:
df

Unnamed: 0,category,ref_count,test_count,precision,recall,f1,acc
0,Cardiovascular diseases,31,26,0.88,0.74,0.81,0.94
1,Endocrine diseases,12,9,0.78,0.58,0.67,0.96
2,Gastrointestinal diseases,9,7,0.43,0.33,0.38,0.95
3,Genitourinary diseases,21,14,0.93,0.62,0.74,0.95
4,Infections,16,23,0.65,0.94,0.77,0.95
5,Injuries and external causes,29,30,0.67,0.69,0.68,0.9
6,Mental and substance use disorders,17,18,0.56,0.59,0.57,0.92
7,Musculoskeletal diseases,19,19,0.63,0.63,0.63,0.93
8,Neoplasms,79,79,0.96,0.96,0.96,0.97
9,Nervous system diseases,36,22,0.86,0.53,0.66,0.9


In [181]:
print('Weighted precision:', np.average(df['precision'], weights=df['ref_count']))
print('Weighted recall:', np.average(df['recall'], weights=df['ref_count']))
print('Weighted f1:', np.average(df['f1'], weights=df['ref_count']))
print('Weighted accuracy:', np.average(df['acc'], weights=df['ref_count']))

Weighted precision: 0.8193684210526316
Weighted recall: 0.732280701754386
Weighted f1: 0.7629473684210526
Weighted accuracy: 0.9454736842105262
