This is the function we use to add a flag column to a bucket of examples:

```
CREATE OR REPLACE FUNCTION add_qualifier_flag_column_to_bucket(
    my_bucket integer,
    my_qualifier_id text
) RETURNS table (
    bucket integer,
    pmid text,
    paragraph_number integer,
    vector vector,
    flag integer
)
    AS 'with 
q as (
    select * from qualifier where qualifier_id=my_qualifier_id
)
,
be as (
    select pb.bucket, e.pmid, e.paragraph_number, e.vector 
        from embedding e join pmid_bucket pb on e.pmid = pb.pmid where pb.bucket in (my_bucket)
        order by bucket, pmid, paragraph_number
)
select be.bucket, be.pmid, be.paragraph_number, be.vector, case when q.qualifier_id is null then 0 else 1 end as flag
    from be left join q on be.pmid = q.pmid;'
    LANGUAGE SQL
    STABLE
    RETURNS NULL ON NULL INPUT
;

-- select * from add_qualifier_flag_column_to_bucket(2, 'Q000401');

```

In [1]:
import os
import regex
import time
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine
from credentials import pmc_credentials # a dict defined in a python file in the current directory

connection_string = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}'.format(**pmc_credentials)

engine = create_engine(connection_string)

# pd.read_sql("select * from paragraph order by pmid, paragraph_number limit 5", con=engine)

In [2]:
qualifier_detail = pd.read_sql("select * from qualifier_detail", con=engine)
qualifier_detail

Unnamed: 0,id,name
0,Q000378,metabolism
1,Q000201,enzymology
2,Q000648,ultrastructure
3,Q000235,genetics
4,Q000175,diagnosis
...,...,...
71,Q000493,pharmacokinetics
72,Q000737,chemistry
73,Q000821,virology
74,Q000941,ethics


In [3]:
from sklearn.linear_model import LogisticRegressionCV

get_mean_xval_score_for_binary_classifier = lambda clf: np.mean([np.max(v) for v in clf.scores_[True]])
# !!! should take the scores from the final C value, not the best of each one.
# We want the highest mean xval score across the folds, not the mean of the highest scores acrtoss the Cs.

qualifier_models = {}
qualifier_id_to_name = {}

for row in qualifier_detail.to_dict(orient='records'):
    print(row['name'], end =" ")
    if row['id'] in qualifier_models.keys():
        print("(model already trained)")
    else:
        try:
            qualifier_id_to_name[row['id']] = row['name']
            sql=f"select * from add_qualifier_flag_column_to_bucket(0, '{row['id']}');"
            df = pd.read_sql(sql, con=engine)
            df['vector'] = [eval(v) for v in df['vector']]
        
            X_train = df['vector'].tolist()
            y_train = df['flag'].values
            
            clf = LogisticRegressionCV(cv=5, scoring='roc_auc', n_jobs=-1, max_iter=10000)
            clf.fit(X_train, y_train)
        
            qualifier_models[ row['id'] ] = clf;
        
            print( round(get_mean_xval_score_for_binary_classifier(clf), 5) )
        except Exception as error:
            print("An error occurred:", type(error).__name__, error)


metabolism 0.87417
enzymology 0.8656
ultrastructure 0.85712
genetics 0.86288
diagnosis 0.83806
therapeutic use 0.87682
drug therapy 0.81216
mortality 0.92091
trends 0.88658
immunology 0.91834
complications 0.81813
therapy 0.83331
prevention & control 0.81466
etiology 0.76365
cytology 0.89292
drug effects 0.85667
analysis 0.7567
physiology 0.75609
surgery 0.91091
injuries 0.93465
administration & dosage 0.80721
adverse effects 0.79007
classification 0.88234
blood 0.88607
methods 0.68645
standards 0.85754
pharmacology 0.87366
deficiency 0.87824
analogs & derivatives 0.79792
pathology 0.8336
toxicity 0.88093
anatomy & histology 0.83472
innervation 0.97842
microbiology 0.91595
embryology 0.94042
abnormalities 0.94034
chemically induced 0.90108
congenital 0.99499
blood supply 0.92165
physiopathology 0.80808
diagnostic imaging 0.88237
psychology 0.92508
growth & development 0.89528
instrumentation 0.87041
antagonists & inhibitors 0.86915
isolation & purification 0.86705
transplantation 0.927



0.98775
veterinary 0.94739
education 0.95111
rehabilitation 0.99834
ethnology 0.95759
organization & administration 0.9411
supply & distribution 0.999
cerebrospinal fluid 0.97406
pathogenicity 0.90354
secondary 0.97333
parasitology 



0.9476
economics 0.92383
legislation & jurisprudence 0.97892
statistics & numerical data 0.85847
radiotherapy 0.96751
radiation effects 0.91986
nursing 0.99767
history 0.98867
diet therapy 0.99561
pharmacokinetics 0.85832
chemistry 0.86392
virology 0.93523
ethics 0.97757
agonists 0.95866


In [None]:
# qualifier_detail[qualifier_detail['name']=='urine']  # Q000652 	urine, Q000506 	poisoning
# 'Q000506' in qualifier_models.keys()

In [12]:
import pickle

models_file = r'qualifier_models.pickle'
with open(models_file, 'wb') as pfh:
    pickle.dump(qualifier_models, pfh, pickle.HIGHEST_PROTOCOL)

In [14]:
qualifier_names_file = 'qualifier_names.pickle'

with open(qualifier_names_file, 'wb') as pfh:
    pickle.dump(qualifier_id_to_name, pfh, pickle.HIGHEST_PROTOCOL)