In [1]:
import ast
import os
import pickle
import time

import numpy as np
import pandas as pd
import psycopg2
import regex
from sklearn.linear_model import LogisticRegressionCV
from sqlalchemy import create_engine

from credentials import pmc_credentials # a dict defined in a python file in the current directory

# Fill the SQLAlchemy URL template from the credential mapping, then open the
# engine that the rest of the notebook passes to pandas.
# NOTE(review): the values are substituted verbatim; a password containing
# characters such as '@' or '/' would need URL-escaping — confirm credentials are safe.
connection_string = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}'.format_map(pmc_credentials)

engine = create_engine(connection_string)


def get_mean_xval_score_for_binary_classifier(clf):
    """Mean cross-validation score at the regularization strength the classifier selected.

    Parameters
    ----------
    clf : fitted binary classifier exposing ``Cs_``, ``C_`` and ``scores_``
        ``Cs_`` is the grid of candidate C values, ``C_`` the selected value and
        ``scores_`` a dict mapping a class label to per-fold score rows (one
        score per entry of ``Cs_``), as produced by ``LogisticRegressionCV``.

    Returns
    -------
    float
        The score in the selected C's column, averaged over the folds.

    Raises
    ------
    IndexError
        If ``C_`` does not exactly match any entry of ``Cs_``.
    KeyError
        If ``scores_`` has no ``'1'`` entry and does not hold exactly one class.
    """
    # Column of the score grid that corresponds to the selected C.
    selected_c_index = np.where(clf.Cs_ == clf.C_)[0][0]

    scores_by_class = clf.scores_
    if '1' in scores_by_class:
        fold_scores = scores_by_class['1']
    elif len(scores_by_class) == 1:
        # A binary fit stores a single entry keyed by the positive label, which
        # is not the string '1' when labels are ints or bools; use that entry.
        fold_scores = next(iter(scores_by_class.values()))
    else:
        raise KeyError(
            f"scores_ has no entry for label '1'; available labels: {list(scores_by_class)}"
        )

    return np.mean([fold[selected_c_index] for fold in fold_scores])


In [2]:
# Descriptors with more than 2000 tallied occurrences. Later cells slice this
# frame by row position to form batches, so the row order must be deterministic:
# without ORDER BY the database may return rows in any order.
descriptor_counts = pd.read_sql(
    "select * from pmc_descriptor_counts where tally > 2000 order by tally desc, descriptor_id",
    con=engine,
)
descriptor_counts

Unnamed: 0,descriptor_id,name,tally
0,D006801,Humans,869145
1,D005260,Female,416236
2,D008297,Male,377430
3,D000818,Animals,327945
4,D000328,Adult,219809
...,...,...,...
1014,D001792,Blood Platelets,2008
1015,D017667,Adipocytes,2006
1016,D020285,Cryoelectron Microscopy,2004
1017,D017024,"Chemotherapy, Adjuvant",2003


In [3]:
def get_data_for_next_10_descriptors(starting_row, bucket=0, num_examples=100):
    """Fetch the dataset for the (up to) 10 descriptors starting at ``starting_row``.

    Reads the notebook-level ``descriptor_counts`` frame to pick the descriptor
    ids and queries ``fn_get_dataset_for_descriptors`` through the notebook-level
    ``engine``.

    Parameters
    ----------
    starting_row : int
        Position in ``descriptor_counts`` of the first descriptor to include.
    bucket : int, default 0
        Passed through as the first argument of the SQL function.
    num_examples : int, default 100
        Passed through as the second argument of the SQL function.

    Returns
    -------
    pandas.DataFrame
        The query result with the ``vector`` column parsed from text into
        Python objects, the first four column names kept as returned, and the
        remaining columns renamed to the selected descriptor ids.

    Raises
    ------
    ValueError
        If no descriptors exist at ``starting_row``, or a ``vector`` value is
        not a plain Python literal.
    """
    descriptor_list = list(descriptor_counts[starting_row:(starting_row + 10)]['descriptor_id'])
    if not descriptor_list:
        # An empty list would produce SQL with a dangling comma; fail clearly instead.
        raise ValueError(f"no descriptors found at or after row {starting_row}")

    # NOTE(review): values are interpolated directly into the SQL text; this is
    # only safe while ids come from descriptor_counts and bucket/num_examples
    # are integers supplied by the notebook itself.
    dstr = ','.join(f"'{d}'" for d in descriptor_list)
    sql = f"select * from fn_get_dataset_for_descriptors({bucket}, {num_examples}, {dstr})"
    df = pd.read_sql(sql, con=engine)

    # Vectors arrive as text. literal_eval parses literals only, so a malformed
    # or hostile value raises instead of being executed as eval() would.
    df['vector'] = [ast.literal_eval(v) for v in df['vector']]

    # Keep the four leading column names and label the rest with the descriptor ids.
    df.columns = [*df.columns[0:4], *descriptor_list]
    return df

In [4]:
# Train one cross-validated logistic-regression model per descriptor, one batch
# of descriptors at a time, and pickle each batch's models to its own file.
NUM_BATCHES = 100            # number of descriptor batches to process
DESCRIPTORS_PER_BATCH = 10   # get_data_for_next_10_descriptors selects 10 rows per call
NUM_EXAMPLES = 200           # passed as num_examples and embedded in the output file name

for batch_number in range(NUM_BATCHES):
    print(f"### Batch number {batch_number} ###")
    descriptor_models = {}  # reset per batch: each pickle holds only this batch's models

    starting_row = batch_number * DESCRIPTORS_PER_BATCH
    df = get_data_for_next_10_descriptors(starting_row, bucket=0, num_examples=NUM_EXAMPLES)
    flag_cols = df.columns[4:4 + DESCRIPTORS_PER_BATCH]  # label columns follow the 4 leading columns

    # Build the feature matrix once per batch so it is not re-converted from a
    # list of lists on every fit below.
    X_train = np.array(df['vector'].tolist())
    for flag_col in flag_cols:
        y_train = df[flag_col].values
        clf = LogisticRegressionCV(cv=5, scoring='roc_auc', n_jobs=-1, max_iter=10000)
        clf.fit(X_train, y_train)
        descriptor_models[flag_col] = clf
        print(flag_col, get_mean_xval_score_for_binary_classifier(clf))

    with open(f'descriptor_models_{NUM_EXAMPLES}samples_batch{batch_number:02d}.pkl', 'wb') as f:
        pickle.dump(descriptor_models, f, pickle.HIGHEST_PROTOCOL)

### Batch number 0 ###
D006801 0.9141724477152138
D005260 0.7716954514978116
D008297 0.7692757976325661
D000818 0.947664288651662
D000328 0.7773120332827308
D008875 0.8160141799066963
D000368 0.8151357349594649
D051379 0.9303825057255389
D055815 0.7344618383885549
D000293 0.7884672496578288
### Batch number 1 ###
D002648 0.8366873022624635
D012189 0.8478552221103886
D003430 0.8212772298030661
D012307 0.783522868680707
D000369 0.7437915111543818
D016896 0.8333452709116903
D015398 0.9356663922238573
D045744 0.962535368133231
D000086382 0.8915331096225989
D011795 0.8679506694810881
### Batch number 2 ###
D011379 0.8574245113052431
D011446 0.831034013020276
D011247 0.9223192303386474
D002675 0.9031578256840069
D051381 0.8847201946397993
D004195 0.8488366116391941
D049109 0.9012592464262908
D009154 0.8863041567803034
D015415 0.7589745994081205
D013997 0.7145121571112216
### Batch number 3 ###
D008810 0.922824168713493
D007223 0.890784989456567
D000465 0.9167676958920611
D000086402 0.9057306



D008954 0.8388282505675775
D005838 0.9005031889970188
D014481 0.952192238838421
D011485 0.8766088762570285
D014408 0.9218859781626666
D000900 0.9366094160834916
D000970 0.8993803150901402
D002454 0.8894908254453876
D015658 0.9411343084297605
D059467 0.8678483766983037
### Batch number 7 ###
D008175 0.9244992810088851
D015994 0.9590589302648114
D003198 0.9075320157067359
D016207 0.907148867184597
D015870 0.7434271237766101
D000595 0.8791704922070771
D009474 0.933704250189271
D017422 0.8970382500906423
D014157 0.8163441620851299
D009765 0.9293091511642022
### Batch number 8 ###
D018345 0.8760528358359141
D000367 0.8937490112836597
D002465 0.8992236976880557
D002470 0.8508346114619607
D018384 0.8798976495763029
D001426 0.9382472915261127
D018570 0.9030298953442927
D018450 0.7877684145009995
D008969 0.8902329204996466
D020022 0.888474433140425
### Batch number 9 ###
D012984 0.8843557130486867
D003924 0.9300544070975864
D008962 0.9021807868002204
D001483 0.8585192554529553
D008807 0.9407519



D002908 0.9059740778817902
D005947 0.8868300372297048
D007700 0.902371021567947
D007668 0.8825029573277071
D044127 0.8934062112204737
D000069550 0.9553071820447132
### Batch number 21 ###
D012424 0.9405713200725284
D034741 0.8864207645568021
D006282 0.9393885031618513
D011024 0.9357759036869358
D001315 0.8814810565850877
D020411 0.9101050612288466
D004285 0.9211043902924791
D034622 0.8964346208091356
D001499 0.9115754551685142
D019008 0.9415280318520726
### Batch number 22 ###
D002118 0.8926651626235422
D004847 0.8723752406179012
D010455 0.8670467283964213
D001007 0.9601354146643386
D005347 0.8806323530825857
D011320 0.9681857404917237
D001794 0.9522996474503753
D005434 0.8741805313542347
D018507 0.9533335301423236
D014611 0.9571472442455402
### Batch number 23 ###
D005247 0.9458457412505876
D018515 0.9549854252660619
D005075 0.9625180660344796
D002289 0.9558305270366907
D015897 0.939331632960332
D008603 0.9752149954026794
D012441 0.9636798676398508
D009361 0.9568097391504853
D016328 0



D064307 0.9160134467170783
D001343 0.9191535028875635
D019587 0.931625100552331




D008957 0.9582766972184625
D059630 0.9597202593435347
D004797 0.8801187303073859
D012336 0.9183875497996146
D006367 0.9056769218345391
### Batch number 25 ###
D017423 0.8779295837760653
D000230 0.9006218765334193
D030541 0.898069638152528
D003920 0.9205371728690276
D001835 0.9239622766018893
D062085 0.9227691866351375
D000914 0.9727375828919669
D010084 0.9374424450252994
D005060 0.9153348305060479
D010190 0.8957385419851578
### Batch number 26 ###
D016571 0.9721810057956631
D005858 0.9296428036864753
D013312 0.9113194165316727
D017360 0.9478852390200512
D014779 0.9781123907795559
D001932 0.9498845667999272
D016415 0.9128137592026221
D012907 0.9559926373353578
D007328 0.9498719066146444
D004784 0.9746186295892041
### Batch number 27 ###
D013334 0.9779868232863553
D014407 0.9486766385555804




D051057 0.9371846298737021
D011634 0.9394150583918895
D016678 0.9590818118670283
D000998 0.9577117646880338
D008288 0.9457992689164266
D006624 0.9524326058297253
D001522 0.9232950671152471
D044822 0.9734339985197588
### Batch number 28 ###
D013274 0.921021396438773
D007113 0.9283409877848712
D044466 0.9246602327203481
D002294 0.9154954942526349
D012042 0.9516694701961148
D042783 0.9236389508950132
D005243 0.9599250922488242
D012367 0.9328509021534014
D001696 0.984837220100024
D012867 0.8742950857016287
### Batch number 29 ###
D008070 0.9486382809470155
D015497 0.9492773795772628
D005240 0.9000608668448171
D000073640 0.947412436253031
D002462 0.924785527739213
D007194 0.9185891914482334
D003362 0.9223102867639899
D002453 0.9384291105973166
D012987 0.9863281778702593
D018572 0.9551218727810331
### Batch number 30 ###
D053758 0.941874941773156
D056910 0.9214468945308452
D007858 0.9747520148531834
D009687 0.894710036816524
D015850 0.9226570828252629
D006761 0.9611832378830327
D006863 0.906



D005002 0.9404247362854022
### Batch number 32 ###
D007074 0.9348001019802592
D004249 0.9354757358792345
D012044 0.8457901216857324
D013548 0.9077355723857753
D003937 0.9417195613521254




D008055 0.9034771054106508
D014774 0.9588491405981514
D013329 0.9299767269369154
D011594 0.9696807670429664
D010100 0.911388871997044
### Batch number 33 ###
D001931 0.9871650988001062
D006304 0.9498856322267549
D004734 0.8789900514386494
D006706 0.8340914672680233
D004269 0.9437090206701247
D000893 0.936789839704911
D055785 0.8948138389883642
D013019 0.9613302577476487
D012313 0.8627393631883141
D004331 0.9005026155967727
### Batch number 34 ###
D047428 0.9185082581056149




D021381 0.9218352529775299
D015027 0.9327933059530024
D008403 0.9429899732839292
D007167 0.940766420298661
D009426 0.8697508255928345
D011110 0.922016959750095
D012306 0.8700480022533215
D001291 0.9629966732948297
D006657 0.9209549177904526
### Batch number 35 ###
D010300 0.93496667959451
D007814 0.9516441114134556
D016159 0.9483661925839437
D013058 0.9243733354557395
D016014 0.8388908007725264
D002645 0.9308391047815869
D003657 0.9471271730336598
D012574 0.9685942648683575
D001706 0.9270510951628408
D001523 0.9367107710366204
### Batch number 36 ###
D003131 0.9317777240929297
D004569 0.9891852877320504
D006333 0.964740964658484
D015394 0.9169914143935314
D002467 0.9031102038039919
D006293 0.9827657782269232
D050356 0.9293003573588248
D014376 0.9369989681522766
D062105 0.9236716227922692
D018797 0.9154069491255349
### Batch number 37 ###
D008545 0.9567447024627713
D007362 0.9642383969469449
D007251 0.9467524852169256
D053719 0.921649213607113
D003142 0.9690916518883915
D007333 0.971756



D003247 0.9663032931207569
D016680 0.9674825025825815
D005843 0.9113113976633873
D002352 0.9160108567287656
### Batch number 38 ###
D010146 0.9196821952343377
D040641 0.9839440118118139
D014463 0.9425397805626323
D018360 0.9477431161199114
D009042 0.933440931014807
D056004 0.9525643150450838
D021621 0.9475144138005176
D015964 0.9673517586322335
D015438 0.9497765054569858
D013030 0.9106444684886655
### Batch number 39 ###
D000222 0.9423448582601249
D006239 0.9210143395918623
D013234 0.9599718542282865
D007407 0.9119875180847199
D010290 0.9538979549006024
D057566 0.9097082388388591
D005787 0.9199397765752231
D018414 0.9706209203500096
D010342 0.9147259883310752
D009419 0.9333406571263094
### Batch number 40 ###
D017346 0.9318365763266993
D063646 0.8945065084930583
D002583 0.8850910982940443
D004739 0.8949463931315677
D004781 0.9457264726876268
D002170 0.8812410653708088
D017063 0.8664049423331679
D058750 0.9337002218260869
D009504 0.9133462898449872




D012008 0.8755255278652978
### Batch number 41 ###
D009169 0.9466484733906533
D009035 0.972572868846324
D009752 0.9365411488166563
D056726 0.9201995143168007
D017434 0.9528124732491461
D018517 0.9702594708733155
D001172 0.9282634045896879
D015496 0.9447875957218747
D009203 0.9451935167185151
D000273 0.9369400620183906
### Batch number 42 ###
D011159 0.8714227501038104
D006339 0.9640108792133854
D060825 0.9555253097599061
D024881 0.9439158195080465
D015894 0.9778194906378801
D008207 0.9820085119441317
D001249 0.9340825834463917
D008019 0.9086266367710547
D018441 0.9763268391189598




D006306 0.9132493864119027
### Batch number 43 ###




D002843 0.9179874891061439
D003906 0.9756471550714622
D001185 0.9448359118185788
D017343 0.9805213489584237
D016679 0.9652253453936218
D004636 0.9824212142077865
D019869 0.9366084722760831
D008856 0.8828182374773306
D016601 0.8678354624240349
D029721 0.9448414284977085
### Batch number 44 ###
D014943 0.935193904265258
D010963 0.9517224106905375
D009206 0.9545471205125737
D042461 0.9366474533776378
D017216 0.9758413864248563
D010944 0.9653207437837186
D005006 0.918103042740867
D011817 0.9049092220108543
D000072417 0.9442289149517015
D013624 0.9077358180426357
### Batch number 45 ###
D007902 0.9339079095929039
D004522 0.8919885350840033
D007492 0.8625000128458797
D005638 0.9473564964138295
D018805 0.9178226563252544
D000704 0.8316246367785437
D004359 0.9155724069161992
D055432 0.9424628515658735
D017144 0.9602412766174627




D004198 0.893121947433122
### Batch number 46 ###
D051436 0.9593116088052481
D008322 0.8794589727593308
D018745 0.978782211576213
D002851 0.8775928388608685
D020935 0.8966508409984103
D013211 0.9445007553811667
D066246 0.9116882689315249
D002986 0.9297992930655328
D004777 0.9117939837172513
D014280 0.9355442939332491
### Batch number 47 ###
D048868 0.8975138045941323
D032383 0.9612181647170143




D002244 0.8620383609545778
D005227 0.8647418897526424
D012275 0.9225761754931684
D016923 0.8823780795273795
D057231 0.9745216873595968
D007501 0.880321228040722
D000284 0.9221189564250297
D000596 0.8390170683619825
### Batch number 48 ###
D003110 0.913922205247788
D007413 0.9039342393415172
D019562 0.9568503648149616
D050177 0.9600815775242421
D009043 0.9399181344603988
D058977 0.9091985099095503
D000208 0.8786280183685706
D005810 0.923079287051151
D012098 0.9317642404255952
D018836 0.8953836334650032
### Batch number 49 ###
D014764 0.9517545788994077
D000383 0.9766122197098364
D008213 0.9472824911005256
D002784 0.9094564155621712
D014945 0.9311741263285148
D029701 0.9581469223357789
D002540 0.9690305867513501
D012097 0.8811232963628719
D002853 0.8774640219321265
D000078202 0.9878917620895921
### Batch number 50 ###
D002097 0.9241905834915825
D009682 0.934636543440936
D029424 0.949679978432538
D029681 0.980373703811364
D055088 0.9563982977506068
D059305 0.9724427132817433
D011499 0.930



D002448 0.9215532333086148
### Batch number 53 ###
D016778 0.9575946177790898
D012890 0.9682396031111103
D003713 0.9325998580400634




D016229 0.9668747070371257




D010820 0.9597366254586894
D015374 0.9574870384395829
D007371 0.9427167405700446
D010465 0.9455300537742296
D064113 0.9496514754230709
D010957 0.9388127137596106
### Batch number 54 ###
D004272 0.9494155620059799
D001692 0.9499808246338072
D010147 0.9593938999359978
D015397 0.9601204319561057
D010375 0.9575806988052447
D007963 0.9397212553732895
D003140 0.9645460962492549
D004867 0.9606083993963527
D000428 0.9506787465927233




D049268 0.9555727508877304
### Batch number 55 ###
D012111 0.9701328809655045
D017353 0.8829569397912056
D006358 0.9270769983626999
D006258 0.9032804003383881
D002945 0.9317341977379578
D002983 0.983070380163222
D058990 0.9186092568791857
D005355 0.944636929093878
D064368 0.9238712388191395
D012641 0.9632451685931611
### Batch number 56 ###
D000077321 0.9654514961974583
D003710 0.8446934125511169
D009483 0.9652160940621581
D009389 0.9634993638241841
D057134 0.970646460128958
D006291 0.9681915503736865
D016138 0.9685112897019748
D047908 0.9349804664511432
D008027 0.9589298534293398
D007630 0.9326515875955417
### Batch number 57 ###
D010012 0.9456982773904361
D017628 0.9468346697724817
D010446 0.8486332475713765
D014411 0.9127834616531153
D005909 0.8921011324587169
D004789 0.8578262750884141
D018895 0.9775780049962822
D018613 0.831128926711474
D007680 0.9125041519590804
D011859 0.9492338332610734
### Batch number 58 ###
D063990 0.899284570274409
D000255 0.9090118013112276
D005822 0.88161



D030821 0.9679159951594535
D016503 0.9282719032803779
D015800 0.9360952181834726
D012727 0.9286369859911595
D007802 0.9883319987407913
D000890 0.935670470795538
### Batch number 59 ###
D060787 0.952480579690595
D004252 0.9431756631878002
D000203 0.9367619796248059
D029968 0.9837531809811457
D017410 0.889639568078438
D035843 0.8978117419088185
D004938 0.9377343147360305
D017060 0.9414306365072381
D000831 0.9616030556338073
D008959 0.9873603429195867
### Batch number 60 ###
D000086663 0.9776708073088285
D010775 0.9382404549967044
D012559 0.9385704162731232
D017052 0.9769255474991294
D000200 0.9615504229496828
D003001 0.9891161582886939
D024821 0.9681440674540415
D001288 0.9217291457103155
D012725 0.9840347443801182
D019540 0.9006723463921297
### Batch number 61 ###
D005828 0.9854780802890474
D009569 0.879280067634992
D008103 0.9346718075357927
D011203 0.9871227108515239
D061986 0.9533705639133341
D009584 0.9461413526161099
D050197 0.9147021839996453
D003094 0.9025540023175228
D006861 0.8



D005656 0.9684633270221814
### Batch number 67 ###
D060066 0.9091688015494326
D011880 0.993613249816357
D055442 0.9244131619272693
D004279 0.9606113606714652
D011157 0.968868290903856
D016212 0.9333275755555295
D055503 0.934275904236652
D018592 0.9643806561226462
D001749 0.9192524369816182
D012923 0.976998952913282
### Batch number 68 ###
D003704 0.9580969914771709
D005190 0.9364277863294532
D014584 0.9456590807109434
D004357 0.8941261724961802
D005091 0.9222512355983536
D056945 0.9047584970146654
D047368 0.8804779458652956
D016513 0.8862031876881771
D003625 0.9296041055102909
D011993 0.8759673229039917
### Batch number 69 ###
D005191 0.9785554529994596
D009415 0.9810650088522278
D001281 0.9679950382341925
D058186 0.96179088413353
D005081 0.9767113151490896
D005798 0.9715204010460587
D004317 0.978815756159819
D011550 0.9542758158981368
D004435 0.9377754920303479
D000067877 0.9526191668496038
### Batch number 70 ###
D000935 0.9558162319993736
D005865 0.9750873086097618
D007422 0.8817197



D005658 0.942895124655273
D053768 0.9388335442149252
D011295 0.9882781585019172
D009664 0.9432961122558752
### Batch number 72 ###
D037521 0.965811811790298
D013569 0.9496585270660823
D009368 0.9558228912090858
D011248 0.9791480157902548
D017124 0.9418366440559597
D007306 0.968954810107275
D001859 0.9225456657195072
D001842 0.9361174132316108
D008568 0.9534333172838553
D000067128 0.9445836812503889
### Batch number 73 ###
D007182 0.977202174040697
D006439 0.9322690949901492
D003287 0.9421679816400971
D012016 0.8273548085344615
D006442 0.9497324267235874
D001323 0.947171767540817
D001253 0.9726491269274881
D003313 0.9839678976292812
D030361 0.9728074905542327
D020125 0.9533206915138767
### Batch number 74 ###
D009473 0.9808342504958523
D025521 0.9344369943545414
D011134 0.9415785471509235
D004283 0.9486532654897459
D003921 0.9647074796980846
D001823 0.974281842863895
D013203 0.9547465336976971
D018719 0.9644987435931238
D012988 0.9896869770299734
D016638 0.9757291637721524
### Batch num



D011336 0.9215906089026799
D002940 0.9528456984490296
D016036 0.982259705040609
D013816 0.9461425623006361
D005419 0.9479630332085938
D065928 0.9736647495649973
D055808 0.9107018099660398
D001717 0.9359051663403566
D004365 0.9462710808152114
### Batch number 77 ###
D013154 0.9186357863358717
D007694 0.9362333362076773
D017433 0.9595981924848953
D016922 0.9360398058185277
D011044 0.9462886857942255




D056915 0.9396754880282083
D010788 0.9776144586947095




D023421 0.8938120609886232
D010818 0.9864967777631213
D000073336 0.904403278999618
### Batch number 78 ###
D015519 0.9678903005573343




D013499 0.9215245064147964
D011597 0.9830304144775148
D013559 0.9786613893749614
D011897 0.8361809943253318
D002384 0.9591313951761877
D030801 0.9422807338080018
D011518 0.9447316987102077
D019047 0.9884463601187535
D002545 0.9475596960908167
### Batch number 79 ###
D017930 0.9346115697902802
D004353 0.9006830544379252
D030562 0.979781853476692
D061108 0.9862944450826324
D004338 0.9021671413953959
D011014 0.9343924995607866
D007719 0.9756200214204265
D017048 0.9610139746176358
D019966 0.9639051882614071
D005285 0.9706394078241072
### Batch number 80 ###
D018696 0.9060280376655936
D060449 0.9399599236552829
D017397 0.9760920834026864
D019636 0.8748339527023724
D001688 0.9014515399804758
D061307 0.9243100834948784
D015470 0.9420483867863803
D020134 0.9571292776711084
D011787 0.9940891072417068
D061067 0.92102620145349
### Batch number 81 ###
D019943 0.9035973759793631
D000431 0.9246253420259627
D007303 0.9685389109162807
D056186 0.9605660417046431
D008562 0.8303987754310578
D043562 0.861



D015533 0.8779388999104528
D004261 0.8917144870860774
D054875 0.8993694074621607
D004334 0.9662889500360319
D011995 0.9184719153203522
### Batch number 86 ###
D019936 0.9462428950474706
D000852 0.9779516516292066
D014018 0.9335909790454675
D003715 0.9530427032675682
D000906 0.9043214885840405
D002149 0.972250911135341
D063731 0.9821288466031657
D010003 0.9547488880701402
D016753 0.9160123504094055
D016277 0.9782337540824511
### Batch number 87 ###
D003627 0.8983533593840832
D057286 0.9311233423251345
D055786 0.9003636511628802
D004364 0.8698639332742882
D056426 0.9294114616487089
D017281 0.9376637601893639
D017510 0.9628802467427222
D012121 0.9597305202670059
D064593 0.9585575809407461
D003718 0.8871974633507603
### Batch number 88 ###
D009929 0.8951428753704512
D012463 0.9172460337890265
D007166 0.9313791197210797
D014944 0.9399096892530509
D000995 0.9555766036867747
D057185 0.9722967971015682
D000330 0.9632700965846024
D046529 0.9224142146905191
D014026 0.957679697953458
D013314 0.96



D015032 0.937136633651462




D001459 0.9226113915234819
D017404 0.9235804214748622
D000072138 0.9648432766773472
### Batch number 90 ###
D006224 0.8887619877853934
D013482 0.9495193804579047
D016273 0.9496374512424335
D003470 0.9439107949423334
D001290 0.9508015773680809
D003593 0.9190045673125962




D064370 0.9737609725009031
D017132 0.9536897072250785
D013318 0.9887879169620337
D005239 0.939643397860365
### Batch number 91 ###
D055118 0.9517440759074999
D016366 0.9319017777036691
D001294 0.9610248635249372
D025941 0.927387419376364
D013116 0.9492811901541227
D012038 0.9217400350712112
D010349 0.9013020951559714
D015966 0.9604520629540708
D004730 0.940077796501868
D019070 0.9089295390415646
### Batch number 92 ###
D000701 0.968199590616489
D008159 0.9745512343120313
D000081246 0.8733221186190884
D000072283 0.9189116894086201
D007109 0.8461298183079375
D006526 0.9432151908956838
D050397 0.9887551713285632
D011956 0.8367425103740838
D000069600 0.9840648661214685
D011725 0.8471234317034773
### Batch number 93 ###
D004721 0.9239892508700315
D002980 0.9854146664033598
D001854 0.9206326944180017
D003299 0.9882486334116477
D015820 0.8943186435823259
D011743 0.8752343734746987
D000072669 0.9173536363423533
D018380 0.9380170841416999
D047108 0.9409607298170706
D005919 0.9785480121595317
##



D051858 0.889642521626398
D001369 0.9586400274908137
D010353 0.9642621212209095


In [None]:
# for descriptor, clf in descriptor_models.items():
#     print(descriptor, get_mean_xval_score_for_binary_classifier(clf))