In [1]:
%matplotlib inline
import psycopg2
import psycopg2.extras
import numpy
from matplotlib import pyplot

In [2]:
# Read the database login from a private file so that it never appears in the notebook.
# The first line of that file is expected to hold "<user> <password>", whitespace separated.
with open("/global/homes/r/raknop/secrets/decatdb_decat_decat") as ifp:
    (user, passwd) = ifp.readline().strip().split()
# Hand the connection parameters to psycopg2 as keywords instead of formatting them into
# a DSN string by hand: psycopg2 then takes care of the quoting, so a password that
# contains a quote or a backslash cannot corrupt the connection string.
# DictCursor makes every fetched row indexable by column name.
db = psycopg2.connect(dbname='decat', user=user, password=passwd, host='decatdb.lbl.gov',
                      cursor_factory=psycopg2.extras.DictCursor)

In [3]:
# Histogram of how heavily each object has been vetted.
# The inner query counts the scanscore rows for each object_id ("num"); the outer
# query then counts how many objects share each value of "num" ("count").
q = (
    'SELECT num,COUNT(num) as count '
    'FROM ( SELECT COUNT(id) AS num FROM scanscore GROUP BY object_id ) AS subquery '
    'GROUP BY num ORDER BY num'
)
cursor = db.cursor()
cursor.execute(q)
rows = cursor.fetchall()
# First column: number of scores an object received; second column: number of such objects.
print( f'{"#":3s}  {"# reviewed":6s}' )
for row in rows:
    print( f'{row["num"]:3d}  {row["count"]:6d}' )


#    # reviewed
  1   14191
  2   12507
  3    3293
  4     389
  5      21
  6       1


In [4]:
# Let's check consistency. Somebody really clever with SQL could probably do this all in one query,
#  but it's a tangled way to do it, and it's easier to see the logic just doing it in python.
# I'm going to start by just getting all of the vetting data.
#
# db.rollback() discards any transaction still pending on the connection before the
#  new query is issued.
# ORDER BY object_id keeps all of the scores for one object adjacent in scanscore.
# scanscore ends up as the full list of rows (one per score), each indexable by column name.
db.rollback()
q = 'SELECT * FROM scanscore ORDER BY object_id'
cursor = db.cursor()
cursor.execute(q)
scanscore = cursor.fetchall()

In [5]:
# We want to group the objects by number of good votes and number of bad votes.
#
# Step 1: count the votes for each object, object_id -> [ngood, nbad].
# Keying on object_id (rather than watching for the id to change from one row to the
# next) guarantees that every object is tallied, including the very last one in
# scanscore, and does not depend on the rows being sorted.
# Any goodbad value other than "good" is counted as a bad vote.
objvotes = {}
for row in scanscore:
    counts = objvotes.setdefault(row['object_id'], [0, 0])
    if row['goodbad'] == "good":
        counts[0] += 1
    else:
        counts[1] += 1

# Step 2: histogram the objects by their vote split.
# ngoodbad[ngood][nbad] is the number of objects with exactly ngood good votes and
# nbad bad votes; maxgoodbad is the largest single count seen on either side.
ngoodbad = {}
maxgoodbad = 0
for ngood, nbad in objvotes.values():
    bynbad = ngoodbad.setdefault(ngood, {})
    bynbad[nbad] = bynbad.get(nbad, 0) + 1
    maxgoodbad = max(maxgoodbad, ngood, nbad)

# Step 3: report every vote split that involves at least two votes, ordered by
# ngood and then nbad.  An object is "fully consistent" when all of its votes agree,
# i.e. one of the two counts is zero.
print( 'good  bad      # objects' )
nfullyconsistent = 0
nmultiplerated = 0
for ngood in sorted(ngoodbad):
    for nbad in sorted(ngoodbad[ngood]):
        if ngood + nbad <= 1:
            continue
        nobjects = ngoodbad[ngood][nbad]
        nmultiplerated += nobjects
        if ( ngood == 0 ) or ( nbad == 0 ):
            nfullyconsistent += nobjects
        print( f'{ngood:4d}  {nbad:3d}      {str(nobjects):6s}' )
print( f'# multiply rated:   {nmultiplerated}' )
if nmultiplerated > 0:
    print( f'# fully consistent: {nfullyconsistent}  ({nfullyconsistent/nmultiplerated:.3f})' )
else:
    # No object has more than one vote, so the consistent fraction is undefined.
    print( f'# fully consistent: {nfullyconsistent}  (n/a)' )

good  bad      # objects
   0    2      10270 
   0    3      2528  
   0    4      289   
   0    5      12    
   1    1      1437  
   1    2      399   
   1    3      37    
   1    4      4     
   2    0      800   
   2    1      239   
   2    2      26    
   3    0      127   
   3    1      25    
   3    2      2     
   4    0      12    
   4    1      1     
   4    2      1     
   5    0      2     
# multiply rated:   16211
# fully consistent: 14040  (0.866)


In [6]:
# It occurs to me that the interactive nature of jupyter scripts encourages us to write
#  bigass long single routines rather than functions.  Watch me buck the trend. 
# (Of course, the formatting is still *terrible* for actual code as opposed to
# an interactive script, but, whatever.)

def statsforuser(user, scores=None, maxnumusers=6):
    """Print how one user's good/bad votes line up with everybody else's votes.

    For every object that `user` scored, count how many *other* users called it
    good and how many called it bad.  Two histograms of those counts are printed:
    one row for the objects `user` called good, one for the objects `user` called bad.
    Column i of the left half is the number of objects on which exactly i other
    users said good; column i of the right half is the same for bad.

    Parameters
    ----------
    user : str
        Username whose votes are compared against the rest.
    scores : iterable of rows, optional
        Vote rows, each indexable by 'object_id', 'username' and 'goodbad'.
        Defaults to the notebook-level `scanscore`.
    maxnumusers : int, optional
        The table always shows columns 0..maxnumusers (default 6).  It is widened
        automatically when some object has more votes from other users than that.

    Returns
    -------
    None.  The table is written to stdout.

    Notes
    -----
    * Votes are grouped by object_id, so every object is tallied (including the
      last one in `scores`) and the rows do not need to be sorted.
    * A vote from another user whose goodbad is anything other than 'good' is
      counted as bad.
    * If `user` has both a good and a bad vote on the same object, that object is
      reported in the "good" row only.
    """
    if scores is None:
        scores = scanscore

    # object_id -> [this user's verdict (or None), # others said good, # others said bad]
    perobject = {}
    for row in scores:
        entry = perobject.setdefault(row['object_id'], [None, 0, 0])
        if row['username'] == user:
            if row['goodbad'] == 'good':
                entry[0] = 'good'            # good takes precedence over bad
            elif row['goodbad'] == 'bad' and entry[0] is None:
                entry[0] = 'bad'
        elif row['goodbad'] == 'good':
            entry[1] += 1
        else:
            entry[2] += 1

    # Only objects that this user actually scored contribute to the table.
    rated = [ entry for entry in perobject.values() if entry[0] is not None ]

    # Size the histograms from the data so that an unusually popular object cannot
    # index past the end of the arrays.
    ncols = 1 + max( [maxnumusers] + [ max(entry[1], entry[2]) for entry in rated ] )
    usergoodotherbad = numpy.zeros(ncols, dtype=int)
    usergoodothergood = numpy.zeros(ncols, dtype=int)
    userbadotherbad = numpy.zeros(ncols, dtype=int)
    userbadothergood = numpy.zeros(ncols, dtype=int)
    for verdict, nothergood, notherbad in rated:
        if verdict == 'good':
            usergoodotherbad[notherbad] += 1
            usergoodothergood[nothergood] += 1
        else:
            userbadotherbad[notherbad] += 1
            userbadothergood[nothergood] += 1

    def _cells(values):
        # One right-aligned, 6-wide cell per value, each preceded by a space.
        return ''.join( f' {value:6d}' for value in values )

    gap = '    '
    print( f'\n\nObjects ranked by {user}' )
    print( f'User score      # others said good                    # others said bad' )
    print( '        ' + _cells(range(ncols)) + gap + _cells(range(ncols)) )
    print( ' good   ' + _cells(usergoodothergood) + gap + _cells(usergoodotherbad) )
    print( ' bad    ' + _cells(userbadothergood) + gap + _cells(userbadotherbad) )

# (I also am really annoyed by jupyter scripts because I have to use the web browser
#  as a text editor instead of the one that I know very well.  I guess this is 
#  why we have cut and paste....)

# OK, all of this was so that we could check consistency by user.

# Fetch every username in the scanusers table (alphabetically) and print the
#  agreement table for each of them.
# This reuses the `cursor` object that an earlier cell created.
q = 'SELECT username FROM scanusers ORDER BY username'
cursor.execute(q)
users = [ row['username'] for row in cursor.fetchall() ]

# NOTE: the loop variable rebinds the notebook-level name `user`, which the
#  connection cell also uses for the database login name.
for user in users:
    statsforuser(user)




Objects ranked by alexis
User score      # others said good                    # others said bad
              0      1      2      3      4      5      6          0      1      2      3      4      5      6
 good       948    522    114     11      2      0      0       1157    377     60      2      1      0      0
 bad      13573    820    110      9      1      0      0       5754   6446   2029    272     12      0      0


Objects ranked by anna
User score      # others said good                    # others said bad
              0      1      2      3      4      5      6          0      1      2      3      4      5      6
 good         0      0      0      0      0      0      0          0      0      0      0      0      0      0
 bad          0      0      0      0      0      0      0          0      0      0      0      0      0      0


Objects ranked by autumn
User score      # others said good                    # others said bad
              0      1      2      3   

In [7]:
# You can also do a comparison between two users
def compareusers( user1, user2, scores=None ):
    """Print a 2x2 table cross-tabulating the good/bad votes of two users.

    Only objects scored by both users contribute.  The columns are user1's verdict
    and the rows are user2's verdict.

    Parameters
    ----------
    user1, user2 : str
        Usernames to compare.
    scores : iterable of rows, optional
        Vote rows, each indexable by 'object_id', 'username' and 'goodbad'.
        Defaults to the notebook-level `scanscore`.

    Returns
    -------
    None.  The table is written to stdout.

    Notes
    -----
    If user2 has both a good and a bad vote on the same object, that object is
    counted in the "good" row only.  Votes whose goodbad is neither 'good' nor
    'bad' are ignored.
    """
    if scores is None:
        scores = scanscore

    # One pass over the votes: username -> verdict -> set of object ids.
    votes = { name: { 'good': set(), 'bad': set() } for name in ( user1, user2 ) }
    for row in scores:
        byverdict = votes.get( row['username'] )
        if byverdict is not None and row['goodbad'] in byverdict:
            byverdict[ row['goodbad'] ].add( row['object_id'] )
    user1good, user1bad = votes[user1]['good'], votes[user1]['bad']
    user2good, user2bad = votes[user2]['good'], votes[user2]['bad']

    # user2's "good" wins when user2 voted both ways on the same object.
    user2badonly = user2bad - user2good
    n1good2good = len( user1good & user2good )
    n1good2bad = len( user1good & user2badonly )
    n1bad2good = len( user1bad & user2good )
    n1bad2bad = len( user1bad & user2badonly )

    print(  '---------------------------------' )
    print( f'|   {user1:>10s}|  good  |   bad  |' )
    print( f'|{user2:<10s}   |        |        |' )
    print( f'|-------------|--------|--------|' )
    print( f'| good        | {n1good2good:6d} | {n1bad2good:6d} |' )
    print( f'|-------------|--------|--------|' )
    print( f'| bad         | {n1good2bad:6d} | {n1bad2bad:6d} |' )
    print(  '---------------------------------' )

# Example: cross-tabulate the votes of rknop (columns) against those of penugent (rows).
compareusers('rknop', 'penugent' )

---------------------------------
|        rknop|  good  |   bad  |
|penugent     |        |        |
|-------------|--------|--------|
| good        |     12 |      2 |
|-------------|--------|--------|
| bad         |     10 |     95 |
---------------------------------
