In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext line_profiler

import numpy as np
from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass

Disease = candidate_subclass('Disease', ['disease'])

In [2]:
from snorkel.models import CandidateSet

train = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Training Candidates').one()
print len(train)
dev = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates').one()
print len(dev)

28087
27896


In [3]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

L_gold_train = label_manager.load(session, train, "CDR Training Label Set")
print L_gold_train.shape
L_gold_dev = label_manager.load(session, dev, "CDR Development Label Set")
print L_gold_dev.shape

(28087, 1)
(27896, 1)


# Load `L_train` and `L_dev`

In [4]:
%%time
from snorkel.annotations import merge_annotations
from utils import binarize_LF_matrix, get_binarized_score

L_TRAIN_BLOCKS = ['1', '2', '3', '4.1', '4 T2', '4 N', '5', '6']
L_DEV_BLOCKS   = ['1', '2', '3.1', '4', '4 T2', '4 N', '5', '6']

L_train_blocks = [label_manager.load(session, train, 'LF Training Labels %s' % lfn) for lfn in L_TRAIN_BLOCKS]
L_dev_blocks   = [label_manager.load(session, dev, 'LF Development Labels %s' % lfn) for lfn in L_DEV_BLOCKS]

L_train   = merge_annotations(L_train_blocks)
L_train_b = binarize_LF_matrix(L_train)
L_dev     = merge_annotations(L_dev_blocks)
L_dev_b   = binarize_LF_matrix(L_dev)
print L_train.shape
print L_dev.shape

(28087, 1645)
(27896, 1557)
CPU times: user 21.3 s, sys: 712 ms, total: 22 s
Wall time: 21.9 s


# Load `F_train` and `F_dev`

In [5]:
%%time
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

F_train = feature_manager.load(session, train, 'Train Features')
F_dev   = feature_manager.load(session, dev, 'Train Features')
print F_train.shape
print F_dev.shape

(28087, 69885)
(27896, 69885)
CPU times: user 35.4 s, sys: 3.16 s, total: 38.6 s
Wall time: 38.4 s


# Load canonical & secondary dictionaries

In [283]:
from cPickle import load
cd = load(open('cd.pkl', 'rb'))

# Setup 2: DP LR Pre-filtering

In [7]:
from snorkel.learning import LogReg, NaiveBayes

gen_model_b = NaiveBayes()
%time gen_model_b.train(L_train_b, n_iter=10000, rate=1e-1, verbose=False)

train_marginals = gen_model_b.marginals(L_train_b)

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-3, mu=1e-6)

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



Training marginals (!= 0.5):	28087
Features:			1645
CPU times: user 28 s, sys: 28 ms, total: 28 s
Wall time: 28 s
Training marginals (!= 0.5):	20038
Features:			69885
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 13889.283204	Gradient magnitude = 9832.035530
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 3078.241784	Gradient magnitude = 379.275662
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 2894.743086	Gradient magnitude = 1296.907839
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 2494.032915	Gradient magnitude = 297.457588
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 2394.454863	Gradient magnitude = 31.977057
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 2337.136536	Gradient magnitude = 28.324998
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 2295.414531	Gradient magnitude = 25.754725
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 2263.718238	Gradient magnitude = 

In [8]:
yp = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp, L_gold_dev)

P :	0.838600797791
R :	0.687374245473
F1:	0.755494125777


# Setup 2.i: Exact Match

In [None]:
from utils import get_mn_score

N = L_gold_dev.shape[0]

# Check for exact *positive* matches to the canonical dictionary
exact_match_predictions = -1 * np.ones(N)
for i,c in enumerate(dev):
    if yp[i] > 0:
        p    = c.disease.get_span().lower()
        sids = cd.term_to_sids[p]
        cids = set([cd.sid_to_cid[sid] for sid in sids if sid in cd.sid_to_cid])
        if len(cids) > 0:
            exact_match_predictions[i] = list(cids)[0]

get_mn_score(exact_match_predictions, L_gold_dev)

# Setup 2.ii: TF-IDF

In [17]:
%%time
from entity_norm import CanonDictVectorizer 

# Create a vectorizer based around this 
cd_vectorizer = CanonDictVectorizer(cd.term_to_sids, other_phrases=[])

# Vectorize the dictionary
D_pos = cd_vectorizer.vectorize_phrases(cd.pos_terms)
D_pos

|V| = 271512
CPU times: user 47.9 s, sys: 3.52 s, total: 51.5 s
Wall time: 51.1 s


In [None]:
from utils import get_pos_vector_matches
%time best_match = get_pos_vector_matches(dev, cd, cd_vectorizer, D_pos)

In [None]:
predicted = [best_match[c.id][1] if yp[i] > 0 else -1 for i,c in enumerate(dev)]
get_mn_score(predicted, L_gold_dev)

# Setup 2.iii: Using the Gen. Model directly

Only consider the points predicted positive by the discriminative model:

In [None]:
from snorkel.learning.learning_mn import assemble_mn_format, LogReg

Xs, mn_maps, mn_inv_maps, nz_idxs = assemble_mn_format(L_dev, mask=yp)

gen_model = LogReg()
gen_model.train(Xs, n_iter=100, rate=1e-2, w0=np.ones(L_dev.shape[1]))

In [None]:
N_pos_dev = sum([1 for i in range(L_gold_dev.shape[0]) if L_gold_dev[i,0] > 0])
predicted = [mn_inv_maps[i][np.argmax(m)] for i,m in enumerate(gen_model.marginals(Xs))]
get_mn_score(predicted, L_gold_dev[nz_idxs], N_total_pos=N_pos_dev)

# Setup 2.iv: SSI

### Step 1:

We used the discriminative model, trained using the training_marginals from the binary gen model over the training set, to filter both the training and test sets to predicted positives

In [9]:
yp_d_train = disc_model.predict(F_train, b=0.5)
get_binarized_score(yp_d_train, L_gold_train)

P :	0.845507246377
R :	0.753746770026
F1:	0.796994535519


In [10]:
yp_d_dev = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp_d_dev, L_gold_dev)

P :	0.838600797791
R :	0.687374245473
F1:	0.755494125777


### Step 2:

We run the multinomial generative model over the training set

In [None]:
from snorkel.learning.learning_mn import assemble_mn_format, LogReg

Xs, mn_maps, mn_inv_maps, nz_idxs = assemble_mn_format(L_train) #mask=yp_d_train)
gen_model = LogReg()
gen_model.train(Xs, n_iter=100, rate=1e-2, w0=np.ones(L_train.shape[1]))

In [None]:
from utils import get_mn_score

N_pos_train     = sum([1 for i in range(L_gold_train.shape[0]) if L_gold_train[i,0] > 0])
train_marginals = gen_model.marginals(Xs)
predicted       = [mn_inv_maps[i][np.argmax(m)] for i,m in enumerate(train_marginals)]
get_mn_score(predicted, L_gold_train[nz_idxs], N_total_pos=N_pos_train)

In [18]:
X_train = cd_vectorizer.vectorize_phrases([train[i].disease.get_span().lower() for i in nz_idxs])
X_train

<2896x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 5984 stored elements in Compressed Sparse Row format>

## TODO: Why such a big gap between len(tm) and N???

In [20]:
len(train_marginals)

2896

### Step 3:

We construct the training marginals as sparse matrix

In [21]:
from scipy.sparse import lil_matrix

Np = len(train_marginals)
K  = len(set(cd.sid_to_cid.values()))
Y  = lil_matrix((Np, K))

for i in range(Np):
    for j in range(len(train_marginals[i])):
        Y[i, int(mn_inv_maps[i][j])] = train_marginals[i][j]
Y = Y.tocsr()
Y

<2896x4790 sparse matrix of type '<type 'numpy.float64'>'
	with 4556 stored elements in Compressed Sparse Row format>

### Step 4:

We train the SSI model using the training marginals

In [22]:
#cids = [cd.sid_to_cid[list(sid for sid in cd.term_to_sids[t] if sid in cd.sid_to_cid)[0]] for t in cd.pos_terms]

In [219]:
cid_sets = [set([cd.sid_to_cid[sid] for sid in cd.term_to_sids[t] if sid in cd.sid_to_cid]) for t in cd.pos_terms]

In [220]:
from collections import defaultdict
counts = defaultdict(int)
for cid in cids:
    counts[len(cid)] += 1
counts

defaultdict(int, {1: 63394, 2: 395, 3: 4, 7: 1})

In [239]:
from entity_norm import SSIModel

model = SSIModel(D_pos, cid_sets)

In [None]:
%lprun -f model.train model.train(X_train, Y, rate=1e-2, n_iter=5, n_iter_sample=5)

In [251]:
%time model.train(X_train, Y, rate=1e-3, n_iter=1, n_iter_sample=5)

Iteration: 0
	0

d= 1.0
i: 2592
cid: 2692
p: 56562
n: 56554


d= 1.0
i: 2592
cid: 2692
p: 56562
n: 56554


d= 1.0
i: 2592
cid: 2692
p: 56562
n: 56554


d= 1.0
i: 2592
cid: 2692
p: 56562
n: 56554


d= 1.0
i: 2592
cid: 2692
p: 56562
n: 56554


d= 0.803030180745
i: 308
cid: 2765
p: 17189
n: 24022


d= 0.802823511649
i: 308
cid: 2765
p: 17189
n: 24022


d= 0.802616842552
i: 308
cid: 2765
p: 17189
n: 24022


d= 0.802410173455
i: 308
cid: 2765
p: 17189
n: 24022


d= 0.802203504359
i: 308
cid: 2765
p: 17189
n: 24022


d= 0.44542600017
i: 489
cid: 1891
p: 11910
n: 53053


d= 0.444794712447
i: 489
cid: 1891
p: 11910
n: 53053


d= 0.444163424724
i: 489
cid: 1891
p: 11910
n: 53053


d= 0.443532137
i: 489
cid: 1891
p: 11910
n: 53053


d= 0.442900849277
i: 489
cid: 1891
p: 11910
n: 53053


d= 0.501950826921
i: 1878
cid: 2036
p: 13031
n: 60660


d= 0.501262672303
i: 1878
cid: 2036
p: 13031
n: 60660


d= 0.500574517685
i: 1878
cid: 2036
p: 13031
n: 60660


d= 0.499886363068
i: 1878
cid: 2036
p: 13031

KeyboardInterrupt: 

In [353]:
i = 617
x = X_train.getrow(i)
x

<1x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [366]:
cd.cid_to_sids[L_gold_train[nz_idxs][i,0]]

{'D020345'}

In [369]:
L_train[nz_idxs].getrow(i).data

array([ 3848.,   885.,  3848.])

In [371]:
Y[i].data

array([ 0.26812765,  0.73187235])

In [374]:
import random

for it in range(10):
    t = random.random()
    for cid in Y[i].nonzero()[1]:
        t -= Y[i,cid]
        if t < 0:
            break
    print cid
    

3848
3848
3848
3848
3848
885
885
3848
3848
3848


In [354]:
train[nz_idxs[i]].disease.get_span().lower()

u'necrotizing enterocolitis'

In [355]:
for j in x.nonzero()[1]:
    print cd_vectorizer.inv_word_index[j]

necrot
enterocol


In [357]:
CID = 885
cd.cid_to_sids[CID]

{'D004760'}

In [358]:
[cd.sid_to_cid[sid] for sid in cd.cid_to_sids[CID]]

[885]

In [359]:
p  = 5440
tp = D_pos.getrow(p)
for j in tp.nonzero()[1]:
    print cd_vectorizer.inv_word_index[j]

enterocol


In [360]:
n  = 24094
tn = D_pos.getrow(n)
for j in tn.nonzero()[1]:
    print cd_vectorizer.inv_word_index[j]

necrot
enterocol


In [361]:
1 - (x * model.W * tp.T)[0,0] - model.b * (x * tp.T)[0,0] + (x * model.W * tn.T)[0,0] + model.b * (x * tn.T)[0,0] 

1.6135501521893609

In [362]:
[cd.cid_to_sids[cid] for cid in cid_sets[p]]

[{'D004760'}]

In [363]:
[cd.cid_to_sids[cid] for cid in cid_sets[n]]

[{'D020345'}]

In [346]:
W_delta = 1e-3 * (x.T * tp - x.T * tn)

In [347]:
W_delta.nonzero()

(array([ 430,  430,  430,  431,  431,  431, 6082, 6082, 6082], dtype=int32),
 array([227, 430, 431, 227, 430, 431, 227, 430, 431], dtype=int32))

In [348]:
W_delta.data

array([ -3.04324828e-04,  -2.38895810e-04,  -2.96490983e-04,
        -5.40490376e-05,  -4.24286402e-05,  -5.26577224e-05,
         3.85515115e-04,   3.02630405e-04,   3.75591293e-04])

In [324]:
print W_delta[10703, 428]
print cd_vectorizer.inv_word_index[10703]
print cd_vectorizer.inv_word_index[428]

-0.000432747665309
tremor
famili


In [None]:
cd.word_inde

Let's find all dictionary terms...

In [292]:
m          = x * D_pos.T
match_rows = m.nonzero()[1]

In [293]:
row_to_cids = defaultdict(set)
for cid, rows in model.cid_to_rows.iteritems():
    for row in rows:
        row_to_cids[row].add(cid)

In [309]:
for r in match_rows:
    print r, [filter(lambda sid : sid[0] == 'D', list(cd.cid_to_sids[cid])) for cid in row_to_cids[r]]

56592 [['D000742']]
56591 [['D000742']]
56590 [['D040181']]
56589 [['D040181']]
56588 [['D013921']]
56586 [['D061085']]
56584 [['D040181']]
56583 [['D040181']]
56582 [['D040181']]
56581 [['D054868']]
56578 [['D054868']]
56577 [['D013921']]
56576 [['D025063']]
56574 [['D025063']]
56573 [['D013921']]
56572 [['D013921']]
56571 [['D013921']]
56569 [['D013921']]
56568 [['D013921']]
56567 [['D013921']]
56566 [['D013921']]
56565 [['D025063'], ['D013921']]
56564 [['D025063'], ['D013921']]
56562 [['D025063'], ['D013921']]
56560 [['D040181']]
56559 [['D040181']]
56558 [['D040181']]
56557 [['D040181']]
56556 [['D040181']]
56554 [['D040181']]
56547 [['D013921']]
56471 [['D013921']]
53459 [['D013921']]
53458 [['D013921']]
53455 [['D013921']]
53453 [['D013921']]
52337 [['D025861']]
52336 [['D025861']]
51197 [['D061085']]
51196 [['D061085']]
50753 [['D013921']]
50752 [['D013921']]
42014 [['D013921']]
41860 [['D013921']]
41622 [['D002658']]
40837 [['D013921']]
39624 [['D040181']]
39623 [['D040181']]
3

In [295]:
p  = 56562
tp = D_pos.getrow(p)
tp

<1x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [296]:
tp.nonzero()

(array([0], dtype=int32), array([13057], dtype=int32))

In [297]:
for j in tp.nonzero()[1]:
    print cd_vectorizer.inv_word_index[j]

thrombocytopenia


The CID links to a **C** entry which links to _D013921 Thrombocytopenia_...

**cd.cid_to_sid should be cd.cid_to_sids...!**  How could this be affecting performance?

In [298]:
n  = 56554
tn = D_pos.getrow(n)
tn

<1x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [299]:
tn.nonzero()

(array([0], dtype=int32), array([13057], dtype=int32))

In [300]:
for j in tn.nonzero()[1]:
    print cd_vectorizer.inv_word_index[j]

thrombocytopenia


In [301]:
cd.term_to_sids['thrombocytopenia']

{'D013921'}

In [302]:
[cd.cid_to_sids[cid] for cid in model.cid_sets[p]]

[{'C535296',
  'C535362',
  'C535449',
  'C535484',
  'C535486',
  'C535733',
  'C535918',
  'C536305',
  'C536519',
  'C536580',
  'C536682',
  'C536796',
  'C536798',
  'C536801',
  'C536803',
  'C536987',
  'C537635',
  'C537636',
  'C537639',
  'C537641',
  'C537643',
  'C537763',
  'C537804',
  'C537810',
  'C537813',
  'C537818',
  'C538036',
  'C538087',
  'C538105',
  'C538288',
  'C538309',
  'C538356',
  'C563051',
  'C563382',
  'C563482',
  'C564508',
  'C565001',
  'C565160',
  'C565370',
  'C566980',
  'C567439',
  'C567810',
  'C579849',
  'C579850',
  'C579873',
  'C580205',
  'D025063'},
 {'C535507',
  'C535544',
  'C535643',
  'C535982',
  'C536380',
  'C536899',
  'C536944',
  'C537831',
  'C538176',
  'C564051',
  'C564237',
  'C564771',
  'C565328',
  'C566000',
  'C566060',
  'C567438',
  'C567487',
  'C567747',
  'D013921'}]

In [304]:
[cd.cid_to_sids[cid] for cid in model.cid_sets[n]]

[{'C535281',
  'C535331',
  'C535338',
  'C535380',
  'C535532',
  'C535559',
  'C535576',
  'C535941',
  'C535975',
  'C535981',
  'C536029',
  'C536078',
  'C536122',
  'C536159',
  'C536181',
  'C536197',
  'C536358',
  'C536359',
  'C536366',
  'C536410',
  'C536424',
  'C536426',
  'C536497',
  'C536533',
  'C536643',
  'C536671',
  'C536692',
  'C536703',
  'C536757',
  'C536761',
  'C536857',
  'C536970',
  'C537102',
  'C537125',
  'C537149',
  'C537179',
  'C537241',
  'C537315',
  'C537340',
  'C537409',
  'C537466',
  'C537849',
  'C537921',
  'C538057',
  'C538165',
  'C538326',
  'C538336',
  'C538337',
  'C538362',
  'C538388',
  'C538399',
  'C538416',
  'C538613',
  'C562478',
  'C562515',
  'C562664',
  'C562782',
  'C562829',
  'C563023',
  'C563065',
  'C563124',
  'C563134',
  'C563156',
  'C563238',
  'C563670',
  'C564041',
  'C564042',
  'C564048',
  'C564049',
  'C564050',
  'C564052',
  'C564053',
  'C564054',
  'C564056',
  'C564065',
  'C564066',
  'C564072',

In [262]:
ps = model.cid_to_rows[CID]
p in ps

True

In [263]:
n in ps

False

In [73]:
ns = list(set(range(model.T)).difference(ps))

In [76]:
Z = model.D * x.T
Z

<63794x1 sparse matrix of type '<type 'numpy.float64'>'
	with 388 stored elements in Compressed Sparse Row format>

In [78]:
matches = Z.data.argsort()

In [79]:
Zn = Z[ns]
Zn

<63774x1 sparse matrix of type '<type 'numpy.float64'>'
	with 378 stored elements in Compressed Sparse Row format>

In [80]:
Zn = Zn.tocoo()

In [86]:
tn = D_pos.getrow(ns[Zn.row[Zn.data.argmax()]])
tn

<1x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [87]:
for j in tn.nonzero()[1]:
    print cd_vectorizer.inv_word_index[j]

depend
cocain


### More debugging below...

In [95]:
I = sparse.identity(X_dev.shape[1], format='csr')
I

<271513x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 271513 stored elements in Compressed Sparse Row format>

In [231]:
X_dev = cd_vectorizer.vectorize_phrases([c.disease.get_span().lower() for i,c in enumerate(dev) if yp_d_dev[i] > 0])

<3259x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 7448 stored elements in Compressed Sparse Row format>

In [232]:
X_train_labels = np.array([L_gold_train[i,0] for i in nz_idxs])

In [247]:
predicted = 0
correct   = 0
for i in range(X_train.shape[0]):
    yp = model.predict(X_train.getrow(i))
    if yp is not None:
        predicted += 1
        
        # NOTE: we assume we're doing disambiguation perfectly here!!!
        if X_train_labels[i] in yp:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(N_pos_train)
f1     = (2 * prec * recall) / (prec +  recall)
print "P:\t", prec
print "R:\t", recall
print "F1:\t", f1

P:	0.757251381215
R:	0.566666666667
F1:	0.64824120603


In [248]:
predicted = 0
correct   = 0
for i in range(X_train.shape[0]):
    yp = model.predict(X_train.getrow(i), W=I)
    if yp is not None:
        predicted += 1
        
        # NOTE: we assume we're doing disambiguation perfectly here!!!
        if X_train_labels[i] in yp:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(N_pos_train)
f1     = (2 * prec * recall) / (prec +  recall)
print "P:\t", prec
print "R:\t", recall
print "F1:\t", f1

P:	0.75717744725
R:	0.565633074935
F1:	0.647537346546


In [235]:
X_dev_labels = np.array([L_gold_dev[i,0] for i in range(len(dev)) if yp_d_dev[i] > 0])

In [249]:
N_pos_dev = sum([1 for i in range(L_gold_dev.shape[0]) if L_gold_dev[i,0] > 0])

predicted = 0
correct   = 0
for i in range(X_dev.shape[0]):
    yp = model.predict(X_dev.getrow(i))
    if yp is not None:
        predicted += 1
        
        # NOTE: we assume we're doing disambiguation perfectly here!!!
        if X_dev_labels[i] in yp:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(N_pos_dev)
f1     = (2 * prec * recall) / (prec +  recall)
print "P:\t", prec
print "R:\t", recall
print "F1:\t", f1

P:	0.687674635206
R:	0.557092555332
F1:	0.615534250382


In [250]:
N_pos_dev = sum([1 for i in range(L_gold_dev.shape[0]) if L_gold_dev[i,0] > 0])

predicted = 0
correct   = 0
for i in range(X_dev.shape[0]):
    yp = model.predict(X_dev.getrow(i), W=I, b=0.0)
    if yp is not None:
        predicted += 1
        
        # NOTE: we assume we're doing disambiguation perfectly here!!!
        if X_dev_labels[i] in yp:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(N_pos_dev)
f1     = (2 * prec * recall) / (prec +  recall)
print "P:\t", prec
print "R:\t", recall
print "F1:\t", f1

P:	0.690721649485
R:	0.556086519115
F1:	0.616134875296


### Debugging...

# How much does disambiguation matter??

In [195]:
incorrect = []
for i in range(X_dev.shape[0]):
    if i % 100 == 0:
        print i
    yp = model.predict(X_dev.getrow(i))
    if yp is not None:
        if X_dev_labels[i] not in yp:
            incorrect.append(i)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200


In [196]:
len(incorrect)

1080

In [102]:
len(incorrectorrect)

1133

In [208]:
ic = incorrect[0]

In [209]:
x = X_dev.getrow(ic)
x

<1x271513 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [210]:
for j in x.nonzero()[1]:
    print cd_vectorizer.inv_word_index[j]

parkinson


In [226]:
cd.cid_to_sid[X_dev_labels[ic]]

'D010302'

In [212]:
cd.cid_to_sid[model.predict(x)]

'C537177'

In [224]:
cd.sid_to_cid['C537177']

3896

In [227]:
model.cid_to_rows[X_dev_labels[ic]].intersection(model.cid_to_rows[3896])

set()

In [228]:
model.cid_to_rows[3896]

{24611,
 24612,
 24613,
 24614,
 24615,
 24616,
 24617,
 24618,
 24619,
 24620,
 24621,
 24622,
 24623,
 24624,
 24625,
 24626,
 24627,
 24628,
 24629,
 24630,
 24631,
 24632,
 24633,
 24634,
 24635,
 24636,
 24637,
 24638,
 24639,
 24640,
 24641,
 24642,
 24643,
 24644,
 24645,
 24646,
 24647,
 24648,
 24649,
 45610,
 45611,
 45612,
 45613,
 45614,
 45615,
 45616,
 45617,
 45618,
 51730,
 51745,
 51756,
 51757,
 51758,
 52102,
 52103,
 59020,
 59021,
 59022,
 59697,
 59698,
 59699,
 60319,
 60320,
 60929,
 60930,
 61510,
 61511,
 61512,
 61513,
 62117,
 62714,
 63334}

In [190]:
s = x * model.W * model.D.T
s = s.tocoo()

In [191]:
s.data

array([ 0.0011054 ,  0.0011054 ,  0.0011054 , ...,  0.30884806,
        0.30884806,  0.30884806])

### TF-IDF baseline...

In [200]:
from scipy import sparse
I = sparse.identity(X_train.shape[1], format='csr')

predicted = -1 * np.ones(X_train.shape[0])
for i in range(X_train.shape[0]):
    yp = model.predict(X_train.getrow(i), W=I, b=0.0)
    if yp is not None and yp > 0:
        predicted[i] = yp

get_mn_score(predicted, X_train_labels, N_total_pos=N_pos_train)

P :	0.747838118298
R :	0.558656330749
F1:	0.639550362372


In [201]:
from scipy import sparse
I = sparse.identity(X_dev.shape[1], format='csr')

predicted = -1 * np.ones(X_dev.shape[0])
for i in range(X_dev.shape[0]):
    yp = model.predict(X_dev.getrow(i), W=I, b=0.0)
    if yp is not None and yp > 0:
        predicted[i] = yp

get_mn_score(predicted, X_dev_labels, N_total_pos=N_pos_dev)

P :	0.676038737894
R :	0.544265593561
F1:	0.603037480842


In [336]:
len(cd.pos_terms)

63794

In [337]:
len(set(cd.pos_terms))

63794