## Process WikiCorp Dataset

In [1]:
import inflect,os,sys
from utils.sparse_utils import saveSparseHDF5
from utils.misc import savePickle
# Shared inflect engine; later cells call p.singular_noun() on vocabulary words.
p = inflect.engine()

In [2]:
import numpy as np
from utils.sparse_utils import loadSparseHDF5

In [3]:
from load import loadDataset
# Load the two word-pair datasets; each entry is read below through its 'w1' and 'w2' fields.
# NOTE(review): presumably the SCWS and WordSim-353 similarity benchmarks - confirm in load.py.
dset1 = loadDataset('scws')
dset2 = loadDataset('wordsim353')
def wParse(k):
    """Normalise a word: lower-case it and drop every hyphen."""
    lowered = k.lower()
    return ''.join(lowered.split('-'))
vocab_emb = set([wParse(dset1[k]['w1']) for k in dset1]+[wParse(dset1[k]['w2']) for k in dset1]+
            [wParse(dset2[k]['w1']) for k in dset2]+[wParse(dset2[k]['w2']) for k in dset2])
print len(vocab_emb),' words needed'

1728  words needed


In [4]:
#Load wikicorp vocab
assert os.path.exists('./wikicorp/WestburyLab.wikicorp.201004.feat'),'Feature file not found'
with open('./wikicorp/WestburyLab.wikicorp.201004.feat','r') as f:
    vocab = [k.strip().split(' ')[0] for k in f.readlines()]
print len(vocab)
vocab_arr = np.array(vocab)

2002178


In [5]:
#The number of singular nouns (flags->flag)
vlist  = []
w2idx  = {}

for idx,v in enumerate(vocab):
    sv = p.singular_noun(v)
    if sv:
        vlist.append(sv)
        w2idx[sv] = idx
    else:
        vlist.append(v)
        w2idx[v]  = idx
    
vocab_singular_only = set(vlist)
vocab_singular_list = np.array(vlist)
print vocab_singular_list.shape

(2002178,)


In [6]:
#Find all the words they map to
from_vocab = []
from_vocab_singular = []
absent     = []
mapIdToIdx = {}
for w in vocab_emb:
    if w in vocab:
        idxlst_vocab = np.where(w==vocab_arr)[0].tolist()
        assert len(idxlst_vocab)==1,'nsd'
        mapIdToIdx[w] = 'v_'+str(idxlst_vocab[0])
        from_vocab  += idxlst_vocab
    elif w in vocab_singular_only:
        idxlst = np.where(w==vocab_singular_list)[0].tolist()
        assert len(idxlst)==1,'nsd'
        from_vocab_singular+=idxlst
        mapIdToIdx[w] = 's_'+str(idxlst[0])
    else:
        print w, 'not found'
        absent.append(w)

sincere not found
y2k not found


In [7]:
print len(set(from_vocab))
print len(set(from_vocab_singular))
print len(set(from_vocab+from_vocab_singular))
idx_to_preserve = from_vocab+from_vocab_singular
print 'Preserving ',len(idx_to_preserve),' words from original vocab'

1685
41
1726
Preserving  1726  words from original vocab


In [8]:
data = loadSparseHDF5('dataset','./wikicorp/WestburyLab.wikicorp.201004.h5')
counts = np.array(data.sum(0)).squeeze().astype(int)
print counts.shape

(2002178,)


In [9]:
print 'See how frequent the words are: ',[counts[k] for k in idx_to_preserve]
print counts.shape

See how frequent the words are:  [134510, 8532, 5960, 25216, 21430, 77653, 89122, 3586, 7055, 2641, 709, 2844, 4144, 39408, 7604, 60102, 3123, 16880, 20582, 1098892, 28459, 186, 2019, 14728, 19096, 12402, 12261, 16900, 19883, 636, 260350, 1585, 179269, 130, 693372, 280182, 7872, 4843, 17934, 13380, 230, 2047, 209, 69426, 44931, 44785, 16110, 167, 159633, 47484, 15591, 12112, 15338, 3014, 5481, 18992, 29290, 1145, 97226, 18363, 128330, 750, 42214, 604425, 44959, 152258, 65368, 1184, 240337, 51960, 56446, 479, 80501, 3258, 20517, 2005, 73778, 115301, 611633, 24360, 4124, 242, 240635, 2483, 2498, 2719, 11381, 18910, 468070, 25041, 111156, 76147, 68646, 73687, 156833, 83327, 27332, 3673, 3153, 3526, 29239, 2703, 27031, 32546, 353363, 331148, 14626, 6877, 104185, 15731, 277, 2596, 46590, 352497, 52133, 43097, 26912, 247041, 15695, 5905, 77843, 120330, 253756, 2488, 24513, 47286, 64723, 82903, 89497, 142326, 14351, 24898, 20584, 103968, 22601, 2408, 624405, 8264, 36973, 35253, 18495, 509, 40

In [10]:
MAXVOCAB   = 20000
sorted_idx = list(set(np.argsort(counts)[-MAXVOCAB:].tolist() + idx_to_preserve))
print np.sort(counts[sorted_idx])

print  len(sorted_idx),np.max(sorted_idx),np.min(sorted_idx),len(vocab)
#Add vectors corresponding to embedding words

[      8      20      20 ..., 1166540 1237259 1939615]
20254 1999962 184 2002178


In [11]:
#Check absentees
aidx = []
for idx in idx_to_preserve: 
    if idx not in sorted_idx:
        aidx.append(idx)
print len(aidx),len(idx_to_preserve)

0 1726


In [12]:
#Number of words we're double counting ~ 4k should be OK
subset_w = [vocab[i] for i in sorted_idx]
dblct    = []
for w in subset_w:
    if p.singular_noun(w) in subset_w:
        dblct.append(w)
print len(dblct)

3492


In [13]:
# Keep only the selected vocabulary columns: convert to CSC for the column
# selection, then back to CSR for the row indexing done in later cells.
data_subset = data.tocsc()[:,sorted_idx].tocsr()

In [14]:
# Vocabulary words for the kept columns, in the same order as the columns of data_subset
features_subset = [vocab[k] for k in sorted_idx]

In [15]:
# Singularised counterpart of features_subset (same length and order); a word is
# kept unchanged whenever singular_noun returns a falsy value for it.
features_subset_singular = [p.singular_noun(w) or w for w in features_subset]

In [16]:
# Rebind both word lists as numpy arrays so the next cell can compare a word
# against every entry at once (w==features_subset) and use np.where on the result.
features_subset          = np.array(features_subset)
features_subset_singular = np.array(features_subset_singular)

In [17]:
#Check that all the embeddings are availdble here (Except 3)
#Find all the words they map to
mapIdx = {}
for w in vocab_emb:
    if w in features_subset:
        idxlst_vocab = np.where(w==features_subset)[0].tolist()
        assert len(idxlst_vocab)==1,'nsd'
        mapIdx[w] = idxlst_vocab[0]
    elif w in features_subset_singular:
        idxlst = np.where(w==features_subset_singular)[0].tolist()
        assert len(idxlst)==1,'nsd'
        mapIdx[w] = idxlst[0]
    else:
        print w, 'not found'

sincere not found
y2k not found


In [20]:
#csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
# Largest single entry and the shape of the column-restricted matrix
print data_subset.max(),data_subset.shape
# Per-row maximum entry, i.e. the highest count of any single kept word in each row.
# NOTE(review): the names `doccts` / `minlen` suggest a document-length filter, which
# would be a row sum (`sum(1)`) rather than a row max - confirm which one is intended.
doccts = data_subset.max(1).toarray().squeeze()
# Keep only the rows whose maximum entry exceeds 5
docs_keep_idx = np.where(doccts>5)[0]
print docs_keep_idx.shape

data_subset_minlen = data_subset[docs_keep_idx]
print data_subset_minlen.shape

2702.0 (3035070, 20254)
(1213781,)
(1213781, 20254)


In [None]:
#Restrict the documents
np.random.seed(1)
shufidx = np.random.permutation(data_subset_minlen.shape[0])

test_idx    = shufidx[:10000]
valid_idx   = shufidx[10000:12000]
train_idx   = shufidx[12000:]

TRAIN = data_subset_minlen[train_idx]
VALID = data_subset_minlen[valid_idx]
TEST  = data_subset_minlen[test_idx]
print TRAIN.shape, VALID.shape, TEST.shape

In [None]:
cts_train = np.array(TRAIN.sum(0)).squeeze()
for k in mapIdx.values():
    if cts_train[k]<2:
        print features_subset[k],features_subset_singular[k],cts_train[k]

In [None]:
# Remove outputs from a previous run before writing. This uses os.remove instead
# of shelling out to `rm -rf`, so it needs no shell and fails loudly if a path
# unexpectedly turns out to be a directory.
# NOTE(review): presumably needed because saveSparseHDF5 adds to an existing
# file (all three splits go into the same data.h5) - confirm in utils.sparse_utils.
for stale_path in ('./wikicorp/data.h5', './wikicorp/misc.pkl'):
    # lexists also catches dangling symlinks, which `rm -rf` would have removed
    if os.path.lexists(stale_path):
        os.remove(stale_path)
saveSparseHDF5(TRAIN, 'train', './wikicorp/data.h5')
saveSparseHDF5(VALID, 'valid', './wikicorp/data.h5')
saveSparseHDF5(TEST,  'test' , './wikicorp/data.h5')
# Word -> column map plus the plain and singularised feature names for the columns
savePickle([mapIdx,features_subset,features_subset_singular],'./wikicorp/misc.pkl')