## Process WikiCorp Dataset

In [None]:
import inflect,os,sys
from utils.sparse_utils import saveSparseHDF5
from utils.misc import savePickle
# Shared inflect engine; the cells below call p.singular_noun(...) to map plural nouns to their singular form.
p = inflect.engine()

In [None]:
import numpy as np
from utils.sparse_utils import loadSparseHDF5

In [None]:
from load import loadDataset
# Word-pair datasets: every entry is read below as dset[k]['w1'] / dset[k]['w2'].
# NOTE(review): presumably the SCWS and WordSim-353 word-similarity sets -- confirm against load.py.
dset1 = loadDataset('scws')
dset2 = loadDataset('wordsim353')
def wParse(k):
    """Normalize a word: lowercase it and drop every hyphen."""
    lowered = k.lower()
    # Splitting on '-' and re-joining removes all hyphens.
    return ''.join(lowered.split('-'))
vocab_emb = set([wParse(dset1[k]['w1']) for k in dset1]+[wParse(dset1[k]['w2']) for k in dset1]+
            [wParse(dset2[k]['w1']) for k in dset2]+[wParse(dset2[k]['w2']) for k in dset2])
print len(vocab_emb),' words needed'

In [None]:
#Load wikicorp vocab
assert os.path.exists('./wikicorp/WestburyLab.wikicorp.201004.feat'),'Feature file not found'
with open('./wikicorp/WestburyLab.wikicorp.201004.feat','r') as f:
    vocab = [k.strip().split(' ')[0] for k in f.readlines()]
print len(vocab)
vocab_arr = np.array(vocab)

In [None]:
#The number of singular nouns (flags->flag)
vlist  = []
w2idx  = {}

for idx,v in enumerate(vocab):
    sv = p.singular_noun(v)
    if sv:
        vlist.append(sv)
        w2idx[sv] = idx
    else:
        vlist.append(v)
        w2idx[v]  = idx
    
vocab_singular_only = set(vlist)
vocab_singular_list = np.array(vlist)
print vocab_singular_list.shape

In [None]:
#Find all the words they map to
from_vocab = []
from_vocab_singular = []
absent     = []
mapIdToIdx = {}
for w in vocab_emb:
    if w in vocab:
        idxlst_vocab = np.where(w==vocab_arr)[0].tolist()
        assert len(idxlst_vocab)==1,'nsd'
        mapIdToIdx[w] = 'v_'+str(idxlst_vocab[0])
        from_vocab  += idxlst_vocab
    elif w in vocab_singular_only:
        idxlst = np.where(w==vocab_singular_list)[0].tolist()
        assert len(idxlst)==1,'nsd'
        from_vocab_singular+=idxlst
        mapIdToIdx[w] = 's_'+str(idxlst[0])
    else:
        print w, 'not found'
        absent.append(w)

In [None]:
print len(set(from_vocab))
print len(set(from_vocab_singular))
print len(set(from_vocab+from_vocab_singular))
idx_to_preserve = from_vocab+from_vocab_singular
print 'Preserving ',len(idx_to_preserve),' words from original vocab'

In [None]:
data = loadSparseHDF5('dataset','./wikicorp/WestburyLab.wikicorp.201004.h5')
counts = np.array(data.sum(0)).squeeze().astype(int)
print counts.shape

In [None]:
print 'See how frequent the words are: ',[counts[k] for k in idx_to_preserve]
print counts.shape

In [None]:
MAXVOCAB   = 20000
sorted_idx = list(set(np.argsort(counts)[-MAXVOCAB:].tolist() + idx_to_preserve))
print np.sort(counts[sorted_idx])

print  len(sorted_idx),np.max(sorted_idx),np.min(sorted_idx),len(vocab)
#Add vectors corresponding to embedding words

In [None]:
#Check absentees
aidx = []
for idx in idx_to_preserve: 
    if idx not in sorted_idx:
        aidx.append(idx)
print len(aidx),len(idx_to_preserve)

In [None]:
#Number of words we're double counting ~ 4k should be OK
subset_w = [vocab[i] for i in sorted_idx]
dblct    = []
for w in subset_w:
    if p.singular_noun(w) in subset_w:
        dblct.append(w)
print len(dblct)

In [None]:
# Keep only the columns listed in sorted_idx: convert to CSC for the column selection,
# then back to CSR for the row indexing done further down.
data_subset = data.tocsc()[:,sorted_idx].tocsr()

In [None]:
# Vocabulary word for every retained column, in the same order as sorted_idx
features_subset = [vocab[col] for col in sorted_idx]

In [None]:
# Singular form of every retained word; singular_noun yields a falsy value for
# non-plurals, in which case the word itself is kept.
features_subset_singular = [p.singular_noun(w) or w for w in features_subset]

In [None]:
# Convert both word lists to arrays so that `w==features_subset` in the lookup below
# compares elementwise and can be fed to np.where.
features_subset          = np.array(features_subset)
features_subset_singular = np.array(features_subset_singular)

In [None]:
#Check that all the embeddings are availdble here (Except 3)
#Find all the words they map to
mapIdx = {}
for w in vocab_emb:
    if w in features_subset:
        idxlst_vocab = np.where(w==features_subset)[0].tolist()
        assert len(idxlst_vocab)==1,'nsd'
        mapIdx[w] = idxlst_vocab[0]
    elif w in features_subset_singular:
        idxlst = np.where(w==features_subset_singular)[0].tolist()
        assert len(idxlst)==1,'nsd'
        mapIdx[w] = idxlst[0]
    else:
        print w, 'not found'

In [None]:
#csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
print data_subset.max(),data_subset.shape
doccts = data_subset.max(1).toarray().squeeze()
docs_keep_idx = np.where(doccts>5)[0]
print docs_keep_idx.shape

data_subset_minlen = data_subset[docs_keep_idx]
print data_subset_minlen.shape

In [None]:
# Sorted per-row totals of the filtered matrix (bare expression: shown as the notebook cell's output)
np.sort(np.array(data_subset_minlen.sum(1)).squeeze())

In [None]:
#Restrict the documents
np.random.seed(1)
shufidx = np.random.permutation(data_subset_minlen.shape[0])

test_idx    = shufidx[:10000]
valid_idx   = shufidx[10000:12000]
train_idx   = shufidx[12000:]

TRAIN = data_subset_minlen[train_idx]
VALID = data_subset_minlen[valid_idx]
TEST  = data_subset_minlen[test_idx]
print TRAIN.shape, VALID.shape, TEST.shape
print np.sort(np.array(TRAIN.sum(1)).squeeze()).astype(int), np.sort(np.array(VALID.sum(1)).squeeze()), np.sort(np.array(TEST.sum(1)).squeeze())

In [None]:
cts_train = np.array(TRAIN.sum(0)).squeeze()
for k in mapIdx.values():
    if cts_train[k]<2:
        print features_subset[k],features_subset_singular[k],cts_train[k]

In [None]:
# Remove any previous outputs, then write the three splits into one HDF5 file and the
# word<->column metadata into a pickle.
# NOTE(review): the removal presumably guards against saveSparseHDF5 adding to an existing file -- confirm.
os.system('rm -rf ./wikicorp/data.h5 ./wikicorp/misc.pkl')
for split_name, split in (('train', TRAIN), ('valid', VALID), ('test', TEST)):
    saveSparseHDF5(split, split_name, './wikicorp/data.h5')
misc = [mapIdx,features_subset,features_subset_singular]
savePickle(misc,'./wikicorp/misc.pkl')