### Load feature assignment

In [1]:
import pickle
## lemma assignment
# lemma_assignment is looked up per feature key in vectorizeFeatures and its
# value is added to a column offset (presumably lemma -> cluster id; verify
# against the script that produced the pickle).
# NOTE(security): pickle.load can execute arbitrary code. Only load files that
# were produced by this project.
with open("clusterAssignmen100.pk", "rb") as f:
    lemma_assignment = pickle.load(f)

category = ['Alt', 'Center', 'Left', 'Right']
## tag, entity assignment
# Collect the union of tag / entity names seen in any category.
tag = set()
entity = set()
for cat in category:
    with open(cat + "entity.pk", "rb") as f:
        entity.update(pickle.load(f))
    with open(cat + "tag.pk", "rb") as f:
        tag.update(pickle.load(f))
# Sort before numbering: the iteration order of a set of strings depends on
# per-process hash randomization, so list(set) could assign different column
# indices on every kernel restart and make saved matrices incomparable.
# Assumes the names are mutually comparable (e.g. all str) - TODO confirm.
tag = sorted(tag)
entity = sorted(entity)
tag_assignment = {name: idx for idx, name in enumerate(tag)}
entity_assignment = {name: idx for idx, name in enumerate(entity)}

In [2]:
# Sanity check: number of entries in the tag, entity and lemma lookup tables.
tuple(len(table) for table in (tag_assignment, entity_assignment, lemma_assignment))

(46, 18, 155407)

In [3]:
import numpy as np
from time import time
# Number of features = 46 + 18 + 100 = 164 (46 tags, 18 entities, 100 lemma clusters).
# The optional sentence-length feature (which would make it 165) is not included:
# sentence length is only used to normalise the other counts.
n_feats = 164
def vectorizeFeatures(data, tag_map=None, entity_map=None, lemma_map=None, n_features=None):
    """
    Turn per-sentence feature counts into a dense, length-normalised matrix.

    data: list of dict{feature: number of feature}; every dict must contain a
          "LENGTH" entry, which is used as the normaliser
    tag_map, entity_map, lemma_map: dict{feature: index inside its own block};
          default to the notebook-level tag_assignment, entity_assignment and
          lemma_assignment
    n_features: total number of columns; defaults to the notebook-level n_feats
    return np.array(n_valid, n_features); rows whose "LENGTH" is 0 are dropped

    Column layout is [tags | entities | lemma clusters]. The block offsets are
    derived from the sizes of tag_map and entity_map (this assumes their values
    are the indices 0..len-1), so the layout stays correct if the tag or entity
    vocabulary changes size.

    Raises ValueError if the tag and entity blocks alone do not fit into
    n_features, and KeyError if a datum has no "LENGTH" entry.
    Prints the elapsed wall-clock time in seconds.
    """
    start = time()

    # Fall back to the notebook globals so existing one-argument calls still work.
    tag_map = tag_assignment if tag_map is None else tag_map
    entity_map = entity_assignment if entity_map is None else entity_map
    lemma_map = lemma_assignment if lemma_map is None else lemma_map
    n_features = n_feats if n_features is None else n_features

    entity_offset = len(tag_map)
    lemma_offset = entity_offset + len(entity_map)
    if lemma_offset > n_features:
        raise ValueError(
            "n_features=%d is too small for %d tags and %d entities"
            % (n_features, len(tag_map), len(entity_map))
        )

    n_data = len(data)
    feats = np.zeros((n_data, n_features))
    keep = np.zeros(n_data, dtype=bool)
    for i, datum in enumerate(data):
        sentence_len = datum["LENGTH"]
        if sentence_len == 0:
            # Nothing to normalise by; the row is excluded from the result.
            continue
        row = feats[i]  # view into feats, so writes below land in the matrix
        for key, count in datum.items():
            # Precedence tag > entity > lemma; keys in no table are ignored.
            if key in tag_map:
                row[tag_map[key]] = count / sentence_len
            elif key in entity_map:
                row[entity_offset + entity_map[key]] = count / sentence_len
            elif key in lemma_map:
                # Several lemmas can share one cluster, hence accumulate.
                row[lemma_offset + lemma_map[key]] += count / sentence_len
        keep[i] = True
    print(time()-start)
    return feats[keep]

### Too much data (2 million): store a small, equally sized sample for each category

In [4]:
def smallData(data, category, size, seed=None):
    """
    Save a random sample of at most `size` rows of `data` with np.save under
    the name category + "npSmall" (np.save appends the ".npy" extension).

    data: np.array (or sequence) of rows; it is left unmodified
    category: prefix of the output file name
    size: maximum number of rows to keep
    seed: optional int; when given, the same rows are selected on every call
    """
    rows = np.asarray(data)
    # Without a seed, draw from the global NumPy RNG as the in-place shuffle did.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Permute row indices instead of shuffling `data` itself, so the caller's
    # array keeps its order and only the selected rows are copied.
    picked = rng.permutation(len(rows))[:size]
    np.save(category + "npSmall", rows[picked])

### Vectorize data

In [5]:
# Number of rows kept in each category's down-sampled file.
small_size = 100000
for cat in category:
    # Read the pickled per-sentence feature dicts for this category.
    source_name = cat + "feats.pk"
    with open(source_name, "rb") as fh:
        data = pickle.load(fh)
    print("Loaded data: ", cat)

    # Convert the dicts into a dense feature matrix.
    feats = vectorizeFeatures(data)
    print("Vectorized data: ", cat)

    # Persist the full matrix first, then derive the down-sampled copy from it.
    large_name = cat + "npLarge"
    np.save(large_name, feats)
    print("Stored np large: ", cat)

    smallData(feats, cat, small_size)
    print("Stored np small", cat)

Loaded data:  Alt
7.888857841491699
Vectorized data:  Alt
Stored np large:  Alt
Stored np small Alt
Loaded data:  Center
23.311243057250977
Vectorized data:  Center
Stored np large:  Center
Stored np small Center
Loaded data:  Left
26.818711042404175
Vectorized data:  Left
Stored np large:  Left
Stored np small Left
Loaded data:  Right
18.181073904037476
Vectorized data:  Right
Stored np large:  Right
Stored np small Right
