In [1]:
%matplotlib inline
import matplotlib as mpl;
import matplotlib.pyplot as plt;

In [2]:
import numpy as np;
import gzip;
import StringIO;

def parse_header_of_csv(csv_str):
    """Split the CSV header line into feature names and label names.

    The header is expected to have the layout
    ``timestamp,<feature columns>,label:<name>,...,label_source``.

    Args:
        csv_str: Text of the CSV file; only its first line is inspected.

    Returns:
        A tuple ``(feature_names, label_names)``. ``feature_names`` are the
        columns between ``timestamp`` and the first ``label:`` column.
        ``label_names`` are the label columns with the ``label:`` prefix
        removed; the list is empty when the header has no ``label:`` column.

    Raises:
        AssertionError: if the first column is not ``timestamp``, the last
            column is not ``label_source``, or a column without the
            ``label:`` prefix appears among the label columns.
    """
    label_prefix = 'label:'

    # Isolate the headline columns. Splitting on the first newline (instead of
    # index('\n')) also works for a header-only string without a trailing
    # newline; rstrip drops the '\r' left behind by Windows line endings.
    headline = csv_str.split('\n', 1)[0].rstrip('\r')
    columns = headline.split(',')

    # The first column should be timestamp:
    assert columns[0] == 'timestamp'
    # The last column should be label_source:
    assert columns[-1] == 'label_source'

    # Search for the column of the first label. Default to the position of
    # label_source so that a header without any label column yields an empty
    # label list instead of an unbound variable.
    first_label_ind = len(columns) - 1
    for (ci, col) in enumerate(columns):
        if col.startswith(label_prefix):
            first_label_ind = ci
            break

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind]

    # Then come the labels, till the one-before-last column. Only the leading
    # prefix is removed, so a 'label:' substring later in a name is preserved.
    label_names = []
    for label in columns[first_label_ind:-1]:
        assert label.startswith(label_prefix)
        label_names.append(label[len(label_prefix):])

    return (feature_names, label_names)

def parse_body_of_csv(csv_str,n_features):
    """Parse the numeric body of the CSV into features, labels and timestamps.

    Args:
        csv_str: Text of the CSV file, including the header line (which is
            skipped).
        n_features: Number of feature columns that follow the timestamp column.

    Returns:
        A tuple ``(X, Y, M, timestamps)``:
        X -- float matrix of the feature columns, one row per record.
        Y -- boolean matrix of the label columns; True where the value is > 0.
             Missing (NaN) labels are reported as False here.
        M -- boolean matrix, True where the label value is NaN (missing).
        timestamps -- integer array taken from the first column.
    """
    # Read the entire CSV body into a single numeric matrix. loadtxt accepts
    # a list of lines, so no StringIO wrapper is needed. ndmin=2 keeps a file
    # with a single record as a 1-row matrix; without it the result is 1-D and
    # the column slicing below raises IndexError.
    full_table = np.loadtxt(csv_str.splitlines(), delimiter=',', skiprows=1, ndmin=2)

    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:, 0].astype(int)

    # Read the sensor features:
    X = full_table[:, 1:(n_features + 1)]

    # Read the binary label values, and the 'missing label' indicators.
    # The last column (label_source) is excluded.
    trinary_labels_mat = full_table[:, (n_features + 1):-1]  # expected values: 0., 1. or NaN
    M = np.isnan(trinary_labels_mat)  # M is the missing label matrix
    Y = np.where(M, 0, trinary_labels_mat) > 0.  # Y is the label matrix

    return (X, Y, M, timestamps)

def read_user_data(uuid, data_dir='ExtraSensory.per_uuid_features_labels'):
    """Read the data (precomputed sensor-features and labels) for a user.

    This function assumes the user's data file is present.

    Args:
        uuid: Identifier of the user; the file read is
            ``<data_dir>/<uuid>.features_labels.csv.gz``.
        data_dir: Directory holding the per-user gzipped CSV files. Defaults
            to the directory name this notebook has always used.

    Returns:
        A tuple ``(X, Y, M, timestamps, feature_names, label_names)`` combining
        the results of parse_body_of_csv and parse_header_of_csv.
    """
    user_data_file = '%s/%s.features_labels.csv.gz' % (data_dir, uuid)

    # Read the entire csv file of the user:
    with gzip.open(user_data_file, 'rb') as fid:
        csv_str = fid.read()

    # The parsers work on text. On Python 2 the bytes read above already are
    # `str`, so this is a no-op there; on Python 3 it converts bytes to text.
    if not isinstance(csv_str, str):
        csv_str = csv_str.decode('utf-8')

    (feature_names, label_names) = parse_header_of_csv(csv_str)
    n_features = len(feature_names)
    (X, Y, M, timestamps) = parse_body_of_csv(csv_str, n_features)

    return (X, Y, M, timestamps, feature_names, label_names)

In [3]:
def get_label_pretty_name(label):
    """Return a display name for a raw label string.

    A fixed set of labels has hand-written display names. Every other label
    is reformatted: a trailing underscore becomes ')', a double underscore
    becomes ' (', remaining underscores become spaces, everything after the
    first character is lower-cased, and 'i m' is rewritten as "I'm".
    """
    special_names = {
        'FIX_walking': 'Walking',
        'FIX_running': 'Running',
        'LOC_main_workplace': 'At main workplace',
        'OR_indoors': 'Indoors',
        'OR_outside': 'Outside',
        'LOC_home': 'At home',
        'FIX_restaurant': 'At a restaurant',
        'OR_exercise': 'Exercise',
        'LOC_beach': 'At the beach',
        'OR_standing': 'Standing',
        'WATCHING_TV': 'Watching TV',
    }
    if label in special_names:
        return special_names[label]

    # A trailing underscore is turned into a closing parenthesis.
    pretty = (label[:-1] + ')') if label.endswith('_') else label

    # Double underscores open a parenthesis; single ones become spaces.
    pretty = pretty.replace('__', ' (')
    pretty = pretty.replace('_', ' ')

    # Keep the first character untouched and lower-case the remainder.
    head, tail = pretty[0], pretty[1:]
    pretty = head + tail.lower()

    return pretty.replace('i m', 'I\'m')

In [4]:
import glob
# Build the list of user UUIDs from the data file names: drop the directory
# prefix and the file suffix from every path that glob finds.
uuid_list = [
    path.replace('ExtraSensory.per_uuid_features_labels/','').replace('.features_labels.csv.gz','')
    for path in glob.glob("ExtraSensory.per_uuid_features_labels/*.gz")
]
print(uuid_list)


['A7599A50-24AE-46A6-8EA6-2576F1011D81', '78A91A4E-4A51-4065-BDA7-94755F0BB3BB', '11B5EC4D-4133-4289-B475-4E737182A406', 'CDA3BBF7-6631-45E8-85BA-EEB416B32A3C', 'CF722AA9-2533-4E51-9FEB-9EAC84EE9AAC', '5152A2DF-FAF3-4BA8-9CA9-E66B32671A53', 'A76A5AF5-5A93-4CF2-A16E-62353BB70E8A', '665514DE-49DC-421F-8DCB-145D0B2609AD', '83CF687B-7CEC-434B-9FE8-00C3D5799BE6', '27E04243-B138-4F40-A164-F40B60165CF3', '1538C99F-BA1E-4EFB-A949-6C7C47701B20', '5EF64122-B513-46AE-BCF1-E62AAC285D2C', '7D9BB102-A612-4E2A-8E22-3159752F55D8', '33A85C34-CFE4-4732-9E73-0A7AC861B27A', '74B86067-5D4B-43CF-82CF-341B76BEA0F4', '3600D531-0C55-44A7-AE95-A7A38519464E', '9DC38D04-E82E-4F29-AB52-B476535226F2', '5119D0F8-FCA8-4184-A4EB-19421A40DE0D', '7CE37510-56D0-4120-A1CF-0E23351428D2', '1DBB0F6F-1F81-4A50-9DF4-CD62ACFA4842', '1155FF54-63D3-4AB2-9863-8385D0BD0A13', '61359772-D8D8-480D-B623-7C636EAD0C81', '24E40C4C-A349-4F9F-93AB-01D00FB994AF', 'B9724848-C7E2-45F4-9B3F-A1F38D864495', '0A986513-7828-4D53-AA1F-E02D6DF9561B',

In [5]:
import json
import os
import errno

# Export every record of every user as a JSON file
#   60TESTERS/extrasensory.labels.<first 8 chars of uuid>/<timestamp>.server_predictions.json
# holding the pretty label names and a 1.0/0.0 value per label.
count = 0

for uuid in uuid_list:
    (X,Y,M,timestamps,feature_names,label_names) = read_user_data(uuid)

    modified_labels = [get_label_pretty_name(name) for name in label_names]

    # All files of a user go into the same directory, so create it once per
    # user instead of re-checking it for every timestamp.
    dirname = "extrasensory.labels." + str(uuid[0:8])
    user_dir = "60TESTERS/" + dirname
    if not os.path.exists(user_dir):
        try:
            os.makedirs(user_dir)
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # Remember the last file written for the progress message below; stays
    # None when the user has no records.
    fullname = None
    for i in range(len(timestamps)):
        filename = str(timestamps[i]) + ".server_predictions.json"
        fullname = user_dir + "/" + filename

        data = {}
        data['label_names'] = modified_labels
        # Use row i of Y, i.e. the labels of THIS timestamp (the previous code
        # always read row 0). True -> 1.0, False -> 0.0; labels that are
        # missing in the source data are False in Y and so exported as 0.0.
        data['label_probs'] = Y[i].astype(float).tolist()

        with open(fullname, "w") as f:
            json.dump(data, f)

    count = count + 1
    print(count)
    print("finished!" + (fullname if fullname is not None else user_dir))

1
finished!60TESTERS/extrasensory.labels.A7599A50/1447307917.server_predictions.json
2
finished!60TESTERS/extrasensory.labels.78A91A4E/1449809384.server_predictions.json
3
finished!60TESTERS/extrasensory.labels.11B5EC4D/1440612571.server_predictions.json
4
finished!60TESTERS/extrasensory.labels.CDA3BBF7/1442078275.server_predictions.json
5
finished!60TESTERS/extrasensory.labels.CF722AA9/1448236775.server_predictions.json
6
finished!60TESTERS/extrasensory.labels.5152A2DF/1443647130.server_predictions.json
7
finished!60TESTERS/extrasensory.labels.A76A5AF5/1443557065.server_predictions.json
8
finished!60TESTERS/extrasensory.labels.665514DE/1449782489.server_predictions.json
9
finished!60TESTERS/extrasensory.labels.83CF687B/1450117058.server_predictions.json
10
finished!60TESTERS/extrasensory.labels.27E04243/1448996921.server_predictions.json
11
finished!60TESTERS/extrasensory.labels.1538C99F/1445982789.server_predictions.json
12
finished!60TESTERS/extrasensory.labels.5EF64122/1442671439.s