In [1]:
from collections import defaultdict
from copy import deepcopy
import os
"""
Saves exercises in the format (num_exercises \n exercises \n labels \n, num_exercises \n exercises \n labels \n, ...),
where each group of three rows holds the exercises of one user.
"""
# The source file contains 1213 users in total.
max_users = 1213

# Raw inputs: training instances, dev instances and the labels for the dev instances.
train_file = 'fr_en.slam.20171218.train'
test_data_file = 'fr_en.slam.20171218.dev'
test_labels_file = 'fr_en.slam.20171218.dev.key'

# Outputs: three rows (count, exercise ids, labels) per user.
train_formatted_file = 'formatted_data_train_w.csv'
test_formatted_file = 'formatted_data_test_w.csv'

# Reduced exercises (kept small for one-hotting) are mapped to consecutive
# integer ids; exercise_i is the next free id.
exercise_map = dict()
exercise_i = 0

# Running totals for progress and summary output.
num_train_exercises = num_train_instances = 0
num_test_exercises = num_test_instances = 0

# Finished per-user records, each of the form [count, exercise_ids, labels].
train_exercise_tuples = []
test_exercise_tuples = []

# Record currently being filled for the user whose lines are being read.
train_exercise_tuple = [0, [], []]
test_exercise_tuple = [0, [], []]

# Scratch values for the instance being processed.
train_label = test_label = '-1'
train_exercise = test_exercise = ''

# users: number of instance lines seen per user.
# included_users: users whose training record was kept.
users = defaultdict(int)
included_users = set()

# Parser state carried from line to line.
exercise_type = cur_user = prev_user = ''

print('formatting training exercises...')
# Builds one [count, exercise_ids, labels] record per user from the training file.
# Assumes all exercises of a user are contiguous in the file, so a change of the
# user field in a '#' header marks the end of the previous user's record.
with open(train_file, 'rt') as f:
    for line in f:
        line = line.strip()
        # A blank line terminates the current exercise.
        if len(line) == 0:
            num_train_exercises += 1
            if num_train_exercises % 100000 == 0:
                print('Loaded ' + str(num_train_instances) + ' instances across ' + str(num_train_exercises) + ' exercises...')

        # A line starting with '#' is the header of a new exercise.
        elif line[0] == '#':
            cur_user = line.split()[1]
            if cur_user != prev_user:
                # Close the previous user's record *before* any instance of the
                # new user is appended, so that no exercise is attributed to
                # the wrong user. Only records with fewer than 500 instances
                # are kept.
                if 0 < train_exercise_tuple[0] < 500:
                    # No copy needed: the name is rebound to a fresh list below.
                    train_exercise_tuples.append(train_exercise_tuple)
                    included_users.add(prev_user)
                train_exercise_tuple = [0, [], []]
                prev_user = cur_user
            exercise_parameters = line[2:].split()
            # session is at index 4 and format at index 5
            exercise_type = exercise_parameters[5]

        # Any other line is one instance of the current exercise.
        else:
            users[cur_user] += 1
            fields = line.split()
            train_label = int(fields[-1])
            # The reduced exercise key is the exercise format plus fields 2-4 of
            # the instance. The token (field 1) is deliberately left out: using
            # tokens caused OOM with over 5000 distinct exercises.
            train_exercise = exercise_type + ','.join(fields[2:5])
            if train_exercise not in exercise_map:
                exercise_map[train_exercise] = exercise_i
                exercise_i += 1
            train_exercise_tuple[0] += 1
            train_exercise_tuple[1].append(str(exercise_map[train_exercise]))
            train_exercise_tuple[2].append(str(train_label))
            num_train_instances += 1

# No further header follows the last user, so flush that record explicitly.
if 0 < train_exercise_tuple[0] < 500:
    train_exercise_tuples.append(train_exercise_tuple)
    included_users.add(cur_user)
train_exercise_tuple = [0, [], []]

# Summary of the training pass.
print('formatting done')
print('number of exercises: ', num_train_exercises)
print('total instances:', num_train_instances)
print('n users', len(users))
# users holds the number of instance lines per user. default=0 keeps the
# summary from raising ValueError when no instance was loaded at all.
print('max n exercises by user', max(users.values(), default=0))
print('distinct reduced exercises:', len(exercise_map))
# Number of per-user records that were kept for the training output.
print('train data users:', len(train_exercise_tuples))

print('writing formatted training data...')
# Remove a stale output file first; a missing file is not an error.
try:
    os.remove(train_formatted_file)
except OSError:
    pass
# The context manager guarantees that buffered rows are flushed and the handle
# is closed before anything else reads the file.
with open(train_formatted_file, 'w') as f:
    # Three rows per user: instance count, exercise ids, labels.
    for tup in train_exercise_tuples:
        f.write(str(tup[0]) + '\n')
        f.write(','.join(tup[1]) + '\n')
        f.write(','.join(tup[2]) + '\n')
print('data written to', train_formatted_file)

print('formatting test data...')
# Builds one [count, exercise_ids, labels] record per user from the test data,
# reading the label of each instance from the parallel labels file.
# Assumes all exercises of a user are contiguous in the file, so a change of the
# user field in a '#' header marks the end of the previous user's record.
with open(test_data_file) as f1, open(test_labels_file) as f2:
    for line in f1:
        line = line.strip()
        # A blank line terminates the current exercise.
        if len(line) == 0:
            num_test_exercises += 1
            if num_test_exercises % 100000 == 0:
                print('Loaded ' + str(num_test_instances) + ' instances across '
                      + str(num_test_exercises) + ' exercises...')

        # A line starting with '#' is the header of a new exercise.
        elif line[0] == '#':
            cur_user = line.split()[1]
            if cur_user != prev_user:
                # Close the previous user's record *before* any instance of the
                # new user is appended. A record is kept only if it has fewer
                # than 500 instances and the user was kept in the training data.
                if 0 < test_exercise_tuple[0] < 500 and prev_user in included_users:
                    # No copy needed: the name is rebound to a fresh list below.
                    test_exercise_tuples.append(test_exercise_tuple)
                test_exercise_tuple = [0, [], []]
                prev_user = cur_user
            exercise_parameters = line[2:].split()
            # session is at index 4 and format at index 5
            exercise_type = exercise_parameters[5]

        # Any other line is one instance of the current exercise.
        else:
            users[cur_user] += 1
            fields = line.split()
            # The labels file has one line per instance, in the same order.
            test_instance_label = f2.readline().split()
            # Explicit check instead of assert: it must also run under -O and
            # must cover a labels file that ends early.
            if len(test_instance_label) < 2 or fields[0] != test_instance_label[0]:
                raise ValueError('label file out of sync with test data at instance ' + fields[0])
            test_label = test_instance_label[1]
            # Same reduced key as for training; the token (field 1) is left out
            # because using tokens caused OOM with over 5000 distinct exercises.
            test_exercise = exercise_type + ','.join(fields[2:5])
            if test_exercise not in exercise_map:
                # Exercises never seen in training have no id and are skipped;
                # their label line has already been consumed above.
                continue
            test_exercise_tuple[0] += 1
            test_exercise_tuple[1].append(str(exercise_map[test_exercise]))
            test_exercise_tuple[2].append(test_label)
            num_test_instances += 1

# No further header follows the last user, so flush that record explicitly.
if 0 < test_exercise_tuple[0] < 500 and cur_user in included_users:
    test_exercise_tuples.append(test_exercise_tuple)
test_exercise_tuple = [0, [], []]

print('test data formatted')
print('test data users:', len(test_exercise_tuples))
print('writing test data...')
# Remove a stale output file first; a missing file is not an error.
try:
    os.remove(test_formatted_file)
except OSError:
    pass
# The context manager guarantees that buffered rows are flushed and the handle
# is closed; a bare open() here would keep the file open for the rest of the
# session and could leave it incomplete on disk.
with open(test_formatted_file, 'w') as f:
    # Three rows per user: instance count, exercise ids, labels.
    for tup in test_exercise_tuples:
        f.write(str(tup[0]) + '\n')
        f.write(','.join(tup[1]) + '\n')
        f.write(','.join(tup[2]) + '\n')
print('test data written to', test_formatted_file)
# Show a few reduced exercise keys as a sanity check.
print(list(exercise_map)[:5])

formatting training exercises...
Loaded 285973 instances across 100000 exercises...
Loaded 567856 instances across 200000 exercises...
Loaded 850511 instances across 300000 exercises...
formatting done
number of exercises:  326792
total instances: 926657
n users 1213
max n exercises by user 7676
distinct reduced exercises: 1719
train data users: 481
writing formatted training data...
data written to formatted_data_train_w.csv
formatting test data...
test data formatted
test data users: 476
writing test data...
test data written to formatted_data_test_w.csv
['format:reverse_translateDET,Definite=Def|Gender=Masc|Number=Sing|fPOS=DET++,det', 'format:reverse_translateNOUN,Gender=Masc|Number=Sing|fPOS=NOUN++,ROOT', 'format:reverse_translatePRON,Number=Sing|Person=1|PronType=Prs|fPOS=PRON++,nsubj', 'format:reverse_translateVERB,Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|fPOS=VERB++,cop', 'format:reverse_translateDET,Definite=Ind|Gender=Fem|Number=Sing|PronType=Dem|fPOS=DET++,det']