In [1]:
from collections import defaultdict
from copy import deepcopy
import os
import numpy as np
"""
saves exercises in format (num_exercises \n exercises \n labels \n, num_exercises \n exercises \n labels \n, ...) 
where a tuple of three rows are the exercises of one user 
"""
# total users in file: 1213
max_users = 2000
max_exercises = 800
min_exercises = 0
train_file = 'fr_en.slam.20171218.train'
test_data_file = 'fr_en.slam.20171218.dev'
test_labels_file = 'fr_en.slam.20171218.dev.key'

train_formatted_file = 'formatted_data_train_.csv'
test_formatted_file = 'formatted_data_test_.csv'

exercise_map = {}
exercise_i = 0
num_train_exercises, num_train_instances, num_test_exercises, num_test_instances = 0, 0, 0, 0
# reduced exercises for one-hotting
train_exercise_tuples, test_exercise_tuples = [], []
train_exercise_tuple, test_exercise_tuple = [0, [], []], [0, [], []]
train_label, train_exercise, test_label, test_exercise = '-1', '', '-1', ''
users = defaultdict(int)
included_users = set()
exercise_type = ''
cur_user = ''
prev_user = ''
prev_pos = ''
word_counts = defaultdict(int)
print('formatting training exercises...')

def word_len(word):
    n = len(word)
    if n < 4: return 'short'
    if n < 7: return 'medium'
    if n < 10: return 'med_long'
    else: return 'long'

    # Compute word counts
with open(train_file, 'rt') as f:
    for line in f:
        line = line.strip()
        # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
        if len(line) == 0:
            continue
        # If the line starts with #, then we're beginning a new exercise
        elif line[0] == '#':
            continue
        else:
            line = line.split()
            word_counts[line[1]] += 1

max_word_count = np.max(list(word_counts.values()))
freqs = defaultdict(int)
def word_freq(token):
    frequency = word_counts[token.lower()] / max_word_count
    if frequency < .001: freqs['<.001'] += 1; return 'very_rare'
    if frequency < .005: freqs['<.005'] += 1; return 'rare'
    if frequency < .01: freqs['<.01'] += 1; return 'medium_rare'
    if frequency < .05: freqs['<.05'] += 1; return 'semi_rare'
    if frequency < .1: freqs['<.1'] += 1;return 'quite_common'
    if frequency < .5: freqs['<.5'] += 1; return 'common'
    else: freqs['>=.5'] += 1; return 'very_common'


with open(train_file, 'rt') as f:
    for line in f:
        line = line.strip()
        # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
        n_ex = train_exercise_tuple[0]
        if len(line) == 0:
            if n_ex > max_exercises or (cur_user != prev_user and n_ex > 0):
                if n_ex > min_exercises: 
                    train_exercise_tuples.append(deepcopy(train_exercise_tuple))
                    included_users.add(prev_user)
                train_exercise_tuple = [0, [], []]
            num_train_exercises += 1
            prev_user = cur_user
            if num_train_exercises % 100000 == 0:
                print('Loaded ' + str(num_train_instances) + ' instances across ' + str(num_train_exercises) + ' exercises...')

        # If the line starts with #, then we're beginning a new exercise
        elif line[0] == '#':
            cur_user = line.split()[1]
            if num_train_exercises == 0: prev_user = cur_user
            exercise_parameters = line[2:].split()
            # session is at index 4 and format at index 5
            #exercise_type = exercise_parameters[4] + ', ' + exercise_parameters[5]
            exercise_type = exercise_parameters[5]
            prev_pos = ''
        else:
            users[cur_user] += 1
            line = line.split()
            train_label = int(line[-1])
            #train_exercise = exercise_type + ','.join(line[2:3])
            #train_exercise = exercise_type + line[1] + prev_pos 
            train_exercise = exercise_type + ','.join(line[2:4]) + prev_pos 
            #train_exercise = exercise_type + ','.join(line[2:4]) + prev_pos + word_freq(line[1]) + word_len(line[1])
            #train_exercise = exercise_type + ','.join(line[2:4]) + word_freq(line[1]) + word_len(line[1])
            prev_pos = line[2]
            if train_exercise not in exercise_map:
                exercise_map[train_exercise] = exercise_i
                exercise_i += 1
            train_exercise_tuple[0] += 1
            train_exercise_tuple[1].append(str(exercise_map[train_exercise]))
            train_exercise_tuple[2].append(str(train_label))
            num_train_instances += 1

print('formatting done')
print('number of exercises: ', num_train_exercises)
print('total instances:', num_train_instances)
print('n users', len(users))
print('max n exercises by user', max(users.values()))
print('distinct reduced exercises:', len(exercise_map))
print('train data users:', len(train_exercise_tuples))

print('writing formatted training data...')
try: os.remove(train_formatted_file)
except OSError: pass
f = open(train_formatted_file, 'w')
for tup in train_exercise_tuples[:max_users]:
    f.write(str(tup[0]) + '\n')
    f.write(','.join(tup[1]) + '\n')
    f.write(','.join(tup[2]) + '\n')
print('data written to', train_formatted_file)

print('formatting test data...')
with open(test_data_file) as f1, open(test_labels_file) as f2:
    for line in f1:
        line = line.strip()
        # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
        if len(line) == 0:
            if prev_user != cur_user:
                test_exercise_tuples.append(deepcopy(test_exercise_tuple))
                test_exercise_tuple = [0, [], []]    
            num_test_exercises += 1
            prev_user = cur_user
            if num_test_exercises % 100000 == 0:
                print('Loaded ' + str(num_test_instances) + ' instances across '
                      + str(num_test_exercises) + ' exercises...')

        # If the line starts with #, then we're beginning a new exercise
        elif line[0] == '#':
            cur_user = line.split()[1]
            if num_test_exercises == 0: prev_user = cur_user
            exercise_parameters = line[2:].split()
            # session is at index 4 and format at index 5
            #exercise_type = exercise_parameters[4] + ', ' + exercise_parameters[5]
            exercise_type = exercise_parameters[5]
            prev_pos=''
        else:
            users[cur_user] += 1
            line = line.split()
            test_instance_label = f2.readline().split()
            assert line[0] == test_instance_label[0]
            test_label = test_instance_label[1]
            test_exercise = exercise_type + ','.join(line[2:4]) + prev_pos # morph features
            #test_exercise = exercise_type + line[1] + prev_pos # word tokens
            #test_exercise = exercise_type + ','.join(line[2:4]) + prev_pos + word_freq(line[1]) + word_len(line[1])
            #test_exercise = exercise_type + ','.join(line[2:4]) + word_freq(line[1]) + word_len(line[1])
            prev_pos = line[2]
            if test_exercise not in exercise_map:
                exercise_map[test_exercise] = exercise_i
                exercise_i += 1
                #continue
            test_exercise_tuple[0] += 1
            test_exercise_tuple[1].append(str(exercise_map[test_exercise]))
            test_exercise_tuple[2].append(test_label)
            num_test_instances += 1

print('test data formatted')
print('test data users:', len(test_exercise_tuples))
print('writing test data...')
try: os.remove(test_formatted_file)
except OSError: pass
f = open(test_formatted_file, 'w')
for tup in test_exercise_tuples[:max_users]:
    f.write(str(tup[0]) + '\n')
    f.write(','.join(tup[1]) + '\n')
    f.write(','.join(tup[2]) + '\n')
print('test data written to', test_formatted_file)
print(list(exercise_map.keys())[:5])
print('distinct reduced exercises:', len(exercise_map))
#print(word_counts)

formatting training exercises...
Loaded 285973 instances across 100000 exercises...
Loaded 567856 instances across 200000 exercises...
Loaded 850511 instances across 300000 exercises...
formatting done
number of exercises:  326792
total instances: 926657
n users 1213
max n exercises by user 7676
distinct reduced exercises: 2131
train data users: 1720
writing formatted training data...
data written to formatted_data_train_.csv
formatting test data...
test data formatted
test data users: 1205
writing test data...
test data written to formatted_data_test_.csv
['format:reverse_translateDET,Definite=Def|Gender=Masc|Number=Sing|fPOS=DET++', 'format:reverse_translateNOUN,Gender=Masc|Number=Sing|fPOS=NOUN++DET', 'format:reverse_translatePRON,Number=Sing|Person=1|PronType=Prs|fPOS=PRON++', 'format:reverse_translateVERB,Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|fPOS=VERB++PRON', 'format:reverse_translateDET,Definite=Ind|Gender=Fem|Number=Sing|PronType=Dem|fPOS=DET++VERB']
distinct re