In [1]:
import numpy as np
import pandas as pd
import joblib
from scipy.sparse import hstack, coo_matrix, csr_matrix

In [2]:
# Load the tab-separated training set into a DataFrame.
data = pd.read_csv("algebra_2008_2009_train_trimmed3_remove_duplicates_sorted.csv", sep='\t')

In [3]:
# Summary statistics for every column (numeric and non-numeric) of the training data.
data.describe(include='all')

Unnamed: 0,Anon Student Id,Problem Name,Problem View,Step Name,First Transaction Time,Correct First Attempt,Incorrects,Hints,Corrects,KC(SubSkills),Opportunity(SubSkills),KC(KTracedSkills),Opportunity(KTracedSkills),KC(Rules),Opportunity(Rules),Unit,Sections
count,8483920,8483920,8483920.0,8483920,8483920,8483920.0,8483920.0,8483920.0,8483920.0,6204248,6204248.0,4296385,4296385.0,8201432,8201432.0,8483920,8483920
unique,3310,188368,,700635,823557,,,,,1815,910506.0,921,157173.0,2780,10780.0,42,165
top,stu_ea97537407,L5FB16,,FinalAnswer,2009-04-27 12:55:20.0,,,,,Bogus skill,1.0,Entering a given-1,1.0,DISTANCE-MIDPOINT-PREFILLED-BOGUS-HINT,1.0,Unit CTA1_02,Section CTA1_02-2
freq,15608,52755,,255762,109,,,,,408686,229924.0,440356,243040.0,408686,436844.0,715873,527203
mean,,,1.091596,,,0.8595012,0.3432128,0.08589826,1.008225,,,,,,,,
std,,,0.3611967,,,0.3475039,3.904958,0.4622137,0.3963328,,,,,,,,
min,,,1.0,,,0.0,0.0,0.0,0.0,,,,,,,,
25%,,,1.0,,,1.0,0.0,0.0,1.0,,,,,,,,
50%,,,1.0,,,1.0,0.0,0.0,1.0,,,,,,,,
75%,,,1.0,,,1.0,0.0,0.0,1.0,,,,,,,,


In [4]:
# Build a dict with one (initially empty) history deque per student-KC pair.
# Keys are the student id concatenated directly with the KC name.
from collections import deque

# The three KC columns; each cell is a '~~'-separated string of KC names or NaN.
kc_columns = ['KC(SubSkills)', 'KC(KTracedSkills)', 'KC(Rules)']

id_kc_to_hint = {}
for index, row in enumerate(zip(data['Anon Student Id'], *[data[col] for col in kc_columns])):
    stu_id = row[0]
    for kc_cell in row[1:]:
        # pd.isnull recognises every NaN/None value; an identity test against
        # np.nan only matches that one singleton object and would let other
        # NaN objects through to .split().
        if pd.isnull(kc_cell):
            continue
        for kc in kc_cell.split('~~'):
            id_kc = "".join([stu_id, kc])
            if id_kc not in id_kc_to_hint:
                id_kc_to_hint[id_kc] = deque([])

    # Progress indicator; print() with a single argument prints the same text
    # on Python 2 and Python 3.
    if index % 500000 == 0:
        print(index)

0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000


In [5]:
# Create an empty list for every student-unit pair found in the training data.
# The lists are only created here; they are meant to hold the test-set rows
# that share the same student-unit pair.  Keys are the student id concatenated
# directly with the unit name.
id_unit_dict = {}


for index, (stu_id, unit) in enumerate(zip(data['Anon Student Id'], data['Unit'])):
    id_unit = "".join([stu_id, unit])
    if id_unit not in id_unit_dict:
        id_unit_dict[id_unit] = []
    # Progress indicator; print() with a single argument prints the same text
    # on Python 2 and Python 3.
    if index % 500000 == 0:
        print(index)

0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000


In [6]:
# Load the tab-separated test set, then replace 'Problem Hierarchy' by its
# first and second ', '-separated parts, stored as 'Unit' and 'Sections'.
testdata = pd.read_csv('algebra_2008_2009_test.txt', sep='\t')
split_object = testdata['Problem Hierarchy'].str.split(', ')
testdata['Unit'] = split_object.str.get(0)
testdata['Sections'] = split_object.str.get(1)
testdata = testdata.drop('Problem Hierarchy', axis=1)

In [7]:
# For each test row, append its position to the list of its student-unit pair.
# A pair that has no entry in id_unit_dict raises KeyError here.
for row_position, id_and_unit in enumerate(zip(testdata['Anon Student Id'], testdata['Unit'])):
    id_unit_dict["".join(id_and_unit)].append(row_position)

In [8]:
# Containers for the hint features.
# *_prev_hints: average hints over the stored previous steps with the same KC components.
# *_prev_hints_available: 1 if such previous steps exist, 0 otherwise.
# Training lists grow by appending; test lists are pre-sized with one None per test row.
training_prev_hints, training_prev_hints_available = [], []
n_test_rows = testdata['Anon Student Id'].size
testing_prev_hints = [None] * n_test_rows
testing_prev_hints_available = [None] * n_test_rows


In [9]:
# CREATE THE FEATURES FOR TRAINING AND TESTING IN ONE GO!!
#
# For every training row: the mean of the 'Hints' values stored for the row's
# student-KC pairs (at most the last `no_of_previous_entries_to_use` values per
# pair), plus a 0/1 flag telling whether any such history existed.  The test
# rows of a student-unit pair get the same feature, computed from the history
# as it stands when a run of training rows of that pair ends.
# The helpers are defined in this cell so that it can be run on its own.

no_of_previous_entries_to_use = 6


def _student_kc_keys(stu_id, kc_cells):
    """Return the distinct student+KC keys named by one row, in first-seen order.

    `kc_cells` holds one value per KC column: a '~~'-separated string of KC
    names, or NaN/None when the column is empty for that row.
    """
    keys = []
    seen = set()
    for kc_cell in kc_cells:
        # pd.isnull recognises every NaN/None value; `is not np.nan` only
        # matches the singleton and lets other NaN objects reach .split().
        if pd.isnull(kc_cell):
            continue
        for kc in kc_cell.split('~~'):
            id_kc = "".join([stu_id, kc])
            if id_kc not in seen:
                seen.add(id_kc)
                keys.append(id_kc)
    return keys


def _history_mean(keys, history):
    """Return (mean, 1) over the values stored for `keys`, or (0, 0) if there are none."""
    to_average = []
    for id_kc in keys:
        # Pairs without an entry in `history` contribute nothing.
        to_average.extend(history.get(id_kc, ()))
    if not to_average:
        return 0, 0
    return np.mean(to_average), 1


def _build_temporal_feature(train, test, rows_by_idunit, history, value_column, window):
    """Compute the rolling per-KC average of `value_column` for train and test rows.

    Parameters:
        train, test: DataFrames with 'Anon Student Id', 'Unit' (train only) and
            the three KC columns; `train` also needs `value_column`.
        rows_by_idunit: dict mapping student+unit key -> list of test row positions.
        history: dict mapping student+KC key -> deque; cleared first, then
            updated in place while the training rows are scanned.
        value_column: name of the training column whose values are averaged.
        window: maximum number of values kept per student-KC pair.

    Returns:
        (train_mean, train_available, test_mean, test_available) as lists.
        Test rows whose student-unit pair never occurs in `train` stay None.

    Raises:
        KeyError: if a training row names a student-KC pair missing from `history`.
    """
    kc_columns = ['KC(SubSkills)', 'KC(KTracedSkills)', 'KC(Rules)']

    # Start from empty histories so that re-running the cell is repeatable.
    for past_values in history.values():
        past_values.clear()

    # rows_by_idunit stores positions produced by enumerate(), so the test
    # columns are read positionally through their underlying arrays.
    test_ids = test['Anon Student Id'].values
    test_kc_cells = [test[col].values for col in kc_columns]
    test_mean = [None] * len(test_ids)
    test_available = [None] * len(test_ids)
    train_mean = []
    train_available = []

    def fill_test_rows(idunit):
        # Evaluate the feature for every test row of this student-unit pair.
        for idx in rows_by_idunit.get(idunit, ()):
            keys = _student_kc_keys(test_ids[idx], [cells[idx] for cells in test_kc_cells])
            test_mean[idx], test_available[idx] = _history_mean(keys, history)

    previous_idunit = None
    train_rows = zip(train['Anon Student Id'], train['Unit'], train[value_column],
                     *[train[col] for col in kc_columns])
    for index, row in enumerate(train_rows):
        stu_id, unit, value = row[:3]
        current_idunit = "".join([stu_id, unit])

        # The student-unit pair changed: the test rows of the pair that just
        # ended are filled from the history accumulated so far.
        if previous_idunit is not None and previous_idunit != current_idunit:
            fill_test_rows(previous_idunit)
        previous_idunit = current_idunit

        # Read the whole history BEFORE recording the current value, so a KC
        # listed more than once in the row cannot leak the row's own value
        # into its feature, and the value is stored once per pair.
        keys = _student_kc_keys(stu_id, row[3:])
        mean_value, available = _history_mean(keys, history)
        train_mean.append(mean_value)
        train_available.append(available)
        for id_kc in keys:
            past_values = history[id_kc]
            past_values.append(value)
            if len(past_values) > window:
                past_values.popleft()

        # Progress indicator; same output on Python 2 and Python 3.
        if index % 500000 == 0:
            print(index)

    # The last student-unit pair is never followed by a change, so fill it here.
    if previous_idunit is not None:
        fill_test_rows(previous_idunit)

    return train_mean, train_available, test_mean, test_available


(training_prev_hints, training_prev_hints_available,
 testing_prev_hints, testing_prev_hints_available) = _build_temporal_feature(
    data, testdata, id_unit_dict, id_kc_to_hint, 'Hints', no_of_previous_entries_to_use)


0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000


### Now do the same for incorrects

In [10]:
# Give every student-KC pair in id_kc_to_hint a fresh, empty deque.
for pair_key in list(id_kc_to_hint):
    id_kc_to_hint[pair_key] = deque()

In [11]:
# Containers for the incorrects features.
# *_prev_incs: average incorrects over the stored previous steps with the same KC components.
# *_prev_incs_available: 1 if such previous steps exist, 0 otherwise.
# Training lists grow by appending; test lists are pre-sized with one None per test row.
training_prev_incs, training_prev_incs_available = [], []
n_test_rows = testdata['Anon Student Id'].size
testing_prev_incs = [None] * n_test_rows
testing_prev_incs_available = [None] * n_test_rows


In [12]:
# CREATE THE FEATURES FOR TRAINING AND TESTING IN ONE GO!!
#
# For every training row: the mean of the 'Incorrects' values stored for the
# row's student-KC pairs (at most the last `no_of_previous_entries_to_use`
# values per pair), plus a 0/1 flag telling whether any such history existed.
# The test rows of a student-unit pair get the same feature, computed from the
# history as it stands when a run of training rows of that pair ends.
# The helpers are defined in this cell so that it can be run on its own.

no_of_previous_entries_to_use = 6


def _student_kc_keys(stu_id, kc_cells):
    """Return the distinct student+KC keys named by one row, in first-seen order.

    `kc_cells` holds one value per KC column: a '~~'-separated string of KC
    names, or NaN/None when the column is empty for that row.
    """
    keys = []
    seen = set()
    for kc_cell in kc_cells:
        # pd.isnull recognises every NaN/None value; `is not np.nan` only
        # matches the singleton and lets other NaN objects reach .split().
        if pd.isnull(kc_cell):
            continue
        for kc in kc_cell.split('~~'):
            id_kc = "".join([stu_id, kc])
            if id_kc not in seen:
                seen.add(id_kc)
                keys.append(id_kc)
    return keys


def _history_mean(keys, history):
    """Return (mean, 1) over the values stored for `keys`, or (0, 0) if there are none."""
    to_average = []
    for id_kc in keys:
        # Pairs without an entry in `history` contribute nothing.
        to_average.extend(history.get(id_kc, ()))
    if not to_average:
        return 0, 0
    return np.mean(to_average), 1


def _build_temporal_feature(train, test, rows_by_idunit, history, value_column, window):
    """Compute the rolling per-KC average of `value_column` for train and test rows.

    Parameters:
        train, test: DataFrames with 'Anon Student Id', 'Unit' (train only) and
            the three KC columns; `train` also needs `value_column`.
        rows_by_idunit: dict mapping student+unit key -> list of test row positions.
        history: dict mapping student+KC key -> deque; cleared first, then
            updated in place while the training rows are scanned.
        value_column: name of the training column whose values are averaged.
        window: maximum number of values kept per student-KC pair.

    Returns:
        (train_mean, train_available, test_mean, test_available) as lists.
        Test rows whose student-unit pair never occurs in `train` stay None.

    Raises:
        KeyError: if a training row names a student-KC pair missing from `history`.
    """
    kc_columns = ['KC(SubSkills)', 'KC(KTracedSkills)', 'KC(Rules)']

    # Start from empty histories so that re-running the cell is repeatable.
    for past_values in history.values():
        past_values.clear()

    # rows_by_idunit stores positions produced by enumerate(), so the test
    # columns are read positionally through their underlying arrays.
    test_ids = test['Anon Student Id'].values
    test_kc_cells = [test[col].values for col in kc_columns]
    test_mean = [None] * len(test_ids)
    test_available = [None] * len(test_ids)
    train_mean = []
    train_available = []

    def fill_test_rows(idunit):
        # Evaluate the feature for every test row of this student-unit pair.
        for idx in rows_by_idunit.get(idunit, ()):
            keys = _student_kc_keys(test_ids[idx], [cells[idx] for cells in test_kc_cells])
            test_mean[idx], test_available[idx] = _history_mean(keys, history)

    previous_idunit = None
    train_rows = zip(train['Anon Student Id'], train['Unit'], train[value_column],
                     *[train[col] for col in kc_columns])
    for index, row in enumerate(train_rows):
        stu_id, unit, value = row[:3]
        current_idunit = "".join([stu_id, unit])

        # The student-unit pair changed: the test rows of the pair that just
        # ended are filled from the history accumulated so far.
        if previous_idunit is not None and previous_idunit != current_idunit:
            fill_test_rows(previous_idunit)
        previous_idunit = current_idunit

        # Read the whole history BEFORE recording the current value, so a KC
        # listed more than once in the row cannot leak the row's own value
        # into its feature, and the value is stored once per pair.
        keys = _student_kc_keys(stu_id, row[3:])
        mean_value, available = _history_mean(keys, history)
        train_mean.append(mean_value)
        train_available.append(available)
        for id_kc in keys:
            past_values = history[id_kc]
            past_values.append(value)
            if len(past_values) > window:
                past_values.popleft()

        # Progress indicator; same output on Python 2 and Python 3.
        if index % 500000 == 0:
            print(index)

    # The last student-unit pair is never followed by a change, so fill it here.
    if previous_idunit is not None:
        fill_test_rows(previous_idunit)

    return train_mean, train_available, test_mean, test_available


(training_prev_incs, training_prev_incs_available,
 testing_prev_incs, testing_prev_incs_available) = _build_temporal_feature(
    data, testdata, id_unit_dict, id_kc_to_hint, 'Incorrects', no_of_previous_entries_to_use)


0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000


### Now do the same for Correct First Attempt (CFA)

In [13]:
# Give every student-KC pair in id_kc_to_hint a fresh, empty deque.
for pair_key in list(id_kc_to_hint):
    id_kc_to_hint[pair_key] = deque()

In [14]:
# Containers for the Correct First Attempt (CFA) features.
# *_prev_cfa: average CFA over the stored previous steps with the same KC components.
# *_prev_cfa_available: 1 if such previous steps exist, 0 otherwise.
# Training lists grow by appending; test lists are pre-sized with one None per test row.
training_prev_cfa, training_prev_cfa_available = [], []
n_test_rows = testdata['Anon Student Id'].size
testing_prev_cfa = [None] * n_test_rows
testing_prev_cfa_available = [None] * n_test_rows


In [15]:
# CREATE THE FEATURES FOR TRAINING AND TESTING IN ONE GO!!
#
# For every training row: the mean of the 'Correct First Attempt' values stored
# for the row's student-KC pairs (at most the last
# `no_of_previous_entries_to_use` values per pair), plus a 0/1 flag telling
# whether any such history existed.  The test rows of a student-unit pair get
# the same feature, computed from the history as it stands when a run of
# training rows of that pair ends.
# The helpers are defined in this cell so that it can be run on its own.

no_of_previous_entries_to_use = 6


def _student_kc_keys(stu_id, kc_cells):
    """Return the distinct student+KC keys named by one row, in first-seen order.

    `kc_cells` holds one value per KC column: a '~~'-separated string of KC
    names, or NaN/None when the column is empty for that row.
    """
    keys = []
    seen = set()
    for kc_cell in kc_cells:
        # pd.isnull recognises every NaN/None value; `is not np.nan` only
        # matches the singleton and lets other NaN objects reach .split().
        if pd.isnull(kc_cell):
            continue
        for kc in kc_cell.split('~~'):
            id_kc = "".join([stu_id, kc])
            if id_kc not in seen:
                seen.add(id_kc)
                keys.append(id_kc)
    return keys


def _history_mean(keys, history):
    """Return (mean, 1) over the values stored for `keys`, or (0, 0) if there are none."""
    to_average = []
    for id_kc in keys:
        # Pairs without an entry in `history` contribute nothing.
        to_average.extend(history.get(id_kc, ()))
    if not to_average:
        return 0, 0
    return np.mean(to_average), 1


def _build_temporal_feature(train, test, rows_by_idunit, history, value_column, window):
    """Compute the rolling per-KC average of `value_column` for train and test rows.

    Parameters:
        train, test: DataFrames with 'Anon Student Id', 'Unit' (train only) and
            the three KC columns; `train` also needs `value_column`.
        rows_by_idunit: dict mapping student+unit key -> list of test row positions.
        history: dict mapping student+KC key -> deque; cleared first, then
            updated in place while the training rows are scanned.
        value_column: name of the training column whose values are averaged.
        window: maximum number of values kept per student-KC pair.

    Returns:
        (train_mean, train_available, test_mean, test_available) as lists.
        Test rows whose student-unit pair never occurs in `train` stay None.

    Raises:
        KeyError: if a training row names a student-KC pair missing from `history`.
    """
    kc_columns = ['KC(SubSkills)', 'KC(KTracedSkills)', 'KC(Rules)']

    # Start from empty histories so that re-running the cell is repeatable.
    for past_values in history.values():
        past_values.clear()

    # rows_by_idunit stores positions produced by enumerate(), so the test
    # columns are read positionally through their underlying arrays.
    test_ids = test['Anon Student Id'].values
    test_kc_cells = [test[col].values for col in kc_columns]
    test_mean = [None] * len(test_ids)
    test_available = [None] * len(test_ids)
    train_mean = []
    train_available = []

    def fill_test_rows(idunit):
        # Evaluate the feature for every test row of this student-unit pair.
        for idx in rows_by_idunit.get(idunit, ()):
            keys = _student_kc_keys(test_ids[idx], [cells[idx] for cells in test_kc_cells])
            test_mean[idx], test_available[idx] = _history_mean(keys, history)

    previous_idunit = None
    train_rows = zip(train['Anon Student Id'], train['Unit'], train[value_column],
                     *[train[col] for col in kc_columns])
    for index, row in enumerate(train_rows):
        stu_id, unit, value = row[:3]
        current_idunit = "".join([stu_id, unit])

        # The student-unit pair changed: the test rows of the pair that just
        # ended are filled from the history accumulated so far.
        if previous_idunit is not None and previous_idunit != current_idunit:
            fill_test_rows(previous_idunit)
        previous_idunit = current_idunit

        # Read the whole history BEFORE recording the current value, so a KC
        # listed more than once in the row cannot leak the row's own value
        # (here the prediction target) into its feature, and the value is
        # stored once per pair.
        keys = _student_kc_keys(stu_id, row[3:])
        mean_value, available = _history_mean(keys, history)
        train_mean.append(mean_value)
        train_available.append(available)
        for id_kc in keys:
            past_values = history[id_kc]
            past_values.append(value)
            if len(past_values) > window:
                past_values.popleft()

        # Progress indicator; same output on Python 2 and Python 3.
        if index % 500000 == 0:
            print(index)

    # The last student-unit pair is never followed by a change, so fill it here.
    if previous_idunit is not None:
        fill_test_rows(previous_idunit)

    return train_mean, train_available, test_mean, test_available


(training_prev_cfa, training_prev_cfa_available,
 testing_prev_cfa, testing_prev_cfa_available) = _build_temporal_feature(
    data, testdata, id_unit_dict, id_kc_to_hint, 'Correct First Attempt',
    no_of_previous_entries_to_use)


0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000


## Save the features to file

In [16]:
# Stack the six training feature lists and transpose, giving one row per
# training example with columns: hints, hints flag, incorrects, incorrects
# flag, CFA, CFA flag.  The bare name on the last line displays the array.
train_feature_lists = [
    training_prev_hints, training_prev_hints_available,
    training_prev_incs, training_prev_incs_available,
    training_prev_cfa, training_prev_cfa_available,
]
train_temporal_features = np.transpose(np.array(train_feature_lists))
train_temporal_features

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       ..., 
       [ 0.        ,  1.        ,  0.08333333,  1.        ,  1.        ,
         1.        ],
       [ 0.08333333,  1.        ,  0.91666667,  1.        ,  0.91666667,
         1.        ],
       [ 0.        ,  1.        ,  0.        ,  1.        ,  1.        ,
         1.        ]])

In [19]:
# Stack the six test feature lists and transpose, giving one row per test
# example with columns: hints, hints flag, incorrects, incorrects flag, CFA,
# CFA flag.
test_feature_lists = [
    testing_prev_hints, testing_prev_hints_available,
    testing_prev_incs, testing_prev_incs_available,
    testing_prev_cfa, testing_prev_cfa_available,
]
test_temporal_features = np.array(test_feature_lists).T

In [17]:
import joblib

In [18]:
# Write the training feature matrix to disk with joblib, compression level 3.
# NOTE(review): the target directory is named 'algebra_2005_2008_pickles' while
# the input files are 'algebra_2008_2009_*' - confirm this is the intended location.
joblib.dump(train_temporal_features, 'algebra_2005_2008_pickles/train_temporal_features', compress=3)

['algebra_2005_2008_pickles/train_temporal_features']

In [20]:
# Write the test feature matrix to disk with joblib, compression level 3.
# NOTE(review): the target directory is named 'algebra_2005_2008_pickles' while
# the input files are 'algebra_2008_2009_*' - confirm this is the intended location.
joblib.dump(test_temporal_features, 'algebra_2005_2008_pickles/test_temporal_features', compress=3)

['algebra_2005_2008_pickles/test_temporal_features']