## Steps for Formatting Data

1. Split data on SID
1. For each student, order questions by time
1. For each student, split questions by concept
1. Convert into a sequence of 1's and 0's
1. Combine all concept sequences for all students



In [1]:
import pandas as pd
import numpy as np
import csv
import scipy.io as sio

In [2]:
# Some rows of the KDD data list several concepts in one cell, joined by '~~'.
# Expand each such row into one output row per concept, and drop unlabeled rows
# (rows with no labeled concept).
# NOTE: a header line, if the input has one, has a non-empty column 19 and is
# therefore copied through to the output unchanged.

infile_path = "/Volumes/Slim2TB/classes/cs229/project/data/KDD Cup/kddcup_challenge/bridge_to_algebra_2008_2009_train.txt"
outfile_path = "/Volumes/Slim2TB/classes/cs229/project/data/KDD Cup/kddcup_challenge/bridge_to_algebra_2008_2009_train_clean.txt"

with open(infile_path, 'r') as infile, open(outfile_path, 'w') as outfile:
    csvreader = csv.reader(infile, dialect="excel-tab")
    csvwriter = csv.writer(outfile, dialect="excel-tab")
    for row in csvreader:
        # Column 19 is 'kc_ktracedskills'; column 20 holds the matching
        # opportunity counts. Blank or truncated lines cannot carry a concept
        # label, so skip them instead of failing with an IndexError.
        if len(row) < 21 or not row[19]:
            continue
        concepts = row[19].split('~~')
        opportunities = row[20].split('~~')
        # zip() pairs the i-th concept with the i-th opportunity count and stops
        # at the shorter of the two lists.
        for c, o in zip(concepts, opportunities):
            # Work on a copy so the row read from the file is never mutated.
            temprow = list(row)
            temprow[19] = c
            temprow[20] = o
            csvwriter.writerow(temprow)

In [3]:
# Build two lookup tables: student ID -> number of rows, and concept ID -> number of rows.

def get_kdd_info(file_path):
    """Tally rows per student and per concept in a tab-separated file.

    The student ID is read from column 1 and the concept ID from column 19 of
    every line (no line is treated as a header).

    Args:
        file_path: path of the tab-separated file to scan.

    Returns:
        A dict with two entries: 'student_count' maps each student ID to the
        number of rows it appears in, and 'concept_count' does the same for
        concept IDs.

    Raises:
        IndexError: if a line has fewer than 20 columns.
    """
    per_student = {}
    per_concept = {}
    with open(file_path, 'r') as data_file:
        for fields in csv.reader(data_file, dialect="excel-tab"):
            sid, cid = fields[1], fields[19]
            per_student[sid] = per_student.get(sid, 0) + 1
            per_concept[cid] = per_concept.get(cid, 0) + 1
    return {'student_count': per_student, 'concept_count': per_concept}

# Alternative location of the raw (uncleaned) training file, kept for reference:
#file_path = "/Users/qandeeltariq/Desktop/kddcup_challenge/bridge_to_algebra_2008_2009_train.txt"
file_path = "/Volumes/Slim2TB/classes/cs229/project/data/KDD Cup/kddcup_challenge/bridge_to_algebra_2008_2009_train_clean.txt"

# Tally rows per student and per concept in the cleaned file, then dump both tables.
kdd_info = get_kdd_info(file_path)
print(kdd_info['concept_count'])
print(kdd_info['student_count'])

In [6]:
# Summarise how many distinct concepts and students the tallies contain.
print("In the cleaned data there are %d concepts and %d students." % (
    len(kdd_info['concept_count']),
    len(kdd_info['student_count']),
))

In the cleaned data there are 808 concepts and 5986 students.


In [6]:
# Subsetting the data to a random sample of students (e.g. 50)
def get_random_sample(kdd_info, sample_size):
    """Draw a random sample of student IDs without replacement.

    Args:
        kdd_info: dict whose 'student_count' entry is a mapping keyed by
            student ID (only the keys are used).
        sample_size: number of distinct student IDs to draw.

    Returns:
        A numpy array of `sample_size` distinct student IDs.

    Raises:
        ValueError: raised by numpy when `sample_size` exceeds the number of
            students, since sampling is done without replacement.
    """
    # Iterating a dict yields its keys on both Python 2 and Python 3; the
    # per-student counts are not needed here.
    sid_list = list(kdd_info['student_count'])
    return np.random.choice(sid_list, sample_size, replace=False)
    
#random_students = get_random_sample(student_ids, 50)

In [7]:
def get_top_n_concepts(concept_dict, n, clean=True, top=True):
    """Return up to `n` concept names ranked by their count.

    Args:
        concept_dict: mapping of concept name -> count.
        n: maximum number of names to return; values <= 0 yield an empty list.
        clean: when True, skip empty names and any name containing 'enter'
            (case-insensitive).
        top: when True rank from highest count to lowest, otherwise from
            lowest to highest.

    Returns:
        A list of at most `n` concept names in ranked order. Names with equal
        counts keep the order in which `concept_dict` iterates them.
    """
    if n <= 0:
        return []
    # sorted() is stable in both directions, so one call covers top and bottom.
    ranked = sorted(concept_dict, key=concept_dict.get, reverse=top)
    if clean:
        ranked = [name for name in ranked if name and 'enter' not in name.lower()]
    return ranked[:n]
    
# The 100 most frequent concepts, with no name filtering applied.
n_concepts = get_top_n_concepts(
    kdd_info['concept_count'],
    100,
    clean=False,
    top=True,
)

In [9]:
def choose_random_concepts(concept_list, sample_size):
    """Pick `sample_size` distinct entries of `concept_list` at random.

    Sampling is done without replacement via numpy; the result is a numpy array.
    """
    picked = np.random.choice(concept_list, size=sample_size, replace=False)
    return picked

# Draw 10 of the selected concepts at random (no seed is set in this cell,
# so the draw differs from run to run).
random_concepts = choose_random_concepts(concept_list=n_concepts, sample_size=10)

array(['Identify number as common multiple-1',
       'Identify number as common factor-1',
       'Compare Options - operation-1',
       'Identify percent change as increase or decrease-1',
       'Identify common denominator-1',
       'Enter smaller initial in diagram -- given-1',
       'Identify no more factors-1', 'Enter items numerator-1',
       'Identify proper from option 2-1',
       'Calculate difference digit -- borrow in-1'], 
      dtype='|S60')

In [10]:
# Make a subset of the cleaned KDD Cup data file that keeps only the rows whose
# concept is one of our randomly chosen concepts.
# This is tailored for the KDD Cup data; we will have to modify this when using other data sources.
# The input is the "_clean" file, which already has one row per concept, so the
# output is identical to the input except that rows for other concepts are dropped.

infile_path = "/Volumes/Slim2TB/classes/cs229/project/data/KDD Cup/kddcup_challenge/bridge_to_algebra_2008_2009_train_clean.txt"
subset_path = "/Volumes/Slim2TB/classes/cs229/project/data/KDD Cup/kddcup_challenge/bridge_to_algebra_2008_2009_train_nConcepts.txt"

# random_concepts is a numpy array; `x in array` rescans the whole array for
# every row. Build a set of plain strings once for constant-time, exact-match
# lookups. (To filter on another list of concepts, build the set from it instead.)
wanted_concepts = set(np.asarray(random_concepts).tolist())

total_rows = 0
written_rows = 0
with open(infile_path, 'r') as infile, open(subset_path, 'w') as outfile:
    csvreader = csv.reader(infile, dialect="excel-tab")
    csvwriter = csv.writer(outfile, dialect="excel-tab")
    for row in csvreader:
        concept_id = row[19]
        total_rows += 1
        if concept_id not in wanted_concepts:
            continue
        csvwriter.writerow(row)
        written_rows += 1

print("Wrote %d rows out of %d total rows." % (written_rows, total_rows))

# Re-tally the subset to confirm how many concepts and students survived the filter.
n_concept_info = get_kdd_info(subset_path)
print("In the subsetted data there are %d concepts and %d students." % (
    len(n_concept_info['concept_count']),
    len(n_concept_info['student_count']),
))

12536294
993661


In [11]:
# Path of the concept-filtered subset file (the same file that subset_path points to).
n_concepts_path = "/Volumes/Slim2TB/classes/cs229/project/data/KDD Cup/kddcup_challenge/bridge_to_algebra_2008_2009_train_nConcepts.txt"


In the subsetted data there are 10 concepts and 4938 students.


In [23]:
# Write data to two matrices

def convert_to_matrices(csvfile, min_seq_length, max_rows=500):
    """Build per-student answer and concept matrices from a KDD-format file.

    The file is read as headerless, tab-separated data with the 21 KDD columns
    named below. Rows are ordered by 'step_start_time' and grouped by student.
    For each student, concepts with fewer than `min_seq_length` rows are
    dropped and the remaining rows are written left to right into one matrix row.

    Args:
        csvfile: path of the tab-separated input file (no header line).
        min_seq_length: minimum number of rows a student must have for a
            concept for that concept's rows to be kept.
        max_rows: keep only this many students, choosing those with the most
            retained rows. Pass None to keep every student with data.
            Defaults to 500.

    Returns:
        A list [answer_matrix, concept_matrix, concept_to_id, id_to_concept,
        student_to_id, id_to_student]:

        * answer_matrix: 'correct_first_attempt' + 1 per step, 0 where there
          is no data (so, assuming that column is 0/1, 1 = incorrect and
          2 = correct).
        * concept_matrix: the concept id (starting at 1) of each step, 0 where
          there is no data.
        * concept_to_id / id_to_concept: concept name <-> id lookups.
        * student_to_id / id_to_student: student id <-> row index lookups.
          NOTE: these indices refer to the matrix rows *before* empty rows are
          dropped and rows are reordered, so they do not index the returned
          matrices.

    Raises:
        ValueError: if `max_rows` is negative.
    """
    if max_rows is not None and max_rows < 0:
        raise ValueError("max_rows must be None or a non-negative integer")

    kdd_col_names = ['row','student_id','problem_hierarchy', 'problem_name', 'problem_view','step_name','step_start_time','first_transaction_time','correct_transaction_time','step_end_time', 'step_duration','correct_step_duration', 'error_step_duration','correct_first_attempt', 'incorrects', 'hints', 'corrects', 'kc_subskills', 'opportunity_subskills', 'kc_ktracedskills', 'opportunity_ktracedskills']
    # Columns 6-9 are timestamps. `infer_datetime_format` is intentionally not
    # passed: it is deprecated in pandas 2.x, which infers the format by default.
    pd_allstudents = pd.read_csv(csvfile, sep='\t', names=kdd_col_names, parse_dates=[6, 7, 8, 9])
    pd_allstudents = pd_allstudents.sort_values('step_start_time')  # Sort by datetime

    # Group on a scalar key so every group name is the bare student id; with a
    # one-element list, newer pandas releases yield 1-tuples, which would not
    # be found in student_to_id.
    grouped_by_student = pd_allstudents.groupby('student_id')

    # The matrix width is the longest per-student sequence before any filtering.
    group_sizes = grouped_by_student.size()
    max_seq_len = int(group_sizes.max()) if len(group_sizes) else 0

    # Concept ids start at 1 because 0 marks "no data" in concept_matrix.
    concept_to_id = {}
    id_to_concept = {}
    for i, c in enumerate(set(pd_allstudents['kc_ktracedskills']), 1):
        concept_to_id[c] = i
        id_to_concept[i] = c

    # Student ids start at 0 and double as (pre-filter) matrix row indices.
    student_to_id = {}
    id_to_student = {}
    for i, s in enumerate(set(pd_allstudents['student_id'])):
        student_to_id[s] = i
        id_to_student[i] = s

    answer_matrix = np.zeros([len(grouped_by_student), max_seq_len])
    concept_matrix = np.zeros([len(grouped_by_student), max_seq_len])

    for name, group in grouped_by_student:
        # Drop concepts where this student answered fewer than min_seq_length questions
        cs_per_student = group['kc_ktracedskills'].value_counts()
        to_keep = cs_per_student[cs_per_student >= min_seq_length].index
        group = group[group['kc_ktracedskills'].isin(to_keep)]

        seq_len = len(group)
        if seq_len == 0:
            continue

        # Fill the student's row left to right in time order; +1 keeps 0 free
        # to mean "no data".
        outrow = student_to_id[name]
        answer_matrix[outrow, :seq_len] = group['correct_first_attempt'].values + 1
        concept_matrix[outrow, :seq_len] = group['kc_ktracedskills'].map(concept_to_id).values

    # Drop students with no retained data. A single mask is applied to both
    # matrices so that their rows always stay aligned.
    has_data = (concept_matrix > 0).any(axis=1)
    answer_matrix = answer_matrix[has_data]
    concept_matrix = concept_matrix[has_data]

    # Order rows from most to fewest retained steps (a stable sort keeps tied
    # rows in their existing order), then optionally keep only the top max_rows.
    concept_count = np.sum(concept_matrix > 0, axis=1)
    rows_to_keep = np.argsort(-concept_count, kind='mergesort')
    if max_rows is not None:
        rows_to_keep = rows_to_keep[:max_rows]

    answer_matrix = answer_matrix[rows_to_keep, :]
    concept_matrix = concept_matrix[rows_to_keep, :]

    return [answer_matrix, concept_matrix, concept_to_id, id_to_concept, student_to_id, id_to_student]
    
# Build the answer (X) and concept (C) matrices plus the id lookup tables from
# the concept-filtered subset, dropping per-student concepts with fewer than 10 rows.
n_concepts_path = "/Volumes/Slim2TB/classes/cs229/project/data/KDD Cup/kddcup_challenge/bridge_to_algebra_2008_2009_train_nConcepts.txt"
X, C, c_id, id_c, s_id, id_s = convert_to_matrices(
    csvfile=n_concepts_path,
    min_seq_length=10,
)

[3582, 3472, 3189, 3701, 831, 4188, 777, 480, 268, 1996, 2929, 1427, 2765, 2090, 4009, 2308, 16, 835, 3029, 3941, 3600, 1709, 2767, 2750, 1410, 3432, 3348, 3624, 3466, 797, 3745, 2762, 3973, 350, 3057, 715, 386, 888, 3885, 2023, 1322, 3229, 322, 2648, 134, 3719, 4225, 1899, 522, 2350, 3530, 862, 1771, 2943, 3815, 179, 1745, 2261, 3129, 2383, 1067, 3835, 520, 3017, 2949, 3426, 2544, 2570, 1591, 1049, 2921, 2626, 2661, 4219, 2487, 2985, 1621, 2144, 2102, 2721, 985, 1072, 923, 1210, 2452, 1845, 289, 1636, 2477, 3015, 504, 4207, 2885, 3494, 318, 2470, 647, 2788, 392, 976, 3489, 675, 468, 844, 388, 3286, 1154, 1867, 1311, 3979, 2930, 978, 3215, 730, 2540, 860, 832, 2438, 2701, 1081, 2209, 1875, 3797, 903, 3382, 2129, 116, 206, 3086, 3070, 2031, 4190, 1859, 3350, 1930, 2651, 4112, 4052, 4124, 191, 1103, 3512, 1011, 1588, 1425, 2226, 764, 1085, 2691, 3966, 1816, 453, 853, 2432, 1519, 1607, 3992, 1043, 3415, 12, 2871, 2374, 788, 1388, 2685, 3939, 1334, 2703, 4012, 611, 3101, 3270, 2490, 3145, 

In [25]:
# Report the matrix shapes and lookup-table sizes, then save both matrices
# to a .mat file for MATLAB.
print(X.shape)
print(C.shape)
print(len(c_id))
print(len(id_c))
print(len(s_id))
print(len(id_s))
print(type(X))
sio.savemat(
    'KDD_sample.mat',
    {'KDD_sample_X': X, 'KDD_sample_C': C},
)

(500, 1368)
(500, 1368)
10
10
4938
4938
<type 'numpy.ndarray'>


In [17]:
# Same report as for the sample, but the matrices are saved under the KDD_X / KDD_C
# names in KDD.mat.
print(X.shape)
print(C.shape)
print(len(c_id))
print(len(id_c))
print(len(s_id))
print(len(id_s))
print(type(X))
sio.savemat(
    'KDD.mat',
    {'KDD_X': X, 'KDD_C': C},
)

(4235, 1368)
(4235, 1368)
10
10
4938
4938
<type 'numpy.ndarray'>


In [22]:
# Combine sequences of answers (1's and 0's) by concept: final_dict maps each
# cleaned concept name to a dict of {student: answer sequence}.
# list_alldata is iterated as a mapping of student -> {concept: sequence}.
# NOTE: two concept names that clean to the same string share one entry.

from collections import defaultdict
final_dict = defaultdict(dict)
# .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only).
for student, answers in list_alldata.items():
    for concept, sequence in answers.items():
        # Clean concept names so matlab can read them: lower-case, and turn every
        # non-alphanumeric character (spaces included) into an underscore.
        new_concept = "".join([c if c.isalnum() else "_" for c in concept.lower()])
        final_dict[new_concept][student] = sequence
        
#sio.savemat('../matlab/final_dict.mat', final_dict)