In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from gensim.models import Word2Vec
from sklearn.manifold import TSNE 

In [None]:
#Read in the skill builder dataset and keep only rows with original == 1,
#attempt_count == 1 and a non-null skill_name
filename = 'skill_builder_data_corrected.csv'
df = pd.read_csv(filename, encoding='ISO-8859-1', low_memory=False).loc[
    lambda frame: frame['original'].eq(1)
    & frame['attempt_count'].eq(1)
    & frame['skill_name'].notnull()
]

In [None]:
#Read in the problem text dataset (merged onto the response log further down)
filename2 = '../data/problems.csv'
problems = pd.read_csv(
    filename2,
    encoding='ISO-8859-1',
    low_memory=False,
)

In [None]:
#Keep only the students with more than 50 logged problem rows
students_list = df.groupby('user_id')['problem_id'].count()  #non-null problem_id rows per user
students_id = students_list.loc[students_list.gt(50)].index  #user_ids above the threshold
df2 = df.loc[df['user_id'].isin(students_id)]  #rows that belong to those students

In [None]:
#Join the problem text onto the responses, matching on assistment_id AND problem_id
#Each assistment_id can have multiple problem_id's
#But each problem_id appears to only be associated with 1 assistment_id
#A left join keeps responses that have no matching problem description;
#indicator=True adds the '_merge' column that records where each row came from
df3 = df2.merge(
    problems,
    on=['assistment_id', 'problem_id'],
    how='left',
    indicator=True,
)
df3.shape

In [None]:
#Alternative merge: an inner join drops responses whose problem_id has no
#matching problem description
#(Author's note: this amounts to 120 rows and 23 unique problem_id's.)
df4 = df2.merge(
    problems,
    on=['assistment_id', 'problem_id'],
    how='inner',
)
df4.shape

In [None]:
#Count the responses without problem text: the merge indicator marks rows that
#found no partner in `problems` as 'left_only'
nondescript = df3.loc[df3['_merge'] == 'left_only', 'problem_id']
print('Number of rows without description: ', len(nondescript))
print('Number of unique problems without description: ', nondescript.nunique())

In [None]:
#Sanity check: is any problem_id associated with more than one assistment_id?
bool(df3.groupby('problem_id')['assistment_id'].nunique().gt(1).any())
#Author's note on the result: no problem is associated with more than one assistment_id

In [None]:
#Random queries
#Both spot checks are returned as one tuple so that both are displayed; a cell
#only shows its last expression, so the first result used to be discarded.
(
    df.loc[df['problem_id'] == 58551, 'assistment_id'].nunique(),  #assistments for problem 58551
    df.loc[df['assistment_id'] == 76958, 'problem_id'].nunique(),  #problems in assistment 76958
)

In [None]:
#Number of unique labeled skills in the merged frame (author's recorded result: 107)
df3['skill_name'].nunique()

In [None]:
#List the columns available in the problem text table
problems.columns

In [None]:
#Number of distinct skill_ids recorded for each (user_id, problem_id) pair
df.groupby(['user_id', 'problem_id']).skill_id.nunique()

In [None]:
#Inspect the 'position' values logged for problem 57647 by user 14
df.loc[(df['problem_id'] == 57647) & (df['user_id'] == 14), 'position']
#Author's note: multiple rows for a particular problem with multiple skills are the same (except for the skill info)

In [None]:
#Number of distinct problem_ids among rows with original == 1
df.loc[df['original'] == 1, 'problem_id'].nunique()

In [None]:
#Number of rows logged for each (user_id, problem_id) pair
df.groupby(['user_id','problem_id']).size()

In [None]:
#Number of distinct problem_ids in the filtered response log
df['problem_id'].nunique()

In [None]:
#List the columns of the merged (responses + problem text) frame
df3.columns

# Clustering

# DKT

In [None]:
#Load the per-student response / skill / assistment tables; the 'Unnamed: 0'
#column (the index written by to_csv) is dropped from each
response_df, skill_df, assistment_df = (
    pd.read_csv(path, sep='\t').drop('Unnamed: 0', axis=1)
    for path in ('correct.tsv', 'skill.tsv', 'assistment_id.tsv')
)

#The JSON file stores the skill numbers as strings; convert them back to int
with open('skill_dict.json', 'r', encoding='utf-8') as f:
    skill_dict = {name: int(number) for name, number in json.load(f).items()}

skill_num = len(skill_dict) + 1 # including 0

In [None]:
def one_hot(skill_matrix, vocab_size):
    '''
    One-hot encode a matrix of integer skill ids.

    params:
        skill_matrix: 2-D integer matrix (student, sequence position) of skill ids
        vocab_size: size of the vocabulary (length of every one-hot vector)
    returns:
        a ndarray with a shape like (student, sequence_len, vocab_size)
    '''
    num_students, seq_len = skill_matrix.shape[0], skill_matrix.shape[1]
    #Start from all zeros: (student, sequence, one-hot skill)
    encoded = np.zeros((num_students, seq_len, vocab_size))
    positions = np.arange(seq_len)
    #One student at a time: switch on the slot of the skill seen at each position
    for student, skills in enumerate(skill_matrix):
        encoded[student, positions, skills] = 1.
    return encoded

def dkt_one_hot(skill_matrix, response_matrix, vocab_size):
    '''
    Jointly one-hot encode (skill, response) pairs.

    params:
        skill_matrix: 2-D integer matrix (student, sequence position) of skill ids
        response_matrix: matrix of the same shape holding the 0/1 responses
        vocab_size: number of skill ids
    returns:
        a ndarray of shape (student, sequence_len, 2 * vocab_size) in which
        slot 2*skill + response is set to 1 for every position
    '''
    num_students, seq_len = skill_matrix.shape[0], skill_matrix.shape[1]
    #Start from all zeros: (student, sequence, 2 * vocab size)
    encoded = np.zeros((num_students, seq_len, 2 * vocab_size))
    positions = np.arange(seq_len)
    for student in range(num_students):
        #Each skill owns two adjacent slots: 2*skill for response 0, 2*skill + 1 for response 1
        hot_slots = 2 * skill_matrix[student] + response_matrix[student]
        encoded[student, positions, hot_slots] = 1.
    return encoded
#Function to preprocess the data
def preprocess(skill_df, response_df, skill_num):
    '''
    Turn the per-student skill and response tables into model-ready arrays.

    params:
        skill_df: frame whose first column is skipped and whose remaining
            columns hold integer skill ids, one row per student
        response_df: frame with the same layout holding the responses
        skill_num: vocabulary size used for the one-hot encodings
    returns:
        (skill_array, response_array, skill_response_array) where
        skill_array is one_hot(skills), response_array is the raw response
        matrix and skill_response_array is dkt_one_hot(skills, responses)
    '''
    #Skip the first column (student_id in the frames written by generate_datasets)
    skill_matrix = skill_df.iloc[:, 1:].values
    response_array = response_df.iloc[:, 1:].values
    #(student, sequence, skill one-hot)
    skill_array = one_hot(skill_matrix, skill_num)
    #(student, sequence, joint skill/response one-hot)
    skill_response_array = dkt_one_hot(skill_matrix, response_array, skill_num)
    #The earlier version also returned `masking_array`, which is never created
    #(NameError); the caller unpacks exactly these three values.
    return skill_array, response_array, skill_response_array
    

#Build the model inputs from the loaded frames; the result is unpacked into three arrays
skill_array, response_array, skill_response_array = preprocess(skill_df, response_df, skill_num)

In [None]:
import keras
from keras.layers import Input, Dense, LSTM, TimeDistributed, Lambda, multiply
from keras.models import Model
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

#Function to set up skill to skill model (input skills, output skill prediction)
def build_skill2skill_model(input_shape, lstm_dim=32, dropout=0.0):
    '''
    Build and compile an LSTM that predicts a skill distribution at every step.

    params:
        input_shape: (sequence_len, vocab_size) of the one-hot skill input
        lstm_dim: number of LSTM units
        dropout: dropout rate passed to the LSTM layer
    returns:
        the compiled Keras model (its summary is printed as a side effect)
    '''
    #Layer names use underscores: names with spaces can be rejected as scope
    #names by the TensorFlow backend, and build_dkt_model already uses this style.
    #The local is not called `input` so the Python builtin is not shadowed.
    skills_in = Input(shape=input_shape, name='input_skills')
    lstm = LSTM(lstm_dim,
                return_sequences=True,
                dropout=dropout,
                name='lstm_layer')(skills_in)
    #Softmax over the vocabulary at every time step
    output = TimeDistributed(Dense(input_shape[-1], activation='softmax'), name='probability')(lstm)
    model = Model(inputs=[skills_in], outputs=[output])
    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.0)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

def reduce_dim(x):
    '''Collapse the last axis of x to size 1 by taking its maximum.'''
    return K.max(x, axis=-1, keepdims=True)

#Skill and response
def build_dkt_model(input_shape, lstm_dim=32, dropout=0.0):
    '''
    Build and compile the two-input knowledge-tracing model.

    params:
        input_shape: (sequence_len, 2 * vocab_size) of the joint
            skill/response one-hot input
        lstm_dim: number of LSTM units
        dropout: dropout rate passed to the LSTM layer
    returns:
        the compiled Keras model taking [input_skills, next_skill_tested] and
        producing one value per time step (its summary is printed as a side effect)
    '''
    #Half of the joint encoding width = size of the plain skill one-hot
    num_skills = input_shape[-1] // 2

    input_skills = Input(shape=input_shape, name='input_skills')
    lstm = LSTM(lstm_dim,
                return_sequences=True,
                dropout=dropout,
                name='lstm_layer')(input_skills)
    #One sigmoid output per skill at every time step.
    #Layer names use underscores: names with spaces can be rejected as scope
    #names by the TensorFlow backend and were inconsistent with the other layers.
    dense = TimeDistributed(Dense(num_skills, activation='sigmoid'), name='probability_for_each')(lstm)

    #Second input: one-hot of the skill tested next, used to select its output
    skill_next = Input(shape=(input_shape[0], num_skills), name='next_skill_tested')
    merged = multiply([dense, skill_next], name='multiply')
    #Max over the skill axis keeps the single selected (non-zeroed) value
    reduced = Lambda(reduce_dim, output_shape=(input_shape[0], 1), name='reduce_dim')(merged)

    model = Model(inputs=[input_skills, skill_next], outputs=[reduced])
    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

#Both models consume steps 0..T-2 and are scored against steps 1..T-1, so the
#model sequence length is one less than the stored one (instead of a fixed 99)
seq_len = skill_array.shape[1] - 1

print('skill2skill')
skill2skill_model = build_skill2skill_model((seq_len, skill_num), lstm_dim=64)

print('dkt')
dkt_model = build_dkt_model((seq_len, 2 * skill_num), lstm_dim=64)

# train skill2skill: predict the next skill from the skills seen so far
skill2skill_model.fit(skill_array[:, 0:-1],
                      skill_array[:, 1:],
                      epochs=20,
                      batch_size=32,
                      shuffle=True,
                      validation_split=0.2)

# train dkt: the second input must be the plain one-hot of the NEXT skill
# (width skill_num, matching the model's 'next_skill_tested' input), not the
# joint skill/response encoding (width 2 * skill_num) that was passed before
dkt_model.fit([skill_response_array[:, 0:-1], skill_array[:, 1:]],
              response_array[:, 1:, np.newaxis],
              epochs=20,
              batch_size=32,
              shuffle=True,
              validation_split=0.2)

In [None]:
#Our code
#EarlyStopping is used below but was never imported anywhere in the notebook
from keras.callbacks import EarlyStopping

# NOTE(review): skill_array_train, skill_array_test and validation_set are not
# defined in any visible cell -- they presumably came from a train/test split
# cell that has since been removed; that cell must be restored before this runs.

# modified lstm_dim in different trials
skill2skill_model = build_skill2skill_model((99, skill_num), lstm_dim=64)

# added early stopping + increased number of epochs for later trials
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
skill2skill_model.fit(skill_array_train[:, 0:-1],
                      skill_array_train[:, 1:],
                      epochs=20,
                      batch_size=32,
                      shuffle=True,
                      validation_data=validation_set,
                      callbacks=[early_stopping])

# check accuracy of model on test set
testx = skill2skill_model.predict(skill_array_test[:, 0:-1])
testy = skill_array_test[:, 1:]

skill_predicted = np.argmax(testx, axis=2) # which skill was predicted
skill_gndtruth = np.argmax(testy, axis=2) # which skill was really next
pred_acc = (skill_predicted == skill_gndtruth) # if the prediction was correct

print('Accuracy: ', np.mean(pred_acc))

# percent correct by skill

testx = skill2skill_model.predict(skill_array_test[:, 0:-1])
testy = skill_array_test[:, 1:]

skill_predicted = np.argmax(testx, axis=2) # which skill was predicted
skill_gndtruth = np.argmax(testy, axis=2) # which skill was really next
pred_acc = (skill_predicted == skill_gndtruth) # if the prediction was correct

# Tally per skill with bincount instead of a Python loop over every
# (student, step) pair; minlength keeps one slot for every skill id
# number of times the skill appeared
num_occurrence_skill = np.bincount(skill_gndtruth.ravel(), minlength=skill_num).astype(float)
# number of times the skill was correctly predicted
num_corr_skill = np.bincount(skill_gndtruth.ravel(),
                             weights=pred_acc.ravel().astype(float),
                             minlength=skill_num)

# Skills that never occur give 0/0 = NaN on purpose (they are filtered out
# later), so only the warning is silenced
with np.errstate(divide='ignore', invalid='ignore'):
    skill_acc = num_corr_skill / num_occurrence_skill

plt.plot(num_occurrence_skill, skill_acc, 'o')
plt.xlabel('Number of times skill occurred')
plt.ylabel('Prediction Accurracy')
plt.grid(True) # a real boolean; the string 'on' relied on deprecated handling

# inverse dictionary to find skill names from values
skill_dict_inv = {v: k for k, v in skill_dict.items()}

k = 5 # top 5 easiest/hardest to predict

# easiest to predict skills
# argsort puts NaN (skills that never occurred) last, so the first num_valid
# entries are the real accuracies in ascending order. Slicing by num_valid
# also works when there are no NaNs at all; the old slice [-num_nans-k:-num_nans]
# is empty in that case.
num_nans = np.sum(np.isnan(skill_acc))
num_valid = skill_acc.size - num_nans
top_skill_index = skill_acc.argsort()[:num_valid][::-1][:k]
print('Easiest to predict')
print('Index: ', top_skill_index)
print('Accuracy: ', skill_acc[top_skill_index])
print('Num Occurrences: ', num_occurrence_skill[top_skill_index])
# iterate over what was actually found (may be fewer than k);
# an index without a name (e.g. 0) is printed as the index itself
for idx in top_skill_index:
    print(skill_dict_inv.get(int(idx), idx))

# hardest to predict skills
# disregard skills with fewer than 50 occurrences in test set
# work on a copy: masking skill_acc itself would change the "easiest" result
# the next time this cell is run
skill_acc_modified = skill_acc.copy()
skill_acc_modified[num_occurrence_skill < 50] = np.nan
num_valid_modified = np.sum(~np.isnan(skill_acc_modified))
bottom_skill_index = skill_acc_modified.argsort()[:num_valid_modified][:k]
print('\nHardest to predict')
print('Index: ',bottom_skill_index)
print('Accuracy: ', skill_acc[bottom_skill_index])
print('Num Occurrences: ',num_occurrence_skill[bottom_skill_index])
for idx in bottom_skill_index:
    print(skill_dict_inv.get(int(idx), idx))
  

#Importing the submodule explicitly guarantees sklearn.metrics is loaded; the
#previous `import sklearn` line was also indented, which is an IndentationError
import sklearn.metrics

# NOTE(review): training_mask and testing_mask are not defined in any visible
# cell -- presumably boolean/index masks over students from a removed split cell.

#Inputs are [joint skill/response history, one-hot of the next skill];
#targets are the responses to the next problem
validation_set_dkt = ([skill_response_array[testing_mask, 0:-1], skill_array[testing_mask, 1:]],
                      response_array[testing_mask, 1:, np.newaxis])
dkt_model.fit([skill_response_array[training_mask, 0:-1], skill_array[training_mask, 1:]],
              response_array[training_mask, 1:, np.newaxis],
              epochs=20,
              batch_size=32,
              shuffle=True,
              validation_data=validation_set_dkt)

#Predicted values
response_predict = dkt_model.predict([skill_response_array[testing_mask, 0:-1], skill_array[testing_mask, 1:]])
#Actual values
response_actual = response_array[testing_mask, 1:]

#Find AUC over every (student, step) pair, flattened to 1-D
score = sklearn.metrics.roc_auc_score(np.ravel(response_actual), np.ravel(response_predict))
print(score)

#Build a fresh DKT model with a wider LSTM (128 units)
dkt_model = build_dkt_model((99, 2 * skill_num), lstm_dim=128)

#Train for up to 50 epochs against the held-out validation set.
#Early stopping is intentionally left out of this run; to enable it, pass
#callbacks=[EarlyStopping(monitor='val_loss', patience=2)] to fit().
dkt_model.fit(
    x=[skill_response_array[training_mask, 0:-1], skill_array[training_mask, 1:]],
    y=response_array[training_mask, 1:, np.newaxis],
    epochs=50,
    batch_size=32,
    shuffle=True,
    validation_data=validation_set_dkt,
)

# Word2Vec

In [None]:
#Skills
#Load in skills dataframe
skill_df = pd.read_csv('skill.tsv', sep='\t').drop('Unnamed: 0', axis=1)
#Load the JSON mapping (skill name -> number string) and invert it so the
#number string is the key and the skill name is the value.
#Kept under its own name so the name -> int `skill_dict` used by the DKT cells
#is not overwritten.
with open('skill_dict.json', 'r', encoding='utf-8') as f:
    skill_id_to_name = {number: str(name) for name, number in json.load(f).items()}

#Read out the "sentences" (first column is skipped; values become strings)
sentences = skill_df.iloc[:, 1:].values.astype(str)
sentences = sentences.tolist()

#Each student is a "sentence", each skill is a "word"
#size = dimensionality of feature vectors
#window = max distance between current and predicted word within a sentence
#min_count = minimum number of occurrences within dataset
#workers = number of threads used
#sg = 0 (CBOW, default); = 1 (skip-gram)
#NOTE(review): size/iter/wv.vocab are the gensim 3.x names; gensim 4 renamed
#them (vector_size/epochs/wv.key_to_index) -- confirm the installed version.
model = Word2Vec(sentences, size=200, window=10, min_count=10, workers=4, sg=1, iter=100)

#Vocabulary words (skill numbers as strings). Stored as `skill_vocab` so the
#integer `skill_num` used by the DKT cells is not clobbered.
skill_vocab = list(model.wv.vocab)
#Look the vectors up through model.wv; indexing the model itself is deprecated
skill_vec = model.wv[skill_vocab]

#Associate each vocabulary word with its readable skill name (None if unknown)
skill_name = [skill_id_to_name.get(word) for word in skill_vocab]

#Fixed random_state so the 2-D layout is reproducible between runs
tsne = TSNE(perplexity=30, random_state=0)
skill_tsne = tsne.fit_transform(skill_vec.astype(float)) #Run tsne

#Save as a tsv file for d3-scatterplot
tsne_save = pd.DataFrame({'x': skill_tsne[:, 0],
                          'y': skill_tsne[:, 1],
                          'skill': skill_name})
tsne_save.to_csv('../d3-scatterplot/tsne_skills.tsv', sep='\t', index=False, columns=['x', 'y', 'skill'])

In [None]:
#Assistments
# Load in Assistments ID dataframe
assistment_df = pd.read_csv('assistment_id.tsv', sep='\t').drop('Unnamed: 0', axis=1)
sentences = assistment_df.iloc[:, 1:].values.astype(str)
sentences = sentences.tolist()

# Load entire Assistment dataframe to find the skills associated with each ID
filename = 'skill_builder_data_corrected.csv'
df = pd.read_csv(filename, encoding='ISO-8859-1', low_memory=False)
df = df[(df['original'] == 1) & (df['attempt_count'] == 1) & ~(df['skill_name'].isnull())]

#Each student is a "sentence", each assistment id is a "word"
#size = dimensionality of feature vectors
#window = max distance between current and predicted word within a sentence
#min_count = minimum number of occurrences within dataset
#workers = number of threads used
#sg = 0 (CBOW, default); = 1 (skip-gram)
#NOTE(review): size/iter/wv.vocab are the gensim 3.x names; gensim 4 renamed
#them (vector_size/epochs/wv.key_to_index) -- confirm the installed version.
model = Word2Vec(sentences, size=200, window=10, min_count=10, workers=4, sg=1, iter=30)

assist_num = model.wv.vocab #Names of the words (numbers)
#Look the vectors up through model.wv; indexing the model itself is deprecated
assist_vec = model.wv[list(assist_num)]

#Fixed random_state so the 2-D layout is reproducible between runs
tsne = TSNE(perplexity=30, random_state=0)
assist_tsne = tsne.fit_transform(assist_vec.astype(float)) #Run tsne

#First skill listed for every assistment, computed once instead of filtering
#the whole frame again for each vocabulary word
first_skill_by_assistment = (
    df.drop_duplicates(subset='assistment_id')
      .set_index('assistment_id')['skill_name']
      .to_dict()
)
assist_skill = [first_skill_by_assistment[int(word)] for word in assist_num]

#Save as a tsv file for d3-scatterplot
tsne_save = pd.DataFrame({'x': assist_tsne[:, 0],
                          'y': assist_tsne[:, 1],
                          'skill': assist_skill})
tsne_save.to_csv('../d3-scatterplot/tsne_assist.tsv', sep='\t', index=False, columns=['x', 'y', 'skill'])

In [None]:
#Define
def _sequences_to_frame(student_ids, sequences, prefix):
    """Pack variable-length per-student sequences into one wide object-dtype frame."""
    width = max((len(seq) for seq in sequences), default=0)
    columns = ['student_id'] + [prefix + str(i) for i in range(width)]
    #Shorter sequences are right-padded with NaN so every row has the same width
    rows = [[student] + list(seq) + [np.nan] * (width - len(seq))
            for student, seq in zip(student_ids, sequences)]
    return pd.DataFrame(rows, columns=columns, dtype=object)


def generate_datasets(df, min_responses=50, max_responses=None):
    """
    Build one-row-per-student response, skill and assistment tables.

    params:
        df: response log with 'user_id', 'skill_name', 'correct' and
            'assistment_id' columns
        min_responses: a student is kept only if they have MORE than this
            many rows (default 50, the previously hard-coded threshold)
        max_responses: optional cap on the number of leading rows kept per
            student; None (default) keeps all of them, as before
    returns:
        (skill_dict, response_df, skill_df, assistment_df) where skill_dict
        maps each skill name to an integer starting at 1 and each frame has
        a 'student_id' column followed by r0.., s0.. or a0.. columns.
        Students with fewer rows than the longest sequence are padded with
        NaN. If no student qualifies, the frames are empty.
    """
    #Skill name -> integer id, numbered from 1 in order of first appearance
    skill_names = df['skill_name'].unique()
    skill_dict = dict(zip(skill_names, np.arange(len(skill_names), dtype='int32') + 1))

    student_ids = []
    response_seqs, skill_seqs, assistment_seqs = [], [], []
    #sort=False keeps students in order of first appearance; one pass over the
    #groups replaces a full-frame boolean filter per user
    for user, sub_df in df.groupby('user_id', sort=False):
        if len(sub_df) <= min_responses:
            continue
        if max_responses is not None:
            sub_df = sub_df.iloc[:max_responses]
        student_ids.append(user)
        response_seqs.append(sub_df['correct'].tolist())
        skill_seqs.append([skill_dict[name] for name in sub_df['skill_name']])
        assistment_seqs.append(sub_df['assistment_id'].tolist())

    #Columns are laid out explicitly (student_id first, then 0..n-1 in order)
    #rather than relying on how pd.concat orders non-aligned columns
    response_df = _sequences_to_frame(student_ids, response_seqs, 'r')
    skill_df = _sequences_to_frame(student_ids, skill_seqs, 's')
    assistment_df = _sequences_to_frame(student_ids, assistment_seqs, 'a')

    #Return
    return skill_dict, response_df, skill_df, assistment_df
    
#Build the skill dictionary and the per-student response / skill / assistment
#frames from the merged frame df3
skill_dict, response_df, skill_df, assistment_df = generate_datasets(df3)
    

In [None]:
#Preview the first rows with the notebook's rich table display instead of a
#plain-text print of the whole frame
assistment_df.head()

In [None]:
#Save the data files
with open('skill_dict.json', 'w', encoding='utf-8') as f:
    #Skill ids are written as strings (presumably because the numpy integers
    #produced by generate_datasets are not JSON-serialisable)
    to_dump_dict = {key: str(value) for key, value in skill_dict.items()}
    json.dump(to_dump_dict, f)
#One tab-separated file per table; the index is written too, which is why the
#loading cells drop 'Unnamed: 0'
for frame, path in ((response_df, 'correct.tsv'),
                    (skill_df, 'skill.tsv'),
                    (assistment_df, 'assistment_id.tsv')):
    frame.to_csv(path, sep='\t')
print('Done')

In [None]:
#Reload the saved assistment table and drop the index column written by to_csv
assistment_df = pd.read_csv('assistment_id.tsv', sep='\t').drop('Unnamed: 0', axis=1)


In [None]:
#Get rid of the user id by column name: iloc[:, :-1] only did that when
#'student_id' happened to be the last column, whereas the other cells assume
#it is the first one
sentences = assistment_df.drop(columns='student_id').values.astype(str)
sentences = sentences.tolist()

In [None]:
#Show a short preview instead of dumping every sentence into the notebook output
print(len(sentences), 'sentences; first 10 tokens of the first one:',
      sentences[0][:10] if sentences else [])