In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from gensim.models import Word2Vec
from sklearn.manifold import TSNE 
from IPython.core.debugger import set_trace
from sklearn import cluster

In [None]:
# Load the three tab-separated tables and drop the 'Unnamed: 0' column from each.
response_df, skill_df, assistment_df = (
    pd.read_csv(tsv_path, sep='\t').drop('Unnamed: 0', axis=1)
    for tsv_path in ('correct.tsv', 'skill.tsv', 'assistment_id.tsv')
)

# Lookup table read from JSON; every value is coerced to int.
with open('skill_dict.json', 'r', encoding='utf-8') as f:
    skill_dict = {entry: int(code) for entry, code in json.load(f).items()}

skill_num = len(skill_dict) + 1  # one extra slot so that id 0 is included
# TODO: students have unequal numbers of problems -- is that allowable in DKT?

# Keep only the rows whose num_resp exceeds 100 (copied so later edits do not
# touch the full tables).
skill_df100 = skill_df.loc[skill_df.num_resp > 100].copy()
assistment_df100 = assistment_df.loc[assistment_df.num_resp > 100].copy()
response_df100 = response_df.loc[response_df.num_resp > 100].copy()

In [None]:
# Re-encode the raw assistment ids as dense labels 0..num_assist-1 and pick one
# skill id per assistment.
#
# A single np.unique call replaces the previous per-id loop. That loop rescanned
# the whole matrix once per unique id and relabelled it in place while still
# comparing against raw ids, so a raw id equal to an already-assigned label
# (possible e.g. with negative ids) would have been merged with it.
assistment_re_df100 = assistment_df100.copy()
raw_assistment_ids = assistment_df100.iloc[:, 2:103].values.astype(int)
skill_enc_mat = skill_df100.iloc[:, 2:103].values.astype(int)
if skill_enc_mat.shape != raw_assistment_ids.shape:
    raise ValueError('skill and assistment matrices must have the same shape')

# assistment_enc2        : sorted unique raw ids
# first_positions        : flat (row-major) position of each id's first occurrence
# dense_labels           : for every cell, the index of its id in assistment_enc2
# assistment_enc2_counts : how often each unique id occurs
assistment_enc2, first_positions, dense_labels, assistment_enc2_counts = np.unique(
    raw_assistment_ids.ravel(),
    return_index=True, return_inverse=True, return_counts=True)
num_assist = len(assistment_enc2)  # Number of unique assistments
assistment_label = np.arange(num_assist)  # Labels 0..num_assist-1

# Matrix of dense labels, plus a flat view of it (as before, assistment_enc
# ends up holding the encoded labels because it aliases the matrix).
assistment_enc_mat = dense_labels.reshape(raw_assistment_ids.shape).astype(int)
assistment_enc = assistment_enc_mat.reshape(-1)

# Skill associated with each assistment: the skill found at the assistment's
# first occurrence in row-major order (same choice as the previous loop).
skill_enc2 = skill_enc_mat.ravel()[first_positions].reshape(num_assist, 1)

assistment_re_df100.iloc[:, 2:103] = assistment_enc_mat  # Copy over to data frame

# DKT

In [None]:
def one_hot(skill_matrix, vocab_size):
    '''
    One-hot encode a 2-D matrix of integer ids.

    params:
        skill_matrix: 2-D integer matrix indexed as (student, sequence position)
        vocab_size: size of the vocabulary (length of every one-hot vector)
    returns:
        a float ndarray of shape (student, sequence_len, vocab_size) in which
        result[s, t, skill_matrix[s, t]] == 1 and every other entry is 0
    '''
    num_students = skill_matrix.shape[0]
    seq_len = skill_matrix.shape[1]
    result = np.zeros((num_students, seq_len, vocab_size))
    # Broadcast a (student, 1) column of row numbers against the (seq_len,)
    # positions so that every (student, position, id) triple is set in one go.
    student_idx = np.arange(num_students)[:, np.newaxis]
    position_idx = np.arange(seq_len)
    result[student_idx, position_idx, skill_matrix] = 1.
    return result

def dkt_one_hot(skill_matrix, response_matrix, vocab_size):
    '''
    Jointly one-hot encode (id, response) pairs.

    Every id owns two adjacent slots on the last axis: slot 2*id is set when
    the response is 0 and slot 2*id + 1 when the response is 1.

    params:
        skill_matrix: 2-D integer matrix indexed as (student, sequence position)
        response_matrix: integer matrix of responses, indexed like skill_matrix
        vocab_size: number of distinct ids; the last axis has 2 * vocab_size slots
    returns:
        a float ndarray of shape (student, sequence_len, 2 * vocab_size)
    '''
    num_students = skill_matrix.shape[0]
    seq_len = skill_matrix.shape[1]
    skill_response_array = np.zeros((num_students, seq_len, 2 * vocab_size))
    positions = np.arange(seq_len)
    for student, skills in enumerate(skill_matrix):
        # Column to switch on at each position for this student
        hot_slots = 2 * skills + response_matrix[student]
        skill_response_array[student, positions, hot_slots] = 1.
    return skill_response_array


def preprocess(skill_df, response_df, skill_num):
    '''
    Convert the id and response data frames into arrays for the models.

    The first two columns of both frames are skipped; the remaining columns
    are cast to int and treated as the per-student sequences.

    params:
        skill_df: data frame of integer ids, one row per student
        response_df: data frame of responses, laid out like skill_df
        skill_num: vocabulary size used for the one-hot encodings
    returns:
        (skill_array, response_array, skill_response_array) where
        skill_array is one_hot(ids), response_array is the raw integer
        response matrix and skill_response_array is dkt_one_hot(ids, responses)
    '''
    id_matrix = skill_df.iloc[:, 2:].values.astype(int)
    response_array = response_df.iloc[:, 2:].values.astype(int)
    return (
        one_hot(id_matrix, skill_num),
        response_array,
        dkt_one_hot(id_matrix, response_array, skill_num),
    )
    

# Encode the re-labelled assistments and the responses. Columns 0..102 are
# passed in; preprocess skips the first two, leaving 101 sequence columns.
assist_array, response_array, assist_response_array = preprocess(
    assistment_re_df100.iloc[:, :103],
    response_df100.iloc[:, :103],
    num_assist,
)

In [None]:
import keras
from keras.layers import Input, Dense, LSTM, TimeDistributed, Lambda, multiply
from keras.models import Model
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.callbacks import EarlyStopping

def build_skill2skill_model(input_shape, lstm_dim=32, dropout=0.0):
    '''
    Build and compile the skill-to-skill model (input skills, output skill prediction).

    An LSTM reads the input sequence and a time-distributed softmax layer emits,
    at every step, a distribution with input_shape[-1] entries.

    params:
        input_shape: shape of one input sequence, passed to Input(shape=...)
        lstm_dim: number of LSTM units
        dropout: dropout rate handed to the LSTM layer
    returns:
        the compiled Keras model (Adam, categorical cross-entropy, accuracy);
        its summary is printed as a side effect
    '''
    skills_in = Input(shape=input_shape, name='input_skills')
    hidden_seq = LSTM(lstm_dim,
                      return_sequences=True,
                      dropout=dropout,
                      name='lstm_layer')(skills_in)
    softmax_head = TimeDistributed(Dense(input_shape[-1], activation='softmax'),
                                   name='probability')
    model = Model(inputs=[skills_in], outputs=[softmax_head(hidden_seq)])
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.0)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

def reduce_dim(x):
    '''Collapse the last axis of x to length 1 by taking its maximum (the axis is kept).'''
    return K.max(x, axis=-1, keepdims=True)

def build_dkt_model(input_shape, lstm_dim=32, dropout=0.0):
    '''
    Build the skill-and-response (DKT) model.

    params:
        input_shape: (sequence length, width) of the 'input_skills' input; the
            dense output and the 'next_skill_tested' input are half that width
        lstm_dim: number of LSTM units
        dropout: dropout rate handed to the LSTM layer
    returns:
        (model, model2)
        model  -- takes [input_skills, next_skill_tested] and outputs, per time
                  step, a single value: the maximum over the element-wise
                  product of the sigmoid output and next_skill_tested. It is
                  compiled with Adam and binary cross-entropy, and its summary
                  is printed.
        model2 -- takes input_skills only and outputs the full sigmoid vector
                  per time step. It is built on the same layers as model and
                  is returned uncompiled.
    '''
    # Encoded (skill, correctness) sequence
    input_skills = Input(shape=input_shape, name='input_skills')
    lstm_out = LSTM(lstm_dim,
                    return_sequences=True,
                    dropout=dropout,
                    name='lstm_layer')(input_skills)

    # One sigmoid unit per skill at every time step
    per_skill_head = TimeDistributed(Dense(int(input_shape[-1] / 2), activation='sigmoid'),
                                     name='probability_for_each')
    dense = per_skill_head(lstm_out)

    # Second input: identifies the skill that is actually tested next
    skill_next = Input(shape=(input_shape[0], int(input_shape[1] / 2)), name='next_skill_tested')
    # Mask the per-skill outputs with that input, then keep one value per step
    merged = multiply([dense, skill_next], name='multiply')
    reduced = Lambda(reduce_dim, output_shape=(input_shape[0], 1), name='reduce_dim')(merged)

    model = Model(inputs=[input_skills, skill_next], outputs=[reduced])
    model2 = Model(inputs=[input_skills], outputs=[dense])

    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.0)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    return model, model2

In [None]:
print('dkt')
dkt_model, dkt_model2 = build_dkt_model((100, 2 * num_assist), lstm_dim=64)

# Inputs: the (assistment, response) encoding with the last step dropped, and
# the assistment encoding shifted by one step. Target: the response at that
# shifted step, with a trailing axis of length 1 added.
dkt_model.fit(
    x=[assist_response_array[:, :-1], assist_array[:, 1:]],
    y=response_array[:, 1:, np.newaxis],
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    shuffle=True,
)

## Decrease amount of data for clustering

In [None]:
# Select the most frequently occurring assistments for each skill.
# (Original note on the data set: about 106 skills and 9500 or so assistments.)
skill_un = np.unique(skill_enc2)  # Unique skill ids that are actually in use
skill_num = len(skill_un)  # Number of unique ones
top_num = 30  # Assistments to keep per skill (fewer if a skill has fewer)

countA = np.zeros(skill_num)  # How many assistments were kept for each skill

# Per-skill selections are collected in a list and concatenated once at the
# end. Repeated np.append on a list re-copied the data on every iteration and
# produced a float64 array, which cannot be used as an index without a cast.
top_per_skill = []

for ii in range(skill_num):

    # Labels (0..num_assist-1) of the assistments associated with this skill
    assist_skill = np.flatnonzero(skill_enc2[:, 0] == skill_un[ii])
    assist_skill_count = assistment_enc2_counts[assist_skill]  # Their occurrence counts

    # Keep at most top_num entries, highest count first
    top_num_actual = min(top_num, len(assist_skill_count))
    top_ind = assist_skill_count.argsort()[::-1][:top_num_actual]

    assist_top = assist_skill[top_ind]  # Labels of the selected assistments
    countA[ii] = top_num_actual
    top_per_skill.append(assist_top)

# Integer array of all selected assistment labels, grouped by skill
if top_per_skill:
    assist_index_test = np.concatenate(top_per_skill)
else:
    assist_index_test = np.empty(0, dtype=int)
  


In [None]:
# Selected assistments as integer labels (not a logical index), so that they
# can be used to index arrays directly.
assist_index2 = assist_index_test.astype(int)
num_assist2 = assist_index2.shape[0]
num_assist2

# Clustering

## Create influence matrix

In [None]:
seq_len = 100
q_matrix = np.zeros((num_assist2, num_assist))
for row, assist_label in enumerate(assist_index2):
    # Probe sequence: the same assistment, answered correctly, at every step
    assist_response_array2 = np.zeros((1, seq_len, 2 * num_assist))
    assist_response_array2[0, :, 2 * assist_label + 1] = 1.
    # Keep only the per-assistment prediction emitted at the first time step
    q_matrix[row] = dkt_model2.predict(assist_response_array2)[0, 0]

# Restrict the columns to the selected assistments as well (square sub-matrix)
q_matrix_subset = q_matrix[:, assist_index2]

In [None]:
# Show the influence sub-matrix as an image: row = probed assistment,
# column = assistment whose predicted value is shown.
plt.imshow(q_matrix_subset)

In [None]:
# Scale every column so that it sums to 1 (the column totals broadcast across rows)
q_matrix_sum = q_matrix_subset.sum(axis=0)
q_matrix_n = q_matrix_subset / q_matrix_sum[np.newaxis, :]
plt.imshow(q_matrix_n)
# Symmetrise by averaging the matrix with its transpose
q_matrix_sym = (q_matrix_n + q_matrix_n.T) / 2

## Self-written spectral clustering

In [None]:
import networkx as nx

# Weighted graph whose adjacency matrix is the symmetrised influence matrix.
# from_numpy_matrix was removed in networkx 3.0, so prefer from_numpy_array and
# fall back only on installations that do not provide it.
try:
    G = nx.from_numpy_array(q_matrix_sym)
except AttributeError:
    G = nx.from_numpy_matrix(q_matrix_sym)

L1 = nx.linalg.laplacian_matrix(G)
L1_nd = L1.toarray()
S1 = nx.linalg.laplacian_spectrum(G)

# The Laplacian is symmetric, so use eigh: it returns real eigenvalues in
# ascending order with matching eigenvector columns. np.linalg.eig gives no
# ordering guarantee (and may return complex values), which made the later
# "first featureLen columns" selection pick arbitrary eigenvectors.
eigVal, eigVec = np.linalg.eigh(L1_nd)

In [None]:
featureLen = 10  # number of eigenvector columns kept as features
featureVec = eigVec[:, :featureLen]

In [None]:
# Elbow plot: within-cluster sum of squared errors for 1..NCr clusters
NCr = 64
sse = np.zeros(NCr)
for NC in range(NCr):  # NC + 1 is the number of clusters for this K-means run
    kmeanse = cluster.KMeans(n_clusters=NC + 1).fit(featureVec)
    labels, centers = kmeanse.labels_, kmeanse.cluster_centers_
    for i, center in enumerate(centers):
        # Squared distance of every member of cluster i to its centre
        sse[NC] += np.sum((featureVec[labels == i] - center) ** 2)

plt.plot(np.arange(1, NCr + 1), sse)

In [None]:
NC = 32  # number of clusters

# K-means starts from random centres; the seed is fixed so that the labels
# (which a later cell writes to disk) are the same on every run.
kmeansdkt = cluster.KMeans(n_clusters=NC, random_state=0).fit(featureVec)
labelsdkt = kmeansdkt.labels_

centers = kmeansdkt.cluster_centers_  # shape: (number of centres, feature dimension)

# Cluster sizes
pd.Series(labelsdkt).value_counts()


## Built-in clustering

In [None]:
# q_matrix_sym is passed as a precomputed affinity matrix. The model is fitted
# once (previously fit() and then fit_predict() each ran the whole clustering)
# and seeded so that the saved labels are reproducible.
spectral = cluster.SpectralClustering(n_clusters=32, affinity='precomputed', random_state=0)
temp2 = spectral.fit(q_matrix_sym)  # fit() returns the estimator itself
dkt_clusters2 = temp2.labels_

In [None]:
# Number of assistments assigned to each cluster by the built-in spectral clustering
pd.Series(dkt_clusters2).value_counts()


In [None]:
# Convert the dict to an array indexed by skill id.
d2 = {value: key for key, value in skill_dict.items()}  # id -> dict key

# Size the table from the ids that are actually present (in the dict and in
# skill_enc2) instead of a hard-coded 107, so a lookup can never run past the
# end. Ids without a dict entry keep their own number as a placeholder.
largest_used_id = int(skill_enc2.max()) if skill_enc2.size else -1
table_len = max(max(d2, default=-1), largest_used_id) + 1
skill_list = list(range(table_len))
for key, value in d2.items():
    skill_list[int(key)] = value

skill_list = np.array(skill_list)

# Select the skill labels that are actually used
skill_enc3 = skill_enc2[assist_index2]  # skill id of every selected assistment
skill_list_enc2 = skill_list[skill_enc2]  # Convert to words (all assistments)
skill_list_enc3 = skill_list[skill_enc3[:, 0]]  # Convert to words (selected assistments)
assistment_enc3 = assistment_enc2[assist_index2]  # raw ids of the selected assistments

# TSNE

In [None]:
featureLen = 10  # number of eigenvector columns used as features
featureVec = eigVec[:, :featureLen]

# t-SNE is stochastic; the seed is fixed so the exported coordinates are
# reproducible from run to run.
tsne = TSNE(perplexity=30, random_state=0)
assist_tsne = tsne.fit_transform(featureVec.astype(float))  # Run tsne

# Export the 2-D coordinates together with the skill of each point
tsne_save = pd.DataFrame({'x': assist_tsne[:, 0],
                          'y': assist_tsne[:, 1],
                          'skill': skill_list_enc3})
tsne_save.to_csv('../d3-scatterplot/tsne_dkt_assist.tsv', sep='\t', index=False, columns=['x', 'y', 'skill'])

In [None]:
# Save the skill names, raw assistment ids and built-in spectral-clustering
# labels of the selected assistments, each in its own .npz archive.
for npz_path, arrays in (
    ('../data/skills_dkt_fixed.npz', {'skills_dkt': skill_list_enc3}),
    ('../data/assist_dkt_fixed.npz', {'assist_dkt': assistment_enc3}),
    ('../data/labels_dkt_fixed.npz', {'labels_dkt': dkt_clusters2}),
):
    np.savez(npz_path, **arrays)

In [None]:
# Save the K-means labels and the feature vectors they were computed from.
for npz_path, arrays in (
    ('../data/labels_dkt_self.npz', {'labels_dkt': labelsdkt}),
    ('../data/vectors_dkt.npz', {'vectors_dkt': featureVec}),
):
    np.savez(npz_path, **arrays)