In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from gensim.models import Word2Vec
from sklearn.manifold import TSNE 


Using TensorFlow backend.


# Visualizing Skills

In [35]:
# Load the skills dataframe; the 'Unnamed: 0' column is dropped
# (presumably a saved index from an earlier to_csv -- TODO confirm).
skill_df = pd.read_csv('skill.tsv', sep='\t').drop('Unnamed: 0', axis=1)

# Load the dictionary associating skill names with skill numbers and invert it,
# so that the skill number is the key and the readable skill name is the value.
with open('skill_dict.json', 'r', encoding='utf-8') as f:
    name_to_num = json.load(f)

# Every later lookup uses the number as a *string* (word2vec vocab words are strings),
# so normalise the keys to str here; otherwise integer ids in the JSON would make
# every skill_dict.get(...) silently return None.
skill_dict = {str(num): str(name) for name, num in name_to_num.items()}

In [36]:
# Build the word2vec "sentences": one list of string tokens per row of skill_df,
# skipping the first column.
sentences = skill_df.iloc[:, 1:].values.astype(str).tolist()

In [37]:
# Train word2vec on the skill sequences: each student is a "sentence", each skill a "word".
# Hyper-parameters:
#   size      - dimensionality of the feature vectors
#   window    - max distance between current and predicted word within a sentence
#   min_count - minimum number of occurrences within the dataset
#   workers   - number of threads used
#   sg        - 0 = CBOW (default), 1 = skip-gram
#   iter      - value passed through as the `iter` argument of Word2Vec
skill_w2v_params = dict(size=200, window=10, min_count=10, workers=4, sg=1, iter=100)
model = Word2Vec(sentences, **skill_w2v_params)

In [38]:
# Vocabulary kept by word2vec; its keys are the skill numbers as strings.
skill_num = model.wv.vocab
# Stack one embedding per vocab word, in the vocab's iteration order.
# Index through model.wv: indexing the model object directly (model[...]) is the
# deprecated spelling of the same lookup.
skill_vec = model.wv[list(skill_num)]

# Readable name for each vocab word, in the same order as the rows of skill_vec.
# Words missing from skill_dict yield None.
skill_name = [skill_dict.get(k) for k in skill_num]

In [39]:
# Instantiate the t-SNE model (parameters can be changed here).
# random_state is pinned so the layout is the same on every run of this cell.
tsne = TSNE(perplexity=30, random_state=0)
# Project the skill embeddings with t-SNE (vectors are cast to Python float / float64 first).
skill_tsne = tsne.fit_transform(skill_vec.astype(float))

In [40]:
# Write the skill t-SNE coordinates plus their labels as a TSV file for d3-scatterplot.
xs, ys = skill_tsne[:, 0], skill_tsne[:, 1]
tsne_save = pd.DataFrame({'x': xs, 'y': ys, 'skill': skill_name})
tsne_save.to_csv(
    '../d3-scatterplot/tsne_skills.tsv',
    sep='\t',
    index=False,
    columns=['x', 'y', 'skill'],
)

# Visualizing Assistments

In [82]:
# Read the assistment-id table and drop the 'Unnamed: 0' column.
assistment_raw = pd.read_csv('assistment_id.tsv', sep='\t')
assistment_df = assistment_raw.drop('Unnamed: 0', axis=1)

# Every row (minus its first column) becomes one "sentence" of string tokens.
assistment_tokens = assistment_df.iloc[:, 1:].values.astype(str)
sentences = assistment_tokens.tolist()

In [83]:
# Read the full Assistment dataset; it is used to find the skill attached to each assistment id.
filename = 'skill_builder_data_corrected.csv'
full_df = pd.read_csv(filename, encoding='ISO-8859-1', low_memory=False)

# Keep only rows with original == 1, attempt_count == 1 and a non-null skill_name.
mask_original = full_df['original'] == 1
mask_attempt = full_df['attempt_count'] == 1
mask_has_skill = full_df['skill_name'].notnull()
df = full_df[mask_original & mask_attempt & mask_has_skill]

In [84]:
# Train word2vec on the assistment sequences: each row of assistment_df is a "sentence"
# (presumably one student per row, as in the skills section -- verify) and each
# assistment id is a "word".
# Hyper-parameters:
#   size      - dimensionality of the feature vectors
#   window    - max distance between current and predicted word within a sentence
#   min_count - minimum number of occurrences within the dataset
#   workers   - number of threads used
#   sg        - 0 = CBOW (default), 1 = skip-gram
#   iter      - value passed through as the `iter` argument of Word2Vec
# NOTE: this rebinds `model`, replacing the skills model trained earlier.
assist_w2v_params = dict(size=200, window=10, min_count=10, workers=4, sg=1, iter=30)
model = Word2Vec(sentences, **assist_w2v_params)

In [85]:
# Vocabulary kept by word2vec; its keys are the assistment ids as strings.
assist_num = model.wv.vocab
# Stack one embedding per vocab word, in the vocab's iteration order.
# Index through model.wv: indexing the model object directly (model[...]) is the
# deprecated spelling of the same lookup.
assist_vec = model.wv[list(assist_num)]

# Instantiate the t-SNE model (parameters can be changed here).
# random_state is pinned so the layout is the same on every run of this cell.
tsne = TSNE(perplexity=30, random_state=0)
# Project the assistment embeddings with t-SNE (vectors are cast to Python float / float64 first).
assist_tsne = tsne.fit_transform(assist_vec.astype(float))

In [89]:
# Label every embedded assistment with the first skill_name listed for it in df.
# The id -> skill lookup is built once, rather than filtering the whole frame for
# every vocab entry.
first_rows = df.drop_duplicates(subset='assistment_id')  # keeps the first row per id
first_skill_by_id = dict(zip(first_rows['assistment_id'].tolist(),
                             first_rows['skill_name'].tolist()))

# Vocab keys are strings, so cast them back to int to match the assistment_id column.
# An id that does not occur in df raises KeyError.
assist_skill = [first_skill_by_id[int(k)] for k in assist_num]


In [93]:
# Write the assistment t-SNE coordinates plus their skill labels as a TSV file for d3-scatterplot.
xs, ys = assist_tsne[:, 0], assist_tsne[:, 1]
tsne_save = pd.DataFrame({'x': xs, 'y': ys, 'skill': assist_skill})
tsne_save.to_csv(
    '../d3-scatterplot/tsne_assist.tsv',
    sep='\t',
    index=False,
    columns=['x', 'y', 'skill'],
)

In [87]:
# Sanity check: (number of assistments kept in the word2vec vocab, embedding dimension)
assist_vec.shape

(1870, 200)

In [92]:
# Count how many distinct skill labels were attached to the embedded assistments.
skill_labels = np.array(assist_skill)
dfskills = pd.DataFrame(skill_labels)
dfskills[0].nunique()

34

In [88]:
# Sanity check: (number of skills kept in the word2vec vocab, embedding dimension)
skill_vec.shape

(80, 200)

In [51]:
# Peek at the assistment-id matrix (first column excluded), i.e. the values the "sentences" were built from
assistment_df.values[:,1:]

array([[33139, 33150, 52640, ..., 41094, 50138, 41095],
       [33110, 33172, 33174, ..., 36872, 36949, 36858],
       [33168, 33112, 31968, ..., 36537, 33168, 41674],
       ..., 
       [55629, 55660, 55643, ..., 34348, 47967, 48057],
       [55648, 55622, 55641, ..., 51042, 34516, 34413],
       [34062, 34021, 34054, ..., 34735, 34654, 34721]])

In [100]:
# Count the distinct values in the skill table (first column excluded):
# flatten the matrix into a single column, then count its unique entries.
skill_grid = skill_df.values[:, 1:]
ar = skill_grid.reshape(skill_grid.size, 1)
dfar = pd.DataFrame(ar)
dfar[0].nunique()

94

In [91]:
# Label every embedded assistment with the skill found at the same (row, column)
# position in skill_df as the assistment's first occurrence in assistment_df.
assist_vals = assistment_df.values[:, 1:]
skill_vals = skill_df.values[:, 1:]

# Locate the first occurrence (row-major order) of every assistment id in one pass,
# instead of rebuilding and scanning the whole matrix for each vocab entry.
uniq_ids, first_flat = np.unique(assist_vals, return_index=True)
rows, cols = np.unravel_index(first_flat, assist_vals.shape)
skill_num_by_assist = dict(zip(uniq_ids.tolist(), skill_vals[rows, cols].tolist()))

# Vocab keys are strings: cast to int to look up the id, then back to str because
# skill_dict is queried with string skill numbers. Unknown skill numbers yield None;
# an assistment id absent from assistment_df raises KeyError.
assist_skill = [skill_dict.get(str(skill_num_by_assist[int(k)])) for k in assist_num]

In [68]:
# Dump the label list to eyeball the mapping (prints every entry, so the output is long)
print(assist_skill)

['Effect of Changing Dimensions of a Shape Prportionally', 'Ordering Fractions', 'Ordering Integers', 'Scatter Plot', 'Circle Graph', 'Circle Graph', 'Box and Whisker', 'Box and Whisker', 'Ordering Fractions', 'Stem and Leaf Plot', 'Box and Whisker', 'Stem and Leaf Plot', 'Conversion of Fraction Decimals Percents', 'Venn Diagram', 'Table', 'Ordering Positive Decimals', 'Table', 'Circle Graph', 'Square Root', 'Estimation', 'Conversion of Fraction Decimals Percents', 'Counting Methods', 'Circle Graph', 'Conversion of Fraction Decimals Percents', 'Stem and Leaf Plot', 'Ordering Fractions', 'Ordering Positive Decimals', 'Effect of Changing Dimensions of a Shape Prportionally', 'Addition and Subtraction Positive Decimals', 'Probability of Two Distinct Events', 'Conversion of Fraction Decimals Percents', 'Conversion of Fraction Decimals Percents', 'Circle Graph', 'Least Common Multiple', 'Table', 'Least Common Multiple', 'Ordering Positive Decimals', 'Conversion of Fraction Decimals Percents