In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from gensim.models import Word2Vec
from sklearn.manifold import TSNE 

In [None]:
# Load the skill-builder dataset into `df`.
# The CSV is decoded as ISO-8859-1 and read with low_memory disabled.
filename = 'skill_builder_data_corrected.csv'
df = pd.read_csv(
    filepath_or_buffer=filename,
    encoding='ISO-8859-1',
    low_memory=False,
)

In [None]:
# Load the problem-text dataset into `problems`.
# Same reader options as the skill-builder load: ISO-8859-1, low_memory disabled.
filename2 = '../data/problems.csv'
problems = pd.read_csv(
    filepath_or_buffer=filename2,
    encoding='ISO-8859-1',
    low_memory=False,
)

In [None]:
# Keep only the students whose per-user count exceeds MIN_ATTEMPTS.
# NOTE(review): count() tallies non-null problem_id rows per user, not distinct
# problems. If one problem can occupy several rows for the same student this
# over-counts attempts -- confirm whether nunique() was intended.
MIN_ATTEMPTS = 50  # a student must have strictly more rows than this to be kept
students_list = df.groupby('user_id').problem_id.count()  # rows per user_id
students_id = students_list[students_list > MIN_ATTEMPTS].index  # qualifying user_ids
df2 = df[df['user_id'].isin(students_id)]  # only the rows of those students

In [None]:
# Left-join the problem text onto the filtered rows, matching on both
# assistment_id and problem_id.
# Author's observation: one assistment_id can have several problem_id's, while
# each problem_id appears to belong to a single assistment_id.
# Being a left join, rows of df2 with no matching problem description are kept;
# indicator=True adds a `_merge` column recording where each row came from.
df3 = df2.merge(
    problems,
    how='left',
    on=['assistment_id', 'problem_id'],
    indicator=True,
)
df3.shape

In [None]:
# Alternative join: an inner merge on the same two keys.
# Rows of df2 with no corresponding problem description are dropped here
# (the author reports this removes 120 rows covering 23 unique problem_id's).
df4 = df2.merge(
    problems,
    how='inner',
    on=['assistment_id', 'problem_id'],
)
df4.shape

In [None]:
# Tally the interactions that found no problem text in the left join:
# `_merge == 'left_only'` marks rows present only in the left-hand frame.
nondescript = df3.loc[df3['_merge'] == 'left_only', 'problem_id']
print('Number of rows without description: ', nondescript.size)
print('Number of unique problems without description: ', nondescript.nunique())

In [None]:
# Sanity check: is any problem_id linked to more than one assistment_id?
# Series.any() does the reduction inside pandas instead of looping over the
# Series with Python's built-in any(); bool() keeps the displayed value a plain
# Python bool.
# Expected result: False -- the author's run found no problem_id associated
# with more than one assistment_id.
bool(df3.groupby('problem_id')['assistment_id'].nunique().gt(1).any())

In [None]:
# Spot checks of the problem_id <-> assistment_id relationship on one example each.
# A notebook cell renders only its final expression by default, so the two
# counts are returned together as a tuple; as two separate statements the first
# result was computed and then discarded.
(
    df.loc[df['problem_id'] == 58551, 'assistment_id'].nunique(),  # distinct assistment_ids for problem 58551
    df.loc[df['assistment_id'] == 76958, 'problem_id'].nunique(),  # distinct problem_ids for assistment 76958
)

In [None]:
# Count the distinct skill labels present in the left-joined frame
# (the author's run reported 107).
df3['skill_name'].nunique()

In [None]:
# Show the column labels of the problem-text table.
problems.columns

In [None]:
# For every (user_id, problem_id) pair, count how many distinct skill_id values occur.
df.groupby(by=['user_id', 'problem_id']).skill_id.nunique()

In [None]:
# Look at the `position` values of the rows for user 14 on problem 57647.
# Author's observation: when a problem carries multiple skills, its rows are
# identical to one another except for the skill information.
df.loc[(df['problem_id'] == 57647) & (df['user_id'] == 14), 'position']

In [None]:
# Number of distinct problem_id values among the rows where `original` equals 1.
df.loc[df['original'] == 1, 'problem_id'].nunique()

In [None]:
# Number of rows recorded for each (user_id, problem_id) pair.
df.groupby(by=['user_id', 'problem_id']).size()

In [None]:
# Total number of distinct problem_id values in the full dataset.
df['problem_id'].nunique()