In [41]:
import ujson
import numpy 
import datetime
from collections import *
import json

## 1. Load Data

Load summary data of all users.

In [2]:
summaries = {}
ctr = 0
for line in open('user_project_summaries.json'): # lazy iteration because the file is large
    ctr += 1
    summaries.update(ujson.loads(line))  
    print ctr,

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47


The language labels are inferred by the `namestrings_to_langs.py` script using `langid`.

In [3]:
isocodes = ujson.load(open('isocodes.json'))  # mapping from  iso code to language name
user_langs = ujson.load(open('user_inferredlangs.json'))  # mapping from userid to inferred language
# remove uncommon languages
lang_counts = Counter()
for user in user_langs:
    lang = user_langs[user]
    lang_counts[lang] += 1
    
user_langs = {user: lang for user, lang in user_langs.items() if lang_counts[lang]>=250 and user in summaries}
langset = set(user_langs.values())
print 'lang data created with', len(user_langs), 'users and', len(langset), 'languages', langset

lang data created with 43410 users and 12 languages set([u'el', u'fr', u'en', u'zh', u'pt', u'la', u'ca', u'de', u'ko', u'it', u'th', u'es'])


Only keep users with language labels.

In [4]:
summaries = {user: summaries[user] for user in summaries if user in user_langs}
print 'Filtered to', len(summaries), 'users'

Filtered to 43410 users


Create a dictionary mapping each language to a list containing the project summaries of every user labelled with that language (one list entry per user).

In [9]:
# language code -> list of per-user summary objects (one entry per user)
lang_dict = {}
for user in summaries:
    lang = user_langs[user]
    # setdefault creates the list the first time a language is seen
    lang_dict.setdefault(lang, []).append(summaries[user])

### Experiments with Chinese users

In [21]:
# chin_users = lang_dict['zh']
# c = Counter()
# for summary in chin_users: 
#     for project in summary: 
#         c.update([summary[project]['**Project Name']])

#print c.most_common(100)        

Out of the 151351 unique project names, 3582 contain the substring 'ex_', most likely indicating a series of tutorials.

In [25]:
# Build the project-name counter here so this cell works on a fresh kernel
# (the cell that used to create `c` is commented out).
c = Counter()
for summary in lang_dict['zh']:
    for project in summary:
        c[summary[project]['**Project Name']] += 1

# Project names containing 'ex_' anywhere in the name (substring match).
chinese_examples = [entry for entry in c if 'ex_' in entry]

In [26]:
# Number of entries of `c` whose name contains 'ex_'.
print len(chinese_examples)

3582


## 2. Featurize

In [5]:
import sys, os

# Make modules in the working directory importable before `features` is
# imported; the membership test keeps re-runs from growing sys.path.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

import features
reload(features)  # pick up edits made to features.py since the last import

# Star-import AFTER the reload: names bound by an earlier `from features
# import *` would otherwise keep pointing at the pre-reload objects.
from features import *


In [6]:
# time utilities
def convert_time(el):
    """Turn a timestamp into a human-readable `datetime.date`.

    Only the first 10 characters of `str(el)` are used, so a 13-digit
    millisecond timestamp is truncated to whole seconds before conversion.
    The date is computed in the local timezone (`date.fromtimestamp`).
    """
    seconds_text = str(el)[:10]
    seconds = int(seconds_text)
    return datetime.date.fromtimestamp(seconds)

def timediff(t1, t2):
    """Return `t1 - t2` expressed in days, for timestamps given in milliseconds.

    The result is a float and is negative when `t1` is earlier than `t2`.
    """
    millis_per_day = 86400. * 1000
    elapsed_millis = t1 - t2
    return elapsed_millis / millis_per_day

### Filter Tutorials

In [7]:
# Switch read by later cells: gates the construction of `not_tutorials` and
# selects which pickled k-fold file ('filtered' vs 'unfiltered') is loaded.
filter_tutorials = True

In [9]:
if filter_tutorials: 
    not_tutorials = {}

    for i in range(10): 
        print i, 
        notT_list = ujson.load(open('tutorial_comparisons/nottutorials_' + str(i) +'.json'))
        for entry in notT_list: 
            if entry[0] not in summaries:
                continue
            proj_name = entry[1].split("_summary.json")[0]
#             if entry[0] not in not_tutorials: 
                not_tutorials[entry[0]] = {}
            try: 
                not_tutorials[entry[0]][proj_name] = summaries[entry[0]][proj_name]
            except KeyError: 
                pass

0 1 2 3 4 5 6 7 8 9


# Featurizers

## Setup

In [10]:
def combine_featfuncs(funclist):
    """Build one featurizer out of several.

    The returned function calls every featurizer in `funclist` (in order) on
    its argument and merges the resulting dicts into the dict returned by the
    first featurizer; on duplicate keys the later featurizer wins.
    `funclist` must contain at least one function.
    """
    def combined(user):
        first_func, remaining = funclist[0], funclist[1:]
        merged = first_func(user)
        for featfunc in remaining:
            merged.update(featfunc(user))
        return merged
    return combined

In [39]:
def getAllProjects(userID, no_tutorials): 
    """list of projects sorted by creation times"""
    if (no_tutorials):
        try: 
            projectlist = not_tutorials[userID].values()
        except:
            print userID
    else: 
        projectlist = summaries[userID].values()
    return sorted(projectlist,
                  key=lambda project: project['**created']) 

def userDuration(projectlist):
    """Days between the '**created' stamps of the last and the first entry
    of `projectlist` (expected to be sorted by creation time and non-empty)."""
    last_created = projectlist[-1]['**created']
    first_created = projectlist[0]['**created']
    return timediff(last_created, first_created)


In [11]:
def getAllProjects(userID, filter_tutorials=False): 
    """Return the user's projects sorted by creation time ('**created').

    userID           -- key into `summaries` / `not_tutorials`
    filter_tutorials -- if true, only the projects recorded for this user in
                        `not_tutorials` are returned (an empty list if the
                        user has none); otherwise all of `summaries[userID]`
    """
    if filter_tutorials:
        # `not_tutorials` is keyed by user id and maps to
        # {project name: summary}. The old test `project not in not_tutorials`
        # compared project dicts against user ids (and fails on unhashable dicts).
        projectlist = not_tutorials.get(userID, {}).values()
    else:
        projectlist = summaries[userID].values()
    return sorted(projectlist,
                  key=lambda project: project['**created']) 

def userDuration(projectlist):
    """Days between the '**created' stamps of the last and the first entry
    of `projectlist` (expected to be sorted by creation time and non-empty)."""
    last_created = projectlist[-1]['**created']
    first_created = projectlist[0]['**created']
    return timediff(last_created, first_created)

In [56]:
# ISO language code -> coarse language group ('eur' or 'asia').
langgroups = dict.fromkeys(
    ['es', 'pt', 'it', 'de', 'fr', 'ca', 'el', 'nl', 'pl', 'en', 'ru'],
    'eur')
langgroups.update(dict.fromkeys(['ko', 'zh', 'ja', 'th'], 'asia'))

# language group -> numeric class id
langgroup_num_map = dict(eur=0, asia=1)





In [13]:
def get_user_labels():
    """Map every user in `user_langs` to the language group of their language.

    Users whose language has no entry in `langgroups` are left out.
    """
    user_labels = {}
    for user, lang in user_langs.items():
        if lang in langgroups:
            user_labels[user] = langgroups[lang]
    return user_labels

## Time featurizer

In [14]:
def projectLengthFeatures(projects):
    """Moments of project lengths and intervals for a user, plus the number of projects.

    Returns a dict with the keys "mean of lengths", "stddev of lengths",
    "mean of intervals", "stddev of intervals" and "num projects".
    numpy returns nan (with a RuntimeWarning) for the moments of an empty
    sequence, so a statistic is nan when its input sequence is empty.
    """
    lengths = getProjectLengths(projects)
    intervals = getProjectIntervals(projects)

    userDict = {}
    userDict["mean of lengths"] = numpy.mean(lengths)
    userDict["stddev of lengths"] = numpy.std(lengths)
    # The interval statistics must be computed from `intervals`; they used to
    # be copies of the length statistics.
    userDict["mean of intervals"] = numpy.mean(intervals)
    userDict["stddev of intervals"] = numpy.std(intervals)

    userDict["num projects"] = numProjects(projects)

    return userDict

In [15]:
def dayAnalysisFeatures(projects):
    """One feature per day of the week, keyed by day name.

    The value for the i-th day (Monday = 0 ... Sunday = 6) is
    `numOnDay(projects)[i]` -- presumably the number of projects on that day.
    """
    byday = numOnDay(projects)
    day_names = ("Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday")

    userDict = {}
    for index, day in enumerate(day_names):
        userDict[day] = byday[index]
    return userDict

In [16]:
def decileProjects(projects):
    """Features 'decile 1' .. 'decile 10' taken from the 10-bin result of
    `projectsPerUserPeriod(projects, bins=10)`."""
    numbins = 10
    hist = projectsPerUserPeriod(projects, bins=numbins)

    userDict = {}
    for decile in range(1, numbins + 1):
        userDict['decile '+str(decile)] = hist[decile - 1]
    return userDict

## Code Features


In [17]:
def deltaDeciles(decileDict, name):
    """Differences between consecutive decile values.

    `decileDict` is ordered with `sortDeciles`; the result maps
    "<i> <name>" (i = 1..9) to values[i] - values[i-1].
    """
    values = sortDeciles(decileDict)
    return dict((str(i) + " " + name, values[i] - values[i-1])
                for i in range(1, 10))


def sortDeciles(decileDict):
    """Return the values of `decileDict` ordered by the decile number that
    ends each key (e.g. 'Orphan Decile 10' -> 10).

    The whole trailing run of digits is used, so decile 10 sorts after
    decile 9, and the result does not depend on the dict's iteration order.
    Raises ValueError for a key that does not end in a digit.
    """
    def decile_number(key):
        # Length of the trailing digit run, e.g. 2 for '... Decile 10'.
        num_digits = len(key) - len(key.rstrip('0123456789'))
        return int(key[len(key) - num_digits:])

    return [decileDict[key] for key in sorted(decileDict, key=decile_number)]



In [18]:
def summaryOBlockDecile(projects): 
    """Features 'Orphan Decile 1' .. 'Orphan Decile 10' from the first ten
    entries of `decileOrphanBlocks(projects)`."""
    hist = decileOrphanBlocks(projects)
    userDict = {}
    for decile in range(1, 11):
        userDict['Orphan Decile '+str(decile)] = hist[decile - 1]
    return userDict

In [19]:
def summaryDecileTLBlocks(projects): 
    """Features 'TopLevel Decile 1' .. 'TopLevel Decile 10' from the first
    ten entries of `decileTypesTopLevelBlocks(projects)`."""
    hist = decileTypesTopLevelBlocks(projects)
    userDict = {}
    for decile in range(1, 11):
        userDict['TopLevel Decile '+str(decile)] = hist[decile - 1]
    return userDict

In [20]:
def summaryDecileNumScreens(projects): 
    """Features 'NumScreens Decile 1' .. 'NumScreens Decile 10' from the
    first ten entries of `decileNumScreens(projects)`."""
    hist = decileNumScreens(projects)
    userDict = {}
    for decile in range(1, 11):
        userDict['NumScreens Decile '+str(decile)] = hist[decile - 1]
    return userDict

In [21]:
def summaryDeltasTLBlocks(projects): 
    """Decile-to-decile differences of the top-level-block decile features,
    keyed '<i> TL blocks delta'."""
    return deltaDeciles(summaryDecileTLBlocks(projects), "TL blocks delta")

In [22]:
def summaryDeltasOBlock(projects): 
    """Decile-to-decile differences of the orphan-block decile features,
    keyed '<i> O blocks delta'."""
    return deltaDeciles(summaryOBlockDecile(projects), "O blocks delta")

In [23]:
def summaryDeltasNumScreens(projects): 
    """Decile-to-decile differences of the number-of-screens decile
    features, keyed '<i> numscreens blocks delta'."""
    return deltaDeciles(summaryDecileNumScreens(projects), "numscreens blocks delta")

In [24]:
def summaryAverages(projects):
    """Collect the per-user summary statistics produced by the `features`
    helpers into one dict.

    Keys: "mean of numScreens", "NB", "OB", "TL", "TL2", "NC", "NTC", "MC",
    "NP", "NS", "local vars", "global vars". Each value is whatever the
    corresponding helper returns for `projects`.
    """
    userDict = {
        "mean of numScreens": numpy.mean(getNumScreens(projects)),
        "NB": averageNumBlocks(projects),
        "OB": getAverageOrphanBlocks(projects),
        "TL": getAverageTypeTLBlocks(projects),
        "TL2": getAverageNumTLBlocks(projects),
        "NC": averageNumComponents(projects),
        "NTC": averageNumTypeComponents(projects),
        "MC": aveNumMediaAssets(projects),
        "NP": averageNumProcedures(projects),
        "NS": averageNumStrings(projects),
    }

    # getAllVariables returns a sequence: index 0 -> local, index 1 -> global
    local_vars, global_vars = getAllVariables(projects)[:2]
    userDict["local vars"] = local_vars
    userDict["global vars"] = global_vars
    return userDict

In [25]:
def classesFeaturizers(projects): 
    """One feature per component class name in the fixed list below; the
    value is `getClasses(projects)[name]`."""
    component_classes = (
        'TableArrangement', 'DatePicker', 'Canvas',
        'CheckBox', 'Web', 'Clock', 'BluetoothServer',
        'ActivityStarter', 'Texting', 'Label', 'Spinner',
        'Camera', 'BluetoothClient', 'PhoneCall', 'LocationSensor',
        'VerticalArrangement', 'HorizontalArrangement', 'Sharing',
        'TextToSpeech', 'GoogleMap', 'Slider', 'OrientationSensor',
        'ListView', 'PhoneNumberPicker', 'TinyDB', 'NxtDirectCommands',
        'Sound', 'ListPicker', 'SpeechRecognizer', 'Button', 'WebViewer',
        'BarcodeScanner', 'NxtDrive', 'Camcorder', 'Notifier', 'TextBox',
        'AccelerometerSensor', 'Image', 'VideoPlayer', 'TinyWebDB',
        'Player', 'File', 'YandexTranslate',
    )
    classes = getClasses(projects)
    return {name: classes[name] for name in component_classes}

In [48]:
def allBlocksFeaturizer(projects):
    """One feature per block name listed in 'top_500_blocks.json'.

    A dict with every listed block initialised to 0 is passed to
    `getBlocks(projects, ...)`; the value it returns for each listed block
    becomes the feature value.

    The block list is read from disk on the first call only and cached on
    the function object, because this featurizer is called once per user.
    """
    block_list = getattr(allBlocksFeaturizer, '_block_list', None)
    if block_list is None:
        with open('top_500_blocks.json') as data_file:
            block_list = json.load(data_file)
        allBlocksFeaturizer._block_list = block_list

    # A fresh zero-initialised dict per call: getBlocks may update it in place.
    all_blocks_dict = {block: 0 for block in block_list}
    all_blocks_dict = getBlocks(projects, all_blocks_dict)

    return {key: all_blocks_dict[key] for key in block_list}

In [27]:
# blocks = set() 
# users = summaries.keys()
# c = Counter()

In [28]:
# j = 0
# for userID in users: 
#     i = 0
#     userBlocks = set() 

#     projects = getAllProjects(userID, False)
#     screenNames = getScreenNames(projects)

#     while i < len(projects):
#         for screenName in screenNames[i]:
#                 if screenName in projects[i] and 'Active Blocks' in projects[i][screenName]['Blocks'] and 'Types' in projects[i][screenName]['Blocks']['Active Blocks']:
#                     b = projects[i][screenName]['Blocks']['Active Blocks']['Types']
#                     for block in b: 
#                         if block not in userBlocks: 
#                             userBlocks.add(block)
#         i+=1
    
#     for block in userBlocks: 
#         if block not in blocks: 
#             if '.' in block: 
#                 b = block.split('.')[0]
#             else: 
#                 b = block
#             c[b] = 1
#         else: 
#             c[b] +=1
              
#     j+=1 
#     if j % 1000 == 0: 
#         print j,

In [29]:
# with open('altered_blocks.json', 'w') as data_file:
#     json.dump(c.keys(),data_file)

In [30]:
# category_dict = {} 
# for block in c: 
#     if '.' in block: 
#         key = block.split('.')[0]
#         if key not in category_dict:
#             category_dict[key] = []
#         category_dict[key].append(block)
#     elif '_' in block: 
#         key = block.split('_')[0]
#         if key not in category_dict: 
#             category_dict[key] = []
#         category_dict[key].append(block)
#     else: 
#         if 'other' not in category_dict:
#             category_dict['other'] = []
#         category_dict['other'].append(block)


In [31]:
# print category_dict['lexical']

In [32]:
# with open('blocks_by_category.json', 'w') as data_file:
#     json.dump(category_dict, data_file)

In [33]:
from collections import defaultdict
from collections import Counter

In [34]:
# c = Counter(l)

In [35]:
# i = 0
# for userID in users: 
#     projects = getAllProjects(userID, False)
#     getBlocks(projects, all_blocks_dict)
#     if i %1000 == 0: 
#         print i,
#     i+=1

In [49]:
# Featurizer producing all time-based features of a user's project list.
time_combined = combine_featfuncs([
    projectLengthFeatures,
    dayAnalysisFeatures,
    decileProjects,
])
# Featurizer producing all code-based features of a user's project list.
code_combined = combine_featfuncs([
    summaryOBlockDecile,
    summaryDecileTLBlocks,
    summaryDecileNumScreens,
    summaryDeltasTLBlocks,
    summaryDeltasOBlock,
    summaryDeltasNumScreens,
    summaryAverages,
    allBlocksFeaturizer,
])

In [50]:
from sklearn.feature_extraction import DictVectorizer

all_features = []
time_features = [] 
code_features = []

userlabels = get_user_labels()

y = []
ctr = 0
for user in userlabels:
    if user in not_tutorials: 

        projects = getAllProjects(user, filter_tutorials)  # this returns projects sorted by creation date that do notinclude tutorials 

        y.append(userlabels[user])

        time_features.append(time_combined(projects))
        code_features.append(code_combined(projects))


        # merge time and code feature dicts
        all_features_user = time_features[-1].copy()
        all_features_user.update(code_features[-1])
        all_features.append(all_features_user)

        ctr+=1
        if ctr%1000==0:
            print ctr/1000,

        


y = numpy.array(y)

timevec = DictVectorizer()

Xtime = timevec.fit_transform(time_features, y) 

codevec = DictVectorizer()

Xcode = codevec.fit_transform(code_features, y)

allvec = DictVectorizer()

Xall = allvec.fit_transform(all_features, y)



  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42


In [51]:
from sklearn.preprocessing import scale
from numpy import *
# Densify each sparse feature matrix (they are dense anyway) and
# standardise it with sklearn's `scale`.
Xtime, Xcode, Xall = (scale(sparse_matrix.toarray())
                      for sparse_matrix in (Xtime, Xcode, Xall))

#### Static KFold Values

In [52]:
import pickle

In [53]:
# Static cross-validation folds; a different pickle is used depending on
# whether tutorials were filtered out.
kfold_path = 'filtered_kfold.pickle' if filter_tutorials else 'unfiltered_kfold.pickle'
with open(kfold_path, 'rb') as f:
    foldindices = pickle.load(f)

### Model Training and Testing

In [57]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from  sklearn.metrics import f1_score 
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline 
from plotcnf import plot_confusion_matrix

# print 'Chance is', max(numpy.bincount(y))/float(len(y))

### added later ### ************************    
#foldindices = StratifiedKFold(y)


labels = map(lambda x:x[0], sorted(langgroup_num_map.items(), key=lambda x:x[1]))  
# languages corresponding to class labels as a list

    
k = 70 #num neighbors for kNN

classifiers = [('logreg', LogisticRegression())]  # no knn because it's too slow 
#multi_class='multinomial', 
   

for modelname, model in classifiers:
    #for featname, X in [('code', Xcode), ('all', Xall)]:
    for featname, X in [('time', Xtime), ('code', Xcode), ('all', Xall)]:
        print  'Building a', modelname, 'model with', X.shape[1], featname
    
        cvaccs = numpy.zeros(len(foldindices))
        #cvf1 = numpy.zeros(len(foldindices))
        for i, (trainidx, testidx) in enumerate(foldindices):
            print 'Fold', i+1
        
            ytrain = y[trainidx]
            ytest = y[testidx]
        
            Xtrain = X[trainidx, :]  
            Xtest = X[testidx, :]
            
            pca = PCA(n_components=300)

            reduced_train = pca.fit_transform(Xtrain)
            reduced_test = pca.transform(Xtest)
            
            model.fit(reduced_train, reduced_test)
            cvaccs[i] = model.score(Xtest, ytest)
        
            predictions = model.predict(reduced_test)
            #cvf1[i] = f1_score(ytest, predictions)
            
            
            
            #print '****', modelname, cvaccs[i]
        #analyze last fold only
        cnf_matrix = confusion_matrix(ytest, predictions)  

        plot_confusion_matrix(cnf_matrix, labels)
        plt.show()
        coef = model.coef_
        print 'Average accuracy', numpy.mean(cvaccs)


Building a logreg model with 22 time
Fold 1


IndexError: index 42490 is out of bounds for axis 1 with size 42490

### Coefs 

In [None]:
# Pair every feature name of the 'all' vectorizer with its coefficient and
# list the pairs from most negative to most positive weight.
feature_names = allvec.get_feature_names()
feature_weights = coef[0]
sorted(zip(feature_names, feature_weights), key=lambda pair: pair[1])

In [None]:
from sklearn.metrics import f1_score
# The targets are the strings 'eur' / 'asia', so the positive class has to be
# named explicitly (the default pos_label=1 is not one of the labels).
# 'asia' is the group mapped to 1 in langgroup_num_map.
f1_score(ytest, predictions, pos_label='asia')


### PCA

### changing get Blocks and normalization 

In [None]:
def normalizeDict(d, projects): 
    '''Normalize the values of `d` by the number of projects.

    Every value of `d` is divided in place by `numProjects(projects)`; a
    count of zero is treated as one so no division by zero can occur. NaN
    results are replaced by 0. Returns `d` (the same dict object).
    '''
    num_projects = float(numProjects(projects))
    if num_projects == 0:
        # Assignment, not comparison: with no projects, divide by one.
        num_projects = 1.0
    for entry in d:
        # nan_to_num is applied per value; applied to the dict itself it
        # would wrap the whole dict in an array instead of cleaning it.
        d[entry] = float(numpy.nan_to_num(d[entry] / num_projects))
    return d

In [None]:
def getBlocks2(projects, block_dict): 
    '''returns  a dictionary with the key as a block name ie math_add 
    and a value that is the # of occurances of that block / numProjects'''
    screenNames = getScreenNames(projects)
    i= 0
    while i < len(projects):
        for screenName in screenNames[i]:
                if screenName in projects[i] and 'Active Blocks' in projects[i][screenName]['Blocks'] and 'Types' in projects[i][screenName]['Blocks']['Active Blocks']:
                    blocks = projects[i][screenName]['Blocks']['Active Blocks']['Types']
                    for block in blocks: 
                        if block in block_dict: 
                            block_dict[block] += projects[i][screenName]['Blocks']['Active Blocks']['Types'][block]
                        try: 
                            if block.split(".")[0] in block_dict:
                                block_dict[block.split(".")[0]] += projects[i][screenName]['Blocks']['Active Blocks']['Types'][block]
                        except Error: 
                            pass

        i+=1
    final_dict = normalizeDict(block_dict, projects)