In [1]:
import ujson
import numpy 
import datetime

# Load Data

In [None]:
summaries = {}
ctr = 0
for line in open('user_project_summaries.json'): # lazy iteration because the file is large
    print ctr,
    ctr+=1
    summaries.update(ujson.loads(line))

0 1 2 3 4 5 6 7 8 9 10

## Language

Create Language Data Set and Initialize Language Codes

In [None]:
'''We are using all the data from the user from whom we we have previously 
classfied what their primary langauge is '''

isocodes = ujson.load(open('isocodes.json'))  # mapping from  iso code to language name
user_langs = ujson.load(open('user_inferredlangs.json'))  # mapping from userid to inferred language
#remove Latin because it's not a reliable inference
#TODO for later: get next-best language besides Latin in the inference function
# remove uncommon languages
lang_counts = {}
for user in user_langs:
    lang = user_langs[user]
    lang_counts[lang] = lang_counts.get(lang, 0) + 1
        
user_langs = {user: lang for user, lang in user_langs.items() if lang!='la' and lang_counts[lang]>=100}

langset = sorted(list(set(user_langs.values())))
lang_num_map = {lang: i for i, lang in enumerate(langset)} #key is langauge isocode, value is number 
print 'lang data created with', len(user_langs), 'users and', len(langset), 'languages'

In [None]:
# time utilities
def convert_time(el):
    """Turn an epoch timestamp into a human readable ``datetime.date``.

    Only the first ten characters of ``str(el)`` are used, which cuts a
    13-digit millisecond timestamp down to whole seconds.
    """
    seconds_text = str(el)[:10]
    epoch_seconds = int(seconds_text)
    return datetime.date.fromtimestamp(epoch_seconds)

def timediff(t1, t2):
    """Return ``t1 - t2`` in days, for timestamps given in milliseconds."""
    millis_per_day = 86400. * 1000
    elapsed_millis = t1 - t2
    return elapsed_millis / millis_per_day

In [None]:
earliest = 1e100
latest = 0

ctr = 0
for user in summaries:
    for project in summaries[user]:
        ctime = summaries[user][project]["**created"]
        if ctime < earliest:
            earliest = ctime
        if ctime > latest:
            latest = ctime
            
    ctr+=1
    if ctr%1000==0:
        print ctr/1000,
            
print
print 'Earliest:', convert_time(earliest) 
print 'Latest:', convert_time(latest)
MAXDUR = 300
MINDUR = 150
# get subset of data from "old" users whose earliest creation date is more than MAXDUR days before the end of the dataset
# (giving them a good chance to stay on for more than MINDUR days),
# AND have activity within MAXDUR days if they are active beyond MINDUR (to be fair to recent users)

old_users = set() 

ctr = 0
for user in summaries:
    ctimes = [project["**created"] for project in summaries[user].values()]
    if ctimes==[]:
        print user, 'has no projects'
        continue
    start_time = min(ctimes)
    if timediff(latest, start_time)>MAXDUR:  # first restrict to old users
        # creation times since start date of user
        ctimes_from_start = [timediff(project["**created"], start_time) for project in summaries[user].values()]  
        # projects created at least MINDUR days after start
        post_mindur = [ctime for ctime in ctimes_from_start if ctime>MINDUR]
        # consider users who either were not active after MINDUR days, or had activity between MINDUR and MAXDUR days
        if post_mindur==[] or len(filter(lambda length: length<=MAXDUR, post_mindur))>0:
            old_users.add(user)
            
    ctr+=1
    if ctr%1000==0:
        print ctr/1000,
        
print
print 'Filtered to', len(old_users), 'users from', len(summaries)

# Features

In [None]:
import sys, os
# Make the notebook's working directory importable *before* importing the local
# `features` module; skip the append if it is already there so re-running this
# cell does not grow sys.path.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

import features
# Re-execute the module so edits made since the kernel started are picked up.
reload(features)
# Star-import only after the reload: names imported before it would stay bound
# to the stale, pre-reload objects.
from features import *

## Time Features

Get moments from list of project lengths and intervals.

In [None]:
def _meanAndStd(values):
    """Return ``(mean, population stddev)`` of values, or ``(0.0, 0.0)`` if empty."""
    if numpy.size(values) == 0:
        # numpy.mean/numpy.std of an empty sequence give nan (with a warning);
        # use a neutral 0.0 so the feature dict stays free of NaN values.
        return 0.0, 0.0
    return numpy.mean(values), numpy.std(values)


def projectLengthFeatures(projects):
    """Moments of project lengths and intervals, as well as number of projects.

    Parameters
    ----------
    projects : the user's projects, passed unchanged to ``getProjectLengths``,
        ``getProjectIntervals`` and ``numProjects``.

    Returns
    -------
    dict mapping feature name to value: mean and standard deviation of the
    lengths, mean and standard deviation of the intervals, and the number of
    projects.
    """
    lengths = getProjectLengths(projects)
    intervals = getProjectIntervals(projects)

    length_mean, length_std = _meanAndStd(lengths)
    # The interval features must be computed from `intervals`; they were
    # previously computed from `lengths`, duplicating the length features.
    interval_mean, interval_std = _meanAndStd(intervals)

    #note: feature names are for our reference only
    userDict = {}
    userDict["mean of lengths"] = length_mean
    userDict["stddev of lengths"] = length_std
    userDict["mean of intervals"] = interval_mean
    userDict["stddev of intervals"] = interval_std

    userDict["num projects"] = numProjects(projects)

    return userDict

Get weekday distribution features.

In [None]:
def dayAnalysisFeatures(projects):
    """Number of projects on each day of the week.

    Entry ``i`` of ``numOnDay(projects)`` is stored under the i-th weekday
    name, with Monday at index 0 and Sunday at index 6.
    """
    weekday_names = ("Monday",
                     "Tuesday",
                     "Wednesday",
                     "Thursday",
                     "Friday",
                     "Saturday",
                     "Sunday")
    byday = numOnDay(projects)

    userDict = {}
    for position in range(len(weekday_names)):
        userDict[weekday_names[position]] = byday[position]
    return userDict

Histogram of user's projects in the period.

In [None]:
def decileProjects(projects):
    """Histogram of the user's projects over ten bins.

    Bin ``i`` of ``projectsPerUserPeriod(projects, bins=10)`` is stored under
    the key ``'decile <i+1>'``.
    """
    numbins = 10
    hist = projectsPerUserPeriod(projects, bins=numbins)

    userDict = {}
    for bin_index in range(numbins):
        userDict['decile %d' % (bin_index + 1)] = hist[bin_index]
    return userDict

Here's a handy function to combine different feature groups.

In [None]:
def combine_featfuncs(funclist):
    """Combine several feature functions into a single feature function.

    Parameters
    ----------
    funclist : sequence of callables, each taking one argument and returning a
        dict of features.

    Returns
    -------
    A function that calls every function in ``funclist`` (in order) on its
    argument and returns the merged dict. When two functions produce the same
    key, the later function wins. An empty ``funclist`` yields an empty dict.
    """
    def combined(user):
        # Merge into a fresh dict so the dict returned by the first feature
        # function is never mutated (it may be cached or shared by its owner).
        merged = {}
        for featfunc in funclist:
            merged.update(featfunc(user))
        return merged
    return combined

## Code Features


In [None]:
def summaryOBlockDecile(projects):
    """Store the ten values of ``decileOrphanBlocks(projects)`` as features.

    Element ``i`` (0-based) is stored under the key ``"O <i+1>"``.
    """
    decileOrphanBlock = decileOrphanBlocks(projects)

    userDict = {}
    for decile in range(10):
        userDict["O %d" % (decile + 1)] = decileOrphanBlock[decile]
    return userDict

In [None]:
def summaryDecileTLBlocks(projects):
    """Store the ten values of ``decileTypesTopLevelBlocks(projects)`` as features.

    Element ``i`` (0-based) of the result is stored under the key
    ``"TL <i+1>"``.

    Returns
    -------
    dict with the keys ``"TL 1"`` ... ``"TL 10"``.
    """
    decileTL = decileTypesTopLevelBlocks(projects)

    userDict = {}
    for decile in range(10):
        # Index the computed result, not the `decileTypesTopLevelBlocks`
        # function itself (which is not subscriptable and raised TypeError).
        userDict["TL %d" % (decile + 1)] = decileTL[decile]
    return userDict

In [None]:
def summaryDecileNumScreens(projects):
    """Store the ten values of ``decileNumScreens(projects)`` as features.

    Element ``i`` (0-based) is stored under the key ``"NS <i+1>"``.
    """
    decileNumS = decileNumScreens(projects)

    userDict = {}
    for decile in range(10):
        userDict["NS %d" % (decile + 1)] = decileNumS[decile]
    return userDict

In [None]:
def summaryAverages(projects):
    """Collect the per-user average code features into one dict.

    Each short key holds the result of one helper called on ``projects``; the
    first two entries of ``getAllVariables(projects)`` are stored as
    "local vars" and "global vars".
    """
    userDict = {
        "NS": averageNumScreens(projects),
        "NB": averageNumBlocks(projects),
        "OB": getAverageOrphanBlocks(projects),
        "TL": getAverageTypeTLBlocks(projects),
        "TL2": getAverageNumTLBlocks(projects),
        "NC": averageNumComponents(projects),
        "MC": aveNumMediaAssets(projects),
    }

    varList = getAllVariables(projects)
    userDict.update({"local vars": varList[0], "global vars": varList[1]})

    return userDict

# Classification

Get list of projects for each user.

In [None]:
def getAllProjects(userID): #from summaries
    """Return the user's projects from ``summaries``, ordered by creation time."""
    def creation_time(project):
        return project['**created']

    projectlist = list(summaries[userID].values())
    projectlist.sort(key=creation_time)
    return projectlist

def projectsInMindur(projectlist):
    """Keep the projects created at most MINDUR days after the first project.

    The first element of ``projectlist`` is taken as the user's earliest
    project, so the list is expected to be sorted by creation time.
    """
    earliest = projectlist[0]['**created']
    kept = []
    for project in projectlist:
        if timediff(project['**created'], earliest) <= MINDUR:
            kept.append(project)
    return kept

def userDuration(projectlist):
    """Days between the creation of the first and the last project in the list."""
    first_created = projectlist[0]['**created']
    last_created = projectlist[-1]['**created']
    return timediff(last_created, first_created)

def getProjects(userID, task):
    """Return the projects of ``userID`` that may be used for ``task``.

    Parameters
    ----------
    userID : key of the user in ``summaries``.
    task : ``'LANGUAGE'`` (all projects) or ``'RETENTION'`` (only the projects
        created within MINDUR days of the user's first project).

    Returns
    -------
    list of projects sorted by creation time.

    Raises
    ------
    ValueError : if ``task`` is not one of the two supported names. Previously
        an unknown task silently returned None.
    """
    projectlist = getAllProjects(userID)
    if task == 'LANGUAGE':
        return projectlist
    if task == 'RETENTION':
        return projectsInMindur(projectlist)
    raise ValueError("unknown task %r (expected 'LANGUAGE' or 'RETENTION')" % (task,))

Use the appropriate set of users for each task with labels.

In [None]:
def get_user_labels(task):
    """Return ``{user: integer class label}`` for the users relevant to ``task``.

    Parameters
    ----------
    task : ``'RETENTION'`` or ``'LANGUAGE'``.

    Returns
    -------
    dict
        * RETENTION: one entry per user in ``old_users``; the label is 1 when
          the span between the user's first and last project exceeds MINDUR
          days, else 0.
        * LANGUAGE: one entry per user in ``user_langs``; the label is the
          number assigned to the user's language in ``lang_num_map``.

    Raises
    ------
    ValueError : if ``task`` is not one of the two supported names. Previously
        an unknown task silently returned None.
    """
    if task == 'RETENTION':
        return {user: int(userDuration(getAllProjects(user)) > MINDUR) for user in old_users}
    if task == 'LANGUAGE':
        return {user: lang_num_map[lang] for user, lang in user_langs.items()}
    raise ValueError("unknown task %r (expected 'LANGUAGE' or 'RETENTION')" % (task,))

### Featurizing

In [None]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction import DictVectorizer

all_features = []
time_features = []
code_features = []

TASK = 'RETENTION'
userlabels = get_user_labels(TASK)

# Build the combined feature extractor once: it does not depend on the user,
# so there is no need to re-create it on every loop iteration.
time_combined = combine_featfuncs([projectLengthFeatures, dayAnalysisFeatures, decileProjects])
#code_combined = combine_featfuncs([summaryDecileNumScreens,summaryOBlockDecile,summaryDecileTLBlocks,summaryAverages])

# `y` and `time_features` are filled in the same iteration order, so row i of
# the feature matrix corresponds to label y[i].
y = []
for user, label in userlabels.items():
    projects = getProjects(user, TASK)  # assume projects are sorted by creation date

    y.append(label)

    time_features.append(time_combined(projects))
    #code_features.append(code_combined(projects))

y = numpy.array(y)
vec = DictVectorizer()
Xtime = vec.fit_transform(time_features)
#Xcode = vec.fit_transform(code_features)

### Model and Prediction

In [None]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from  sklearn.metrics import f1_score 
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import plotcnf
reload(plotcnf)
from plotcnf import plot_confusion_matrix

percentpos = sum(y)/float(len(y))
chance = max(percentpos, 1-percentpos)
print 'Chance is', chance

foldindices = StratifiedKFold(y)
k = 70 #num neighbors for kNN

if TASK=='LANGUAGE':
    labels = map(lambda x:x[0], sorted(lang_num_map.items(), key=lambda x:x[1]))  
    # languages corresponding to class labels as a list
elif TASK=='RETENTION':
    labels = ['left', 'survived']
            
for featname, X in [('time', Xtime)]:
    print  'Building a model with', X.shape[1], featname, 'features for', TASK
    for trainidx, testidx in foldindices:
        print 'Fold'
        
        ytrain = y[trainidx]
        ytest = y[testidx]
        
        Xtrain = X[trainidx, :]  # using numpy's smart indexing
        Xtest = X[testidx, :]
        
        if TASK=='RETENTION':  #kNN too slow on the larger dataset
            KNeighbors_model = KNeighborsClassifier(n_neighbors=k)
            KNeighbors_model.fit(Xtrain, ytrain)
            KNN_predictions = KNeighbors_model.predict(Xtest)
            
            print 'with', k, 'neighbors' 
            print "KNN accuracy: ", KNeighbors_model.score(Xtest, ytest)
            print 'KNN f1 score', f1_score(ytest, KNN_predictions)
        
        
        logistic_model = linear_model.LogisticRegression()
        logistic_model.fit(Xtrain, ytrain)
        log_predictions = logistic_model.predict(Xtest)
        print 'Logistic regression accuracy: ', logistic_model.score(Xtest, ytest)
        print 'Logistic regression f1 score', f1_score(ytest, log_predictions) 
        
        cnf_matrix = confusion_matrix(ytest, log_predictions)  
        plot_confusion_matrix(cnf_matrix, labels)
        plt.show()