In [23]:
import pickle
import numpy as np
import scipy.sparse as sparse
import math

In [3]:
def union(xs):
    """Return the union of every collection in ``xs`` as a new ``set``.

    Args:
        xs: Iterable of sets. Any iterable of hashable items is accepted
            as a member, not only ``set`` instances.

    Returns:
        A fresh ``set`` containing each element found in at least one
        member of ``xs``; an empty set when ``xs`` is empty. The inputs
        are never mutated.
    """
    # set.union accepts any number of iterables, so one call replaces the
    # loop that allocated a brand-new set on every iteration.
    return set().union(*xs)

In [4]:
def get_words(wordset, versions=None):
    """Return the sorted vocabulary used across the selected versions.

    Args:
        wordset: Mapping from a version key to an iterable of word
            collections; all collections of a version are merged.
        versions: Iterable of keys of ``wordset`` to include. ``None``
            (the default) selects every key of ``wordset``.

    Returns:
        Sorted list of the distinct words found in the selected versions.

    Raises:
        KeyError: If a requested version is not a key of ``wordset``.
    """
    # Identity test: ``== None`` would invoke a custom ``__eq__`` on
    # whatever container the caller passed in.
    if versions is None:
        versions = wordset.keys()
    words = set()
    for version in versions:
        # update() merges every collection in place instead of allocating
        # a new set per version.
        words.update(*wordset[version])
    return sorted(words)

In [5]:
def count_words(wordset, versions=None):
    """Count, for each word, how many selected versions contain it.

    Args:
        wordset: Mapping from a version key to an iterable of word
            collections.
        versions: Iterable of keys of ``wordset`` to include. ``None``
            (the default) selects every key of ``wordset``.

    Returns:
        Dict mapping each word to the number of selected versions in which
        it appears. A word is counted at most once per version, however
        many of that version's collections contain it.

    Raises:
        KeyError: If a requested version is not a key of ``wordset``.
    """
    # Identity test avoids calling a custom ``__eq__`` on the argument.
    if versions is None:
        versions = wordset.keys()
    counts = {}
    for version in versions:
        # Merge the version's collections first so that duplicates across
        # collections contribute a single count for this version.
        for word in set().union(*wordset[version]):
            counts[word] = counts.get(word, 0) + 1
    return counts

In [6]:
def prob_words(wordset, versions=None):
    """Estimate each word's occurrence probability and its variance.

    Args:
        wordset: Mapping from a version key to an iterable of word
            collections.
        versions: Iterable of keys of ``wordset`` to include. ``None``
            (the default) selects every key of ``wordset``.

    Returns:
        Tuple ``(p, v)`` of dicts keyed by word. ``p[word]`` is the
        fraction of selected versions that contain the word and
        ``v[word]`` is ``p[word] * (1 - p[word])``. Both dicts are empty
        when no version is selected.
    """
    # Identity test avoids calling a custom ``__eq__`` on the argument.
    if versions is None:
        versions = wordset.keys()
    # The selection is measured here and iterated again by count_words, so
    # materialise it once; this also lets callers pass a one-shot iterator.
    versions = list(versions)
    total = len(versions)
    counts = count_words(wordset, versions=versions)
    prob = {word: count / total for word, count in counts.items()}
    var = {word: q * (1 - q) for word, q in prob.items()}
    return prob, var

In [7]:
def get_files(commit_set, versions=None):
    """Return the sorted list of files touched in the selected versions.

    Args:
        commit_set: Mapping from a version key to a collection of file
            names.
        versions: Iterable of keys of ``commit_set`` to include. ``None``
            (the default) selects every key of ``commit_set``.

    Returns:
        Sorted list of the distinct file names found in the selected
        versions.

    Raises:
        KeyError: If a requested version is not a key of ``commit_set``.
    """
    # Identity test: ``== None`` would invoke a custom ``__eq__`` on
    # whatever container the caller passed in.
    if versions is None:
        versions = commit_set.keys()
    files = set()
    for version in versions:
        # In-place merge; no new set is allocated per version.
        files.update(commit_set[version])
    return sorted(files)

In [8]:
def count_files(commit_set, versions=None):
    """Count, for each file, how many selected versions touch it.

    Args:
        commit_set: Mapping from a version key to a collection of file
            names.
        versions: Iterable of keys of ``commit_set`` to include. ``None``
            (the default) selects every key of ``commit_set``.

    Returns:
        Dict mapping each file name to the number of times it is listed
        across the selected versions.

    Raises:
        KeyError: If a requested version is not a key of ``commit_set``.
    """
    # Identity test avoids calling a custom ``__eq__`` on the argument.
    if versions is None:
        versions = commit_set.keys()
    counts = {}
    for version in versions:
        for fname in commit_set[version]:
            counts[fname] = counts.get(fname, 0) + 1
    return counts

In [9]:
def prob_files(commit_set, versions=None):
    """Estimate each file's occurrence probability and its variance.

    Args:
        commit_set: Mapping from a version key to a collection of file
            names.
        versions: Iterable of keys of ``commit_set`` to include. ``None``
            (the default) selects every key of ``commit_set``.

    Returns:
        Tuple ``(p, v)`` of dicts keyed by file name. ``p[name]`` is the
        file's count divided by the number of selected versions and
        ``v[name]`` is ``p[name] * (1 - p[name])``. Both dicts are empty
        when no version is selected.
    """
    # Identity test avoids calling a custom ``__eq__`` on the argument.
    if versions is None:
        versions = commit_set.keys()
    # The selection is measured here and iterated again by count_files, so
    # materialise it once; this also lets callers pass a one-shot iterator.
    versions = list(versions)
    total = len(versions)
    counts = count_files(commit_set, versions=versions)
    prob = {name: count / total for name, count in counts.items()}
    var = {name: q * (1 - q) for name, q in prob.items()}
    return prob, var

In [10]:
def count_files_words(commit_set, wordset, versions=None):
    """Count the versions in which each (file, word) pair occurs together.

    Args:
        commit_set: Mapping from a version key to a collection of file
            names.
        wordset: Mapping from a version key to an iterable of word
            collections.
        versions: Iterable of version keys to include. ``None`` (the
            default) selects every key of ``commit_set``.

    Returns:
        Dict mapping ``(file, word)`` tuples to the number of selected
        versions that list the file and contain the word. Pairs that never
        occur together have no entry.

    Raises:
        KeyError: If a selected version is missing from ``commit_set``, or
            from ``wordset`` while the version lists at least one file.
    """
    # Identity test avoids calling a custom ``__eq__`` on the argument.
    if versions is None:
        versions = commit_set.keys()
    counts = {}
    for version in versions:
        files = commit_set[version]
        if not files:
            # No pairs to emit; wordset is deliberately not consulted, so
            # a version without files need not be present in it.
            continue
        # The version's vocabulary is the same for all of its files:
        # build it once instead of once per file.
        words = set().union(*wordset[version])
        for fname in files:
            for word in words:
                pair = (fname, word)
                counts[pair] = counts.get(pair, 0) + 1
    return counts

In [11]:
def cov_files_words(commit_set, wordset, ps, vs, pw, vw, versions=None):
    """Compute the normalised covariance of co-occurring (file, word) pairs.

    For every pair counted by ``count_files_words`` the value is

        (P(file, word) - P(file) * P(word)) / (sqrt(V(file)) * sqrt(V(word)))

    where ``P(file, word)`` is the pair's count divided by the number of
    selected versions.

    Args:
        commit_set: Mapping from a version key to a collection of file
            names.
        wordset: Mapping from a version key to an iterable of word
            collections.
        ps: Dict of per-file probabilities, keyed by file name.
        vs: Dict of per-file variances, keyed by file name.
        pw: Dict of per-word probabilities, keyed by word.
        vw: Dict of per-word variances, keyed by word.
        versions: Iterable of version keys to include. ``None`` (the
            default) selects every key of ``commit_set``.

    Returns:
        Dict mapping ``(file, word)`` tuples to the normalised covariance.
        Pairs whose covariance is exactly zero are left out, and pairs that
        never occur together are absent because ``count_files_words`` does
        not produce them.

    Raises:
        KeyError: If a file or word of a counted pair is missing from the
            probability or variance dicts.
        ZeroDivisionError: If a pair has non-zero covariance while one of
            its variances is zero.
    """
    # Identity test avoids calling a custom ``__eq__`` on the argument.
    if versions is None:
        versions = commit_set.keys()
    # Measured here and iterated again by count_files_words, so materialise
    # once; this also lets callers pass a one-shot iterator.
    versions = list(versions)
    total = len(versions)
    joint = count_files_words(commit_set, wordset, versions=versions)
    rho = {}
    for pair, count in joint.items():
        fname, word = pair
        # Computed once and reused for both the filter and the ratio.
        cov = count / total - ps[fname] * pw[word]
        if cov != 0:
            rho[pair] = cov / (math.sqrt(vs[fname]) * math.sqrt(vw[word]))
    return rho

In [17]:
# Read the pickled object stored in 'commit_set.bin'. Later cells use it as
# a mapping (commit_set.keys(), commit_set[v]).
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only open files from a trusted source.
with open('commit_set.bin', 'rb') as f:
    commit_set = pickle.load(f) # load commit_set

In [21]:
# Read the pickled object stored in 'word_set.bin'. Later cells index it by
# version key and merge the word collections found under each key.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only open files from a trusted source.
with open('word_set.bin', 'rb') as f:
    word_set = pickle.load(f) # load wordset

In [18]:
# Analyse every key of commit_set.
# NOTE(review): the same keys are later used to index word_set, so word_set
# presumably contains all of them -- confirm.
versions = commit_set.keys()

In [24]:
# Number of versions in which each file / each word appears.
# NOTE(review): cs and cw are not referenced by any cell visible here.
cs = count_files(commit_set, versions=versions)
cw = count_words(word_set, versions=versions)
# Occurrence probability p and variance p * (1 - p), per file and per word.
ps, vs = prob_files(commit_set, versions=versions)
pw, vw = prob_words(word_set, versions=versions)
# Normalised covariance for every (file, word) pair that occurs together.
rho = cov_files_words(commit_set, word_set, ps, vs, pw, vw, versions=versions)

In [26]:
# Sorted file and word lists; their order fixes the rows and columns of the
# matrix built in the next cell.
flist = get_files(commit_set, versions=versions)
wlist = get_words(word_set, versions=versions)
# Name -> position lookups into flist (rows) and wlist (columns).
rowi = {x:i for i,x in enumerate(flist)}
coli = {x:i for i,x in enumerate(wlist)}

In [27]:
# One (row, column, value) triple per (file, word) pair in rho.
elem = [(rowi[k[0]], coli[k[1]], v)for k,v in rho.items()]
# Rearranged into the (data, (row_ind, col_ind)) form taken by coo_matrix.
m = ([x[2] for x in elem], ([x[0] for x in elem], [x[1] for x in elem]))
# Dense files x words array; pairs that are absent from rho stay at 0.
# NOTE(review): .toarray() allocates len(rowi) * len(coli) entries, which
# may be large for a big repository -- confirm it fits in memory.
X = sparse.coo_matrix(m, shape=(len(rowi), len(coli))).toarray()

In [28]:
def get_wordlist(fname, X, rowi, coli, flist, wlist, ranks=10):
    """Print ``fname`` and return the words ranked highest for that file.

    Args:
        fname: File name; must be a key of ``rowi``.
        X: Two-dimensional array indexed as ``X[row, column]``.
        rowi: Mapping from file name to row index of ``X``.
        coli: Not used by this function.
        flist: Not used by this function.
        wlist: Words in column order of ``X``.
        ranks: Maximum number of entries to return.

    Returns:
        List of ``(word, value)`` tuples taken from the file's row, ordered
        by decreasing absolute value and cut to the first ``ranks`` items.
    """
    print(fname)
    row_values = X[rowi[fname], :]

    def magnitude(entry):
        # Rank by strength of association, ignoring the sign.
        return abs(entry[1])

    ranked = sorted(zip(wlist, row_values), key=magnitude, reverse=True)
    return ranked[:ranks]

In [38]:
def get_filelist(wname, X, rowi, coli, flist, wlist, ranks=10):
    """Print ``wname`` and return the files ranked highest for that word.

    Args:
        wname: Word; must be a key of ``coli``.
        X: Two-dimensional array indexed as ``X[row, column]``.
        rowi: Not used by this function.
        coli: Mapping from word to column index of ``X``.
        flist: File names in row order of ``X``.
        wlist: Not used by this function.
        ranks: Maximum number of entries to return.

    Returns:
        List of ``(file, value)`` tuples taken from the word's column,
        ordered by decreasing absolute value and cut to the first ``ranks``
        items.
    """
    print(wname)
    column_values = X[:, coli[wname]]

    def magnitude(entry):
        # Rank by strength of association, ignoring the sign.
        return abs(entry[1])

    ranked = sorted(zip(flist, column_values), key=magnitude, reverse=True)
    return ranked[:ranks]

In [49]:
# Words with the largest absolute value in this file's row of X
# (default ranks=10).
get_wordlist('java/org/apache/catalina/ha/session/BackupManager.java', X, rowi, coli, flist, wlist)

java/org/apache/catalina/ha/session/BackupManager.java


[('subject', 0.9999999999999998),
 ('princip', 0.8551315688243535),
 ('corsfilt', 0.8064778385455118),
 ('equ', 0.8064778385455118),
 ('namespac', 0.8064778385455118),
 ('classnotfoundexceiv', 0.6416666666666665),
 ('jaasrealm', 0.6416666666666665),
 ('reply', 0.6416666666666665),
 ('spac', 0.6416666666666665),
 ('sur', 0.6416666666666665)]

In [44]:
# Files with the largest absolute value in the column of the word stored at
# position 1000 of wlist.
# NOTE(review): 1000 is an arbitrary position; this raises IndexError when
# wlist has 1000 or fewer entries.
get_filelist(wlist[1000], X, rowi, coli, flist, wlist)

nsi


[('test/org/apache/catalina/core/TestApplicationMapping.java',
  0.6914090450089236),
 ('java/org/apache/catalina/core/ApplicationMapping.java', 0.6800735254367721),
 ('java/org/apache/catalina/servlet4preview/http/PushBuilder.java',
  0.6800735254367721),
 ('java/org/apache/catalina/connector/RequestFacade.java', 0.5642226451917659),
 ('java/org/apache/tomcat/util/scan/AbstractInputStreamJar.java',
  0.5642226451917659),
 ('test/org/apache/tomcat/util/http/TestCookieProcessorGeneration.java',
  0.5642226451917659),
 ('java/org/apache/catalina/core/ApplicationPushBuilder.java',
  0.5496251468844944),
 ('java/org/apache/catalina/core/StandardWrapperValve.java',
  0.5484642268462742),
 ('java/org/apache/catalina/servlet4preview/AsyncContext.java',
  0.5484642268462742),
 ('java/org/apache/catalina/servlet4preview/GenericFilter.java',
  0.5484642268462742)]

In [46]:
# Display the complete sorted file list.
# NOTE(review): this emits every entry; a slice such as flist[:20] would
# keep the saved output short.
flist

['BUILDING.txt',
 'CONTRIBUTING.md',
 'KEYS',
 'LICENSE',
 'MERGE.txt',
 'NOTICE',
 'README.md',
 'RELEASE-NOTES',
 'RUNNING.txt',
 'SVN-MERGE.txt',
 'bin/catalina-tasks.xml',
 'bin/catalina.bat',
 'bin/catalina.sh',
 'bin/ciphers.bat',
 'bin/ciphers.sh',
 'bin/daemon.sh',
 'bin/service.bat',
 'bin/setclasspath.bat',
 'bin/setclasspath.sh',
 'bin/tool-wrapper.bat',
 'bin/tool-wrapper.sh',
 'build.properties.default',
 'build.xml',
 'conf/catalina.policy',
 'conf/catalina.properties',
 'conf/context.xml',
 'conf/jaspic-providers.xml',
 'conf/jaspic-providers.xsd',
 'conf/logging.properties',
 'conf/server.xml',
 'conf/tomcat-users.xml',
 'conf/tomcat-users.xsd',
 'conf/web.xml',
 'java/javax/annotation/Generated.java',
 'java/javax/annotation/ManagedBean.java',
 'java/javax/annotation/PostConstruct.java',
 'java/javax/annotation/PreDestroy.java',
 'java/javax/annotation/Priority.java',
 'java/javax/annotation/Resources.java',
 'java/javax/annotation/security/DeclareRoles.java',
 'java/j