In [7]:
import pickle
import collections
from itertools import combinations
import numpy as np
from tqdm import tqdm, tqdm_notebook
from IPython.lib import backgroundjobs as bg
from sklearn import metrics

In [8]:
with open('./data/project_authors_2016_q1_k25core.pickle', 'rb') as picklefile:
    project_authors_q1 = pickle.load(picklefile, encoding='latin1')

In [9]:
with open('./data/project_authors_2016_q2_k25core.pickle', 'rb') as picklefile:
    project_authors_q2 = pickle.load(picklefile, encoding='latin1')

In [10]:
authors = project_authors_q1['author_id'].unique()
projects = project_authors_q1['project_id'].unique()
len(authors), len(projects)

(3053, 2162)

In [11]:
neighbours = {node: set() for node in authors}

In [12]:
commit_index = collections.defaultdict(dict)
for _, row in tqdm_notebook(project_authors_q1.iterrows()):
    commit_index[row['project_id']][row['author_id']]=row['commit_times']

A Jupyter Widget




In [13]:
commit_index_q2 = collections.defaultdict(dict)
for _, row in tqdm_notebook(project_authors_q2.iterrows()):
    commit_index_q2[row['project_id']][row['author_id']]=row['commit_times']

A Jupyter Widget




In [14]:
for repo, by_author in tqdm_notebook(commit_index.items()):
    edges = combinations(by_author, 2)
    for a, b in edges:
        neighbours[a].add(b)
        neighbours[b].add(a)

A Jupyter Widget




In [15]:
edges_q1 = set()
for repo, by_author in tqdm_notebook(commit_index.items()):
    edges = combinations(by_author, 2)
    for a, b in edges:
        if a > b:
            a, b = b, a
        edges_q1.add((a, b))

A Jupyter Widget




In [16]:
edges_q2 = set()
for repo, by_author in tqdm_notebook(commit_index_q2.items()):
    edges = combinations(by_author, 2)
    for a, b in edges:
        if a > b:
            a, b = b, a
        edges_q2.add((a, b))

A Jupyter Widget




In [17]:
new_edges=edges_q2.difference(edges_q1)
k = len(new_edges)
k

26971

In [18]:
def common_neighbor(neighbours, pairs, cache):    
    for u, v in tqdm_notebook(pairs, mininterval=1):
        if u > v:
            u, v = v, u
        if (u, v) not in cache:
            cache[(u, v)] = len(neighbours[u].intersection(neighbours[v]))

In [19]:
pairs = list(combinations(authors, 2)) # convert to list for nicer progress bar

In [20]:
jobs = bg.BackgroundJobManager()

In [21]:
cn_cache = collections.defaultdict(float)
jobs.new(lambda:common_neighbor(neighbours, pairs, cn_cache))

Starting job # 0 in a separate thread.


<BackgroundJob #0: <function <lambda> at 0x11c65e400>>

A Jupyter Widget

In [None]:
with open('./data/sim_unweighted_cn.pickle', 'wb') as picklefile:
    pickle.dump(cn_cache, picklefile, protocol=2)

In [24]:
def accuracy(similarities, q1, q2):
    top = sorted([(similarities[pair], pair) for pair in similarities if pair not in q1], reverse=True)
    new_edges=q2.difference(q1)
    k = len(new_edges)
    edges = set(edge for weight, edge in top[:k])
    overlap = len(edges.intersection(new_edges))
    return overlap, overlap/k

In [25]:
accuracy(cn_cache, edges_q1, edges_q2)

(4853, 0.17993400318861)

In [None]:
def jaccard_coefficient(neighbours, pairs, cache):
    for u, v in tqdm_notebook(pairs, mininterval=1):
        if u > v:
            u, v = v, u
        if (u, v) not in cache:
            cache[(u, v)] = len(neighbours[u].intersection(neighbours[v]))/len(neighbours[u].union(neighbours[v]))

In [None]:
jc_cache = collections.defaultdict(float)
jobs.new(lambda:jaccard_coefficient(neighbours, pairs, jc_cache))

In [None]:
accuracy(jc_cache, edges_q1, edges_q2)

In [None]:
with open('./data/sim_unweighted_jc.pickle', 'wb') as picklefile:
    pickle.dump(jc_cache, picklefile, protocol=2)

In [26]:
def prefer_attach(neighbours, pairs, cache):
    for u, v in tqdm_notebook(pairs, mininterval=1):
        if u > v:
            u, v = v, u
        if (u, v) not in cache:
            cache[(u, v)] = len(neighbours[u]) * len(neighbours[v])

In [27]:
pa_cache = collections.defaultdict(float)
jobs.new(lambda:prefer_attach(neighbours, pairs, pa_cache))

Starting job # 2 in a separate thread.


<BackgroundJob #2: <function <lambda> at 0x15cd351e0>>

A Jupyter Widget




In [None]:
with open('./data/sim_unweighted_pa.pickle', 'wb') as picklefile:
    pickle.dump(pa_cache, picklefile, protocol=2)

In [28]:
accuracy(pa_cache, edges_q1, edges_q2)

(4312, 0.15987542174928626)

In [29]:
def adamic_adar(neighbours, pairs, cache):
    for u, v in tqdm_notebook(pairs, mininterval=1):
        if u > v:
            u, v = v, u
        if (u, v) not in cache:
            cn = neighbours[u].intersection(neighbours[v])
            cache[(u, v)] = sum(1/np.log(len(neighbours[z]))for z in cn)

In [30]:
aa_cache = collections.defaultdict(float)
jobs.new(lambda:adamic_adar(neighbours, pairs, aa_cache))

Starting job # 3 in a separate thread.


<BackgroundJob #3: <function <lambda> at 0x15cd35400>>

A Jupyter Widget

In [None]:
with open('./data/sim_unweighted_aa.pickle', 'wb') as picklefile:
    pickle.dump(aa_cache, picklefile, protocol=2)

In [71]:
with open('./data/sim_unweighted_aa.pickle', 'rb') as picklefile:
    aa_cache_loaded = pickle.load(picklefile)

In [68]:
len(aa_cache_loaded)

4658878

In [75]:
pairs = [(u,v) if u<v else (v,u) for u,v in combinations(authors, 2)]
truth = [edge in edges_q2 for edge in pairs]
scores = [aa_cache_loaded[edge] for edge in pairs]
fpr, tpr, thres = metrics.roc_curve(truth,scores)
metrics.auc(fpr,tpr)

0.9489987474730646

In [70]:
len(aa_cache_loaded)

6789401