In [10]:
import pickle
import collections
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from itertools import combinations

%matplotlib inline

In [3]:
# Load the 2016 Q1 data set; `q1d` is later passed to construct_newman_graph,
# which reads its `project_id` and `author_id` columns (presumably a pandas
# DataFrame -- confirm against the notebook that wrote this pickle).
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# this file must come from a trusted source.
with open('./data/2016_q1_data.pickle', 'rb') as picklefile:
    q1d = pickle.load(picklefile)

In [4]:
def newman_weight(a, b, author_dict, project_dict):
    """Return the Newman-style collaboration weight between two authors.

    Every project that both authors belong to contributes ``1 / (n - 1)``,
    where ``n`` is the number of authors recorded for that project, so small
    teams count for more than large ones.

    Parameters
    ----------
    a, b : hashable
        Author identifiers; both must be usable as keys of ``author_dict``.
    author_dict : mapping
        Maps an author id to the set of project ids that author belongs to.
    project_dict : mapping
        Maps a project id to the set of author ids on that project.

    Returns
    -------
    int or float
        ``0`` when the authors share no (multi-author) project, otherwise the
        summed weight as a float.
    """
    common = author_dict[a].intersection(author_dict[b])
    weight = 0
    for project in common:
        team_size = len(project_dict[project])
        # A project with fewer than two recorded authors has no co-author
        # pair to weight; skipping it also avoids dividing by zero
        # (reachable e.g. when a == b on a single-author project).
        if team_size < 2:
            continue
        weight += 1.0 / (team_size - 1)
    return weight

In [11]:
def construct_newman_graph(df, mirror = False):
    """Build author/project membership maps and Newman edge weights.

    Parameters
    ----------
    df : DataFrame-like
        Must expose ``project_id`` and ``author_id`` columns; each row states
        that the author belongs to the project.
    mirror : bool, optional
        When true, every edge ``(a, b)`` is also stored as ``(b, a)`` with the
        same weight. Defaults to ``False``.

    Returns
    -------
    tuple
        ``(author_dict, project_dict, weight_dict)`` where

        * ``author_dict`` maps author id -> set of project ids,
        * ``project_dict`` maps project id -> set of author ids,
        * ``weight_dict`` is a ``defaultdict(float)`` keyed by
          ``(src_id, dst_id)`` tuples; un-mirrored keys are ordered so that
          ``src_id <= dst_id``.
    """
    author_dict = collections.defaultdict(set)
    project_dict = collections.defaultdict(set)
    weight_dict = collections.defaultdict(float)

    # Read both columns by name in one pass: this matches the column names the
    # rest of the function relies on and avoids a per-row df.iloc lookup.
    for project_id, author_id in zip(df['project_id'].values, df['author_id'].values):
        project_dict[project_id].add(author_id)
        author_dict[author_id].add(project_id)

    # project_dict preserves first-appearance order of the projects.
    for members in project_dict.values():
        for src_id, dst_id in combinations(members, 2):
            # Canonical orientation so each undirected edge has one key.
            if src_id > dst_id:
                src_id, dst_id = dst_id, src_id
            edge = (src_id, dst_id)
            # newman_weight already sums over *all* shared projects, so a pair
            # only needs to be evaluated the first time it is encountered.
            if edge not in weight_dict:
                weight_dict[edge] = newman_weight(src_id, dst_id, author_dict, project_dict)

    if mirror:
        # Snapshot the items first: the dict grows while mirroring.
        for (src_id, dst_id), weight in list(weight_dict.items()):
            weight_dict[(dst_id, src_id)] = weight

    return author_dict, project_dict, weight_dict

In [12]:
# Build the Q1 membership maps and edge weights, storing each edge in both
# directions.
author_dict, project_dict, weight_dict = construct_newman_graph(q1d, mirror=True)

In [14]:
def get_neighbors(author_dict, project_dict):
    """Collect, for every author, the set of authors sharing a project with them.

    Progress is printed once up front and then after every 10000 authors.

    Parameters
    ----------
    author_dict : mapping
        Maps an author id to the set of project ids that author belongs to.
    project_dict : mapping
        Maps a project id to the set of author ids on that project.

    Returns
    -------
    collections.defaultdict
        ``defaultdict(set)`` with one entry per key of ``author_dict``; the
        value is the set of co-authors, never containing the author itself.
        An author with no projects maps to an empty set.
    """
    print('Total authors to be processed:', len(author_dict))
    print('------------------------------------')
    neighbor_dict = collections.defaultdict(set)
    for count, (author, projects) in enumerate(author_dict.items(), start=1):
        if count % 10000 == 0:
            print('Processing author', count)
        # Touching the defaultdict guarantees an entry even for an author
        # without projects.
        neighbors = neighbor_dict[author]
        for project in projects:
            # Grow the set in place instead of rebuilding it per project.
            neighbors.update(project_dict[project])
        # discard (not remove): the author is absent from their own set when
        # they have no projects, which must not raise KeyError.
        neighbors.discard(author)
    return neighbor_dict

In [15]:
# Derive the Q1 co-author sets from the membership maps built for Q1 above.
neighbor_dict = get_neighbors(author_dict, project_dict)

Total authors to be processed: 176061
------------------------------------
Processing author 10000
Processing author 20000
Processing author 30000
Processing author 40000
Processing author 50000
Processing author 60000
Processing author 70000
Processing author 80000
Processing author 90000
Processing author 100000
Processing author 110000
Processing author 120000
Processing author 130000
Processing author 140000
Processing author 150000
Processing author 160000
Processing author 170000


In [16]:
# Persist the Q1 results. `weight_dict` was built with mirroring enabled, so it
# holds each edge under both (a, b) and (b, a).
with open('./data/2016_q1_newman-weights.pickle', 'wb') as picklefile:
    pickle.dump(weight_dict, picklefile)
# Co-author sets keyed by author id.
with open('./data/2016_q1_newman-neighbours.pickle', 'wb') as picklefile:
    pickle.dump(neighbor_dict, picklefile)

## Generate Graph for 2016 Q2

In [17]:
# Load the 2016 Q2 data set; `q2d` is passed to construct_newman_graph below.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# this file must come from a trusted source.
with open('./data/2016_q2_data.pickle', 'rb') as picklefile:
    q2d = pickle.load(picklefile)

In [18]:
# Build the Q2 membership maps and edge weights, storing each edge in both
# directions.
author_dict2, project_dict2, weight_dict2 = construct_newman_graph(q2d, mirror=True)

In [19]:
# Derive the Q2 co-author sets from the Q2 membership maps.
neighbor_dict2 = get_neighbors(author_dict2, project_dict2)

Total authors to be processed: 141583
------------------------------------
Processing author 10000
Processing author 20000
Processing author 30000
Processing author 40000
Processing author 50000
Processing author 60000
Processing author 70000
Processing author 80000
Processing author 90000
Processing author 100000
Processing author 110000
Processing author 120000
Processing author 130000
Processing author 140000


In [21]:
# Persist the Q2 results. `weight_dict2` was built with mirroring enabled, so
# it holds each edge under both (a, b) and (b, a).
with open('./data/2016_q2_newman-weights.pickle', 'wb') as picklefile:
    pickle.dump(weight_dict2, picklefile)
# Co-author sets keyed by author id.
with open('./data/2016_q2_newman-neighbours.pickle', 'wb') as picklefile:
    pickle.dump(neighbor_dict2, picklefile)

In [22]:
# Number of authors that received a neighbour entry for Q2.
len(neighbor_dict2)

141583

In [24]:
# NOTE(review): no cell visible in this notebook writes this `_1` file (the
# Q2 dump above writes '2016_q2_newman-neighbours.pickle'), so a fresh
# Restart & Run All depends on it already existing -- confirm its origin.
# encoding='latin1' is what the pickle docs prescribe for some Python 2
# pickles; presumably this file was written under Python 2 -- verify.
# pickle.load can execute arbitrary code, so the file must be trusted.
with open('./data/2016_q2_newman-neighbours_1.pickle', 'rb') as picklefile:
    neighbours_1 = pickle.load(picklefile, encoding='latin1')
    

In [25]:
# Key count of the externally loaded mapping, for comparison with the count
# of `neighbor_dict2` computed in this notebook.
len(neighbours_1.keys())

141583