<a href="https://colab.research.google.com/github/qwiksilva/cs224w-github-rec/blob/master/SRW_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab VM so the project data is reachable as a
# regular filesystem path under /gdrive.
from google.colab import drive
drive.mount('/gdrive')

# Directory on the mounted drive that holds the project data.
# NOTE(review): `home` is not referenced by any other visible cell; the loaders
# below hard-code their own default path.
home = "/gdrive/My Drive/Colab Notebooks/cs224w-data"

In [0]:
import pandas as pd
import numpy as np
import networkx as nx
from collections import Counter
from datetime import datetime
import time
from tqdm.auto import tqdm
from collections import defaultdict
from scipy import sparse
import time
import functools
import numpy as np
from scipy.optimize import fmin_bfgs, fmin_l_bfgs_b
import os
import array
from datetime import datetime as dt
from pprint import pprint
import pytz
import random
import joblib
import itertools

In [0]:
def load_srw_data(path="/gdrive/My Drive/Colab Notebooks/cs224w-data/final-data-12062019"):
  """Load the pre-computed SRW inputs (v4, 2018 only) from `path`.

  Args:
    path: directory containing the `*_v4_2018_only.joblib` artefacts.

  Returns:
    Tuple `(train_srw_output, train_node_mapping, test_srw_output,
    test_node_mapping, comments)` where `comments` is the train and test
    comment frames concatenated (train first). `reset_index()` is called
    without `drop=True`, so the previous index is kept as an `index` column.

  NOTE(review): joblib files are pickles; only load artefacts you produced
  yourself, since unpickling can execute arbitrary code.
  """

  def _load(filename):
    # Every artefact lives directly under `path`.
    return joblib.load(os.path.join(path, filename))

  train_srw_output = _load('train_srw_output_v4_2018_only.joblib')
  train_node_mapping = _load('train_node_mapping_v4_2018_only.joblib')
  test_srw_output = _load('test_srw_output_v4_2018_only.joblib')
  test_node_mapping = _load('test_node_mapping_v4_2018_only.joblib')

  train_comments = _load('train_comments_df_v4_2018_only.joblib')
  test_comments = _load('test_comments_df_v4_2018_only.joblib')

  comments = pd.concat([train_comments, test_comments]).reset_index()

  return train_srw_output, train_node_mapping, test_srw_output, test_node_mapping, comments


In [0]:
def create_srw_data_structs(time_based_cutoff=False, path="/gdrive/My Drive/Colab Notebooks/cs224w-data/final-data-12062019"):
  """Build every structure the supervised random walk needs for train and test.

  Args:
    time_based_cutoff: when True, source PRs and their D sets come from
      `get_source_prs_time_based`; otherwise source PRs are sampled with
      `get_source_prs` and each D set comes from `get_source_info`.
    path: data directory forwarded to `load_srw_data`.

  Returns:
    dict with the train/test comment frames, PR id sets, source PRs, the train
    graph with its D and L sets, the train/test feature dicts and node
    mappings, the test graph and the test D sets (all keyed as in the literal
    at the end of this function).

  Side effects: the loaded feature matrices are modified in place by
  `remove_D_set_features` (held-out PR/commenter entries are zeroed).
  """
  print('preparing data...')
  train_srw_output, train_node_mapping, test_srw_output, test_node_mapping, comments = load_srw_data(path)
  # Cutoff is 2018-06-01, localized to UTC before being compared in split_comments.
  cutoff_date = dt(2018, 6, 1)
  utc=pytz.UTC
  cutoff_date = utc.localize(cutoff_date)
  unique_train_prs, unique_test_prs, train_comments, test_comments = split_comments(comments, cutoff_date)

  print('getting source prs...')
  if time_based_cutoff:
    source_prs, D_sets = get_source_prs_time_based(train_comments, cutoff_date)

  else:
    source_prs = get_source_prs(train_comments)
    D_sets = {}
    for source_id in source_prs:
      # Only the "future" commenters are kept; the existing ones are unused here.
      existing_link_commenters, D_set = get_source_info(source_id, train_comments)
      D_sets[source_id] = D_set

  # Translate raw PR / commenter ids into node indices.
  D_sets = map_D_sets_to_idx(D_sets, train_node_mapping)

  print('creating graph...')
  graph = nx.Graph()
  # Edges between a source PR and its D-set members are left out of the graph.
  graph = create_bipartite_graph(graph, train_srw_output, D_sets)

  # At most 100 negative candidates are kept per source node.
  L_sets = get_L_sets(graph, D_sets, max_size=100)

  # Must run after the graph is built: zeroes the held-out entries in place.
  remove_D_set_features(train_srw_output, D_sets)

  test_D_sets = {}
  for pr_id in unique_test_prs:
    existing_link_commenters, test_D_set = get_source_info(pr_id, test_comments)
    # Keep only test PRs with more than one existing and more than one future commenter.
    if len(existing_link_commenters) > 1 and len(test_D_set) > 1:
      test_D_sets[pr_id] = test_D_set

  test_D_sets = map_D_sets_to_idx(test_D_sets, test_node_mapping)

  test_graph = nx.Graph()
  test_graph = create_bipartite_graph(test_graph, test_srw_output, test_D_sets)
  remove_D_set_features(test_srw_output, test_D_sets)
  # train_graph, idx_to_future_links, idx_to_no_links, nodelist = create_train_bipartite_graph(train_graph, train_comments, source_prs)

  return {
      'unique_train_prs': unique_train_prs,
      'unique_test_prs': unique_test_prs,
      'train_comments': train_comments,
      'test_comments': test_comments,
      'source_prs': source_prs,
      'graph': graph,
      'D_sets': D_sets,
      'L_sets': L_sets,
      'train_features':train_srw_output, 
      'train_node_mapping':train_node_mapping, 
      'test_features':test_srw_output, 
      'test_node_mapping':test_node_mapping,
      'test_graph': test_graph,
      'test_D_sets':test_D_sets
  }


def map_D_sets_to_idx(D_sets, node_mapping):
  """Re-key D sets from raw ids to node indices.

  `node_mapping` is looked up with `(id, 'PR')` for the dictionary keys and
  `(id, 'User')` for the set members; a missing entry raises `KeyError`.
  Returns a new dict; the input is not modified.
  """
  remapped = {}
  for pr_id, commenters in D_sets.items():
    user_indices = {node_mapping[(user_id, 'User')] for user_id in commenters}
    remapped[node_mapping[(pr_id, 'PR')]] = user_indices
  return remapped


def split_comments(comments, cutoff_date):
  """Split comments into train/test by the creation date of their PR.

  PRs with `pr_created_at < cutoff_date` are train PRs, those with
  `pr_created_at >= cutoff_date` are test PRs. Each returned frame contains
  every comment whose `pr_id` is in the corresponding set.

  Returns:
    (unique_train_prs, unique_test_prs, train_comments, test_comments)
  """
  created_before = comments['pr_created_at'] < cutoff_date
  created_on_or_after = comments['pr_created_at'] >= cutoff_date

  unique_train_prs = set(comments.loc[created_before, 'pr_id'].unique())
  unique_test_prs = set(comments.loc[created_on_or_after, 'pr_id'].unique())

  pr_ids = comments['pr_id']
  train_comments = comments[pr_ids.isin(unique_train_prs)]
  test_comments = comments[pr_ids.isin(unique_test_prs)]

  return unique_train_prs, unique_test_prs, train_comments, test_comments


def get_source_prs_time_based(train_comments, cutoff_date):
  """Pick source PRs whose new commenters appear only after `cutoff_date`.

  A PR qualifies when it has comments both before and after the cutoff, more
  than one distinct commenter before it, and more than one "new" commenter
  after it. A new commenter is one who (a) did not comment on this PR before
  the cutoff, (b) commented on some PR before the cutoff, and (c) is not the
  PR owner (first `user_id` seen for the PR).

  Args:
    train_comments: frame with `pr_id`, `commenter_id`, `user_id` and
      `comment_created_at` columns.
    cutoff_date: value comparable with `comment_created_at`.

  Returns:
    (source_prs, D_sets): list of qualifying PR ids, in order of first
    appearance among the post-cutoff comments, and a dict mapping each of
    them to its set of new commenter ids.
  """
  before_comments = train_comments[train_comments['comment_created_at'] < cutoff_date]
  after_comments = train_comments[train_comments['comment_created_at'] >= cutoff_date]

  valid_commenters = set(before_comments['commenter_id'].unique())

  # Build the per-PR lookups once instead of re-filtering the frame for every PR.
  commenters_before_by_pr = {
      pr_id: set(ids)
      for pr_id, ids in before_comments.groupby('pr_id')['commenter_id'].unique().items()
  }
  commenters_after_by_pr = {
      pr_id: set(ids)
      for pr_id, ids in after_comments.groupby('pr_id')['commenter_id'].unique().items()
  }
  # First row per PR carries the owner, matching `.unique()[0]` on the PR's rows.
  owner_by_pr = train_comments.drop_duplicates('pr_id').set_index('pr_id')['user_id'].to_dict()

  source_prs = []
  D_sets = {}
  for pr_id in after_comments['pr_id'].unique():
    commenters_before = commenters_before_by_pr.get(pr_id)
    if commenters_before is None:
      # No activity before the cutoff: nothing to seed the walk from.
      continue

    D_set = (commenters_after_by_pr[pr_id] - commenters_before) & valid_commenters
    D_set.discard(owner_by_pr[pr_id])
    if len(commenters_before) > 1 and len(D_set) > 1:
      source_prs.append(pr_id)
      D_sets[pr_id] = D_set

  return source_prs, D_sets

def get_source_prs(train_comments, min_reviewers=4, max_reviewers=16, num_source_prs=200):
  """Randomly pick source PRs with a bounded number of distinct commenters.

  Args:
    train_comments: frame with `pr_id` and `commenter_id` columns.
    min_reviewers, max_reviewers: inclusive bounds on distinct commenters.
    num_source_prs: number of PRs to draw; capped at the number of candidates
      so a small data set no longer raises `ValueError`.

  Returns:
    List of sampled PR ids. The global `random` generator is re-seeded with 42
    on every call, so the result is deterministic for a given input.
  """
  random.seed(42)
  reviewer_counts = train_comments.groupby('pr_id')['commenter_id'].nunique()
  in_range = (reviewer_counts >= min_reviewers) & (reviewer_counts <= max_reviewers)
  # random.sample needs a sequence (sets are rejected since Python 3.11);
  # sorting also makes the draw independent of set iteration order.
  candidate_train_prs = sorted(reviewer_counts[in_range].index)
  sample_size = min(num_source_prs, len(candidate_train_prs))
  source_prs = random.sample(candidate_train_prs, sample_size)
  return source_prs

def get_source_info(source_id, train_comments):
  source_comments = train_comments[train_comments['pr_id'] == source_id].sort_values('comment_created_at')
  num_commenters = source_comments['commenter_id'].nunique()
  num_existing_links = int(num_commenters / 2)

  existing_link_commenters = set()
  future_link_commenters = set()
  owner = train_comments[train_comments['pr_id'] == source_id]['user_id'].unique()[0]
  if owner in source_comments['commenter_id'].unique():
    existing_link_commenters.add(owner)
  for idx, comment in source_comments.iterrows():
    if len(existing_link_commenters) < num_existing_links:
      existing_link_commenters.add(comment['commenter_id'])

    elif len(existing_link_commenters) == num_existing_links and comment['commenter_id'] not in existing_link_commenters:
      future_link_commenters.add(comment['commenter_id'])

  return existing_link_commenters, future_link_commenters

def create_bipartite_graph(graph, features, D_sets):
  """Add one edge per stored entry of `features['num_comment']` to `graph`.

  Pairs held out in `D_sets` (in either orientation) are skipped, so the graph
  does not contain the links the model is asked to predict. Asserts that the
  resulting graph is bipartite and returns it.
  """
  comment_counts = features['num_comment'].tocoo()
  for u, v in zip(comment_counts.row, comment_counts.col):
    held_out = (v in D_sets and u in D_sets[v]) or (u in D_sets and v in D_sets[u])
    if not held_out:
      graph.add_edge(u, v)

  assert nx.bipartite.is_bipartite(graph)
  return graph


def get_L_sets(graph, D_sets, max_size=None):
  """Build the negative candidate set (L set) of every source node.

  For each key of `D_sets`, the L set is every node at shortest-path distance
  exactly 3 in `graph`, minus the node's D set.

  Args:
    graph: networkx graph containing the source nodes.
    D_sets: dict source node -> set of held-out nodes.
    max_size: when truthy, larger L sets are down-sampled to this many nodes.

  Returns:
    dict source node -> L set. Down-sampled entries are lists (as returned by
    `random.sample`); the others are sets. The global `random` generator is
    re-seeded with 42 on every call.
  """
  L_sets = {}
  random.seed(42)
  for source_node in D_sets:
    path_lengths = nx.single_source_shortest_path_length(graph, source_node, cutoff=3)
    L_set = {node for node, dist in path_lengths.items() if dist == 3} - D_sets[source_node]
    if max_size and len(L_set) > max_size:
      # random.sample rejects sets since Python 3.11; sorting gives a sequence
      # and a draw that does not depend on set iteration order.
      L_set = random.sample(sorted(L_set), max_size)
    L_sets[source_node] = L_set
  return L_sets

def remove_D_set_features(features, D_sets):
  """Zero the feature entries of every held-out pair, in both orientations.

  Every matrix in `features.values()` is modified in place; nothing is returned.
  """
  held_out_pairs = [(src, dst) for src, targets in D_sets.items() for dst in targets]
  for feature_matrix in features.values():
    for src, dst in held_out_pairs:
      feature_matrix[src, dst] = 0
      feature_matrix[dst, src] = 0

def create_train_bipartite_graph(graph, train_comments, source_prs):
  """Build a PR/commenter graph keyed by string node labels.

  Nodes are labelled `p<pr_id>` and `u<commenter_id>`. Non-source PRs are
  linked to all their commenters; source PRs only to their "existing"
  commenters (see `get_source_info`).

  Returns:
    (graph, idx_to_future_links, idx_to_no_links, nodelist) where
      * `nodelist` lists PR labels (ordered by `pr_created_at`) followed by
        commenter labels,
      * `idx_to_future_links` maps the nodelist position of each source PR to
        its set of future commenter ids (raw ids),
      * `idx_to_no_links` maps the same position to the node labels at
        distance exactly 3 from the PR, minus its future commenters.
  """
  def pr_label(pr_id):
    return f"p{int(pr_id)}"

  def user_label(commenter_id):
    return f"u{int(commenter_id)}"

  non_source_prs = set(train_comments['pr_id'].unique()) - set(source_prs)
  non_source_comments = train_comments[train_comments['pr_id'].isin(non_source_prs)]
  edges = set()

  pr_groups = non_source_comments.groupby('pr_id')['commenter_id'].unique()
  # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
  for pr_id, commenter_ids in pr_groups.items():
    pr_node_id = pr_label(pr_id)
    for commenter_id in commenter_ids:
      edges.add((pr_node_id, user_label(commenter_id)))

  nodelist = [pr_label(pr_id)
              for pr_id in train_comments.sort_values('pr_created_at')['pr_id'].unique().tolist()]
  nodelist.extend(user_label(commenter_id)
                  for commenter_id in train_comments['commenter_id'].unique())

  # Position lookup built once (first occurrence wins, like list.index).
  node_index = {}
  for position, node in enumerate(nodelist):
    node_index.setdefault(node, position)

  idx_to_future_links = {}
  for source_id in source_prs:
    pr_node_id = pr_label(source_id)
    existing_link_commenters, future_link_commenters = get_source_info(source_id, train_comments)
    # Key by the PR's nodelist position so the lookup below can find it.
    idx_to_future_links[node_index[pr_node_id]] = future_link_commenters
    for commenter_id in existing_link_commenters:
      edges.add((pr_node_id, user_label(commenter_id)))

  graph.add_edges_from(edges)
  assert nx.bipartite.is_bipartite(graph)

  idx_to_no_links = {}
  for source_id in source_prs:
    pr_node_id = pr_label(source_id)
    pr_position = node_index[pr_node_id]
    path_lengths = nx.single_source_shortest_path_length(graph, pr_node_id, cutoff=3)
    no_link_set = {node for node, dist in path_lengths.items() if dist == 3}
    # Graph nodes are labels, future links are raw ids: convert before subtracting.
    future_labels = {user_label(commenter_id) for commenter_id in idx_to_future_links[pr_position]}
    idx_to_no_links[pr_position] = no_link_set - future_labels

  return graph, idx_to_future_links, idx_to_no_links, nodelist


In [0]:
# Build all train/test structures; source PRs are sampled at random rather than
# selected with the time-based cutoff.
data = create_srw_data_structs(time_based_cutoff=False)

In [0]:
# Show the keys of the dictionary returned by create_srw_data_structs.
list(data)

In [0]:
# Raw PR ids present in the test comments.
# NOTE(review): `test_nodes` is reassigned with node indices in later cells.
test_nodes = data['test_comments'].pr_id.unique()

In [0]:
# Unpack the structures used by the optimisation cells below.
graph = data['graph']
# NOTE(review): this assumes the graph's nodes are exactly 0..len(graph)-1 — confirm;
# `adjacency` is not used by the other visible cells.
adjacency = nx.adjacency_matrix(graph, nodelist=range(len(graph)))

D_set = data['D_sets']
L_set = data['L_sets']
features = data['train_features']
test_features  = data['test_features']
train_nodes = [node for node in D_set]
test_D_sets = data['test_D_sets']

# Convert every feature matrix to CSR (replaces the entries of the dicts held in `data`).
for k, m in features.items():
    features[k] = sparse.csr_matrix(m)

for k, m in test_features.items():
    test_features[k] = sparse.csr_matrix(m)

# One initial weight per feature, drawn from [0, 1). The bounds were previously
# passed as (1, 0); NumPy documents high < low as undefined behaviour.
weights = np.random.uniform(0, 1, [len(features)])

In [0]:
# Summary statistics of the train/test D sets, the L sets and the source-node degrees.
D_set_size = np.array([len(members) for members in D_set.values()])
print(len(D_set_size), D_set_size.mean(), D_set_size.max())

test_D_set_size = np.array([len(members) for members in test_D_sets.values()])
print(len(test_D_set_size), test_D_set_size.mean(), test_D_set_size.max())

L_set_size = np.array([len(members) for members in L_set.values()])
print(L_set_size.mean(), L_set_size.max())

# Number of (d, l) pairs per source node; D_set and L_set share the same key order.
combined_size = np.array([l_size * d_size for l_size, d_size in zip(L_set_size, D_set_size)])
print(combined_size.mean(), combined_size.max())

degrees = np.array([graph.degree[node] for node in D_set])
print(degrees.mean(), degrees.max())

In [0]:
# graph = nx.DiGraph()
# graph = nx.read_weighted_edgelist("kubernetes_comment_network.edgelist", create_using=graph)
# adjacency = nx.adjacency_matrix(graph)
# # train_comments = pd.read_csv('kubernetes_train_comments.csv')
# # file_paths = pd.read_csv('kubernetes_file_paths_final.csv')
# # test_comments = pd.read_csv('kubernetes_test_comments.csv')

# #train_comments['comment_time_tics'] = train_comments['comment_created_at'].map(convert_to_tic)

In [0]:
# rows, cols = adjacency.nonzero()
# count = 0
# train_nodes = set()
# train_edges = []

# for row, col in zip(rows, cols):
    
#     if len(train_nodes) <= 2 and np.sum(adjacency[row, ]) >= 10:
#         train_nodes.add(row)
    
#     if row in train_nodes:
#         train_edges.append((row, col))

# D_set = {k:[] for k in train_nodes}
# L_set = {k:[] for k in train_nodes}
# # 
# for row, col in train_edges:
#     if np.random.uniform() > 0.5:
#         adjacency[row, col] = 0
#         D_set[row].append(col)

# for i in range(adjacency.shape[0]):
#     for k in L_set:
#         if i not in D_set[k]:
#             L_set[k].append(i)
            
        

In [0]:
# ##Making Features

# features = {
#     "ft1" : sparse.lil_matrix(adjacency.shape),
#     "ft2" : sparse.lil_matrix(adjacency.shape),
# }

# weights = np.random.uniform(1, 0, [len(features)])

# rows, cols = adjacency.nonzero()
# for row, col in zip(rows, cols):
#     features["ft1"][row, col] = np.random.uniform(0, 1)
#     features["ft2"][row, col] = np.random.uniform(0, 1)
    
# for k, m in features.items():
#     features[k] = sparse.csr_matrix(m)


In [0]:
def features2EdgeStrength(fts, wts):
    """Compute logistic edge strengths and their derivatives w.r.t. each weight.

    For every stored entry (u, v) of the weighted feature sum
    z = sum_k wts[k] * fts[k], the edge strength is a = 1 / (1 + exp(-z)).
    Entries that are not stored in the sum stay at 0 (no edge).

    Args:
        fts: dict feature name -> sparse matrix; all of the same shape.
            Weights are matched to features by dictionary order.
        wts: sequence with one weight per feature.

    Returns:
        (a, weight_derivatives): the CSR edge-strength matrix and a list with
        one CSR matrix per weight holding da/dw_k = fts[k] * a * (1 - a),
        computed elementwise.
    """
    feature_names = list(fts)
    a = sparse.csr_matrix(fts[feature_names[0]].shape, dtype=float)
    for i, k in enumerate(feature_names):
        a = a + fts[k] * wts[i]
    a = sparse.csr_matrix(a)

    # Logistic function applied to the stored entries only.
    a.data = 1.0 / (1.0 + np.exp(-a.data))

    # Derivative of the logistic function at every stored entry.
    slope = a.copy()
    slope.data = a.data * (1.0 - a.data)

    # `*` and `**` on scipy sparse matrices are matrix product and power, so the
    # elementwise chain rule has to go through .multiply().
    weight_derivatives = [
        sparse.csr_matrix(fts[k]).multiply(slope).tocsr() for k in feature_names
    ]

    return a, weight_derivatives

def EdgeStrengthToTransitionMatrices(A, alpha, train_nodes):
    """Build one random-walk-with-restart transition matrix per source node.

    The edge-strength matrix is row-normalised (rows summing to 0 stay all
    zero), then for each node n in `train_nodes`:
        Q_n = (1 - alpha) * T + alpha * R_n
    where R_n has a 1 in column n of every row (restart at n).

    Args:
        A: square sparse edge-strength matrix.
        alpha: restart probability.
        train_nodes: iterable of source node indices.

    Returns:
        List of sparse matrices, one per entry of `train_nodes`, in order.
    """
    nnodes = A.shape[0]

    # Inverse row sums in one pass; rows without out-strength keep a 0 factor.
    row_sums = np.asarray(A.sum(axis=1), dtype=float).ravel()
    inv_row_sums = np.zeros_like(row_sums)
    has_mass = row_sums != 0
    inv_row_sums[has_mass] = 1.0 / row_sums[has_mass]

    T = sparse.diags(inv_row_sums).dot(A)
    walk_part = (1 - alpha) * T

    all_rows = np.arange(nnodes)
    ones = np.ones(nnodes)

    T_matrices = []
    for n in train_nodes:
        # Column n filled with ones, built directly instead of assigning a
        # dense row into a CSR matrix and transposing it.
        restart = sparse.csr_matrix((ones, (all_rows, np.full(nnodes, n))), shape=A.shape)
        T_matrices.append(walk_part + alpha * restart)

    return T_matrices

def iterPageRank(pr, trans, epsilon = 1e-4, max_iter = 1000):
    """Power-iterate `pr <- pr . trans` until the update is below `epsilon`.

    Args:
        pr: initial distribution (1-D array-like or 1 x n sparse matrix).
        trans: sparse n x n transition matrix.
        epsilon: convergence threshold on the matrix 1-norm of the update
            (maximum absolute column sum, i.e. the largest entry change for a
            single-row vector) — the same quantity
            `sparse.linalg.norm(..., ord=1)` returns.
        max_iter: upper bound on the number of iterations, so a non-converging
            chain cannot hang the notebook.

    Returns:
        Row 0 of the final iterate, as a 1 x n sparse matrix. If `max_iter`
        is reached first, a RuntimeWarning is emitted and the last iterate is
        returned.
    """
    import warnings

    pr = sparse.csr_matrix(pr)
    for _ in range(max_iter):
        pr_new = pr.dot(trans)
        delta = abs(pr_new - pr).sum(axis=0).max()
        pr = pr_new
        if delta < epsilon:
            break
    else:
        warnings.warn("iterPageRank did not converge within max_iter iterations",
                      RuntimeWarning)

    return pr[0]

    
def costFunc(pl, pd, offset):
    """Logistic penalty on the score margin `pl - pd`, scaled by `offset`.

    Returns 1 / (1 + exp(-(pl - pd) / offset)); works on scalars and arrays.
    """
    scaled_margin = (pl - pd) / offset
    return 1.0 / (1 + np.exp(-scaled_margin))
    
def get_loss(D_set, L_set, weights, A, T_matrices, train_nodes, lmbda, offset = 0.1):
  """Regularised ranking loss of the supervised random walk.

  For every source node, the random-walk scores are computed from its
  transition matrix, and `costFunc(p_l, p_d, offset)` is summed over all pairs
  of a negative node l (from `L_set`) and a positive node d (from `D_set`).

  Args:
    D_set, L_set: dicts source node -> iterable of node indices.
    weights: current feature weights (only used for the regulariser).
    A: edge-strength matrix; only its first dimension is read.
    T_matrices: transition matrices aligned with `train_nodes`.
    train_nodes: source node indices.
    lmbda: regularisation strength.
    offset: width of the logistic cost.

  Returns:
    lmbda * sum(w_k ** 2) + pairwise cost. The squared norm matches the
    `2 * lmbda * w_k` term added to the gradient in `backward_pass`.
  """
  cost = 0
  nnodes = A.shape[0]
  start = np.repeat(1.0/nnodes, nnodes)
  for i, ind in enumerate(train_nodes):
      pgrank = iterPageRank(start, T_matrices[i])
      # Densify once; indexing the sparse row entry by entry is very slow.
      scores = pgrank.toarray()[0]

      d_idx = np.array(list(D_set[ind]), dtype=int)
      l_idx = np.array(list(L_set[ind]), dtype=int)
      if d_idx.size == 0 or l_idx.size == 0:
          continue

      # |D| x |L| grid of pairwise costs.
      cost += costFunc(scores[l_idx][np.newaxis, :], scores[d_idx][:, np.newaxis], offset).sum()

  weight_vec = np.asarray(weights, dtype=float)
  loss = lmbda * np.sum(weight_vec ** 2) + cost
  return loss

In [0]:
def diffTransition(A, dw, alpha = 0.2):
  """Derivative of the restart-walk transition matrix w.r.t. each weight.

  With row sums s_i = sum_j A[i, j] and, for weight k, t_i = sum_j dw[k][i, j],
  the quotient rule gives
      dQ_k[i, :] = (1 - alpha) * (dw[k][i, :] * s_i - t_i * A[i, :]) / s_i ** 2.
  Rows with s_i == 0 are left at zero.

  Args:
    A: square sparse edge-strength matrix.
    dw: list of sparse matrices, derivative of A w.r.t. each weight.
    alpha: restart probability (the restart term has no weight dependence).

  Returns:
    List of CSR matrices (float64), one per entry of `dw`.
  """
  row_weight_sums = np.asarray(A.sum(axis=1), dtype=float).ravel()
  inv_sums = np.zeros_like(row_weight_sums)
  has_mass = row_weight_sums != 0
  inv_sums[has_mass] = 1.0 / row_weight_sums[has_mass]
  inv_sums_diag = sparse.diags(inv_sums)

  dQ = []
  for w_diff in dw:
    row_wdiff_sums = np.asarray(w_diff.sum(axis=1), dtype=float).ravel()
    # (dA * s - t * A) / s^2  ==  dA / s  -  (t / s^2) * A, applied row-wise.
    first_term = inv_sums_diag.dot(w_diff)
    second_term = sparse.diags(row_wdiff_sums * inv_sums ** 2).dot(A)
    dQ.append(sparse.csr_matrix((1 - alpha) * (first_term - second_term)))

  return dQ

In [0]:
def iterPageDiff(p, trans, transdiff, epsilon = 1e-3, max_iter = 300):
    """Iterate the derivative of the stationary scores w.r.t. one weight.

    Repeats `pdiff <- pdiff . trans + p . transdiff`, starting from zero,
    until the L1 change between iterates drops below `epsilon` or `max_iter`
    iterations have run.

    Args:
        p: 1 x n sparse matrix of random-walk scores.
        trans: sparse transition matrix.
        transdiff: sparse derivative of `trans` w.r.t. the weight.
        epsilon: convergence threshold on the summed absolute change.
        max_iter: iteration cap; previously accepted but ignored, which
            allowed an endless loop on non-converging input.

    Returns:
        Row 0 of the last iterate, as a 1 x n sparse matrix.
    """
    pdiff = sparse.csr_matrix(p.shape)
    # Constant across iterations, so computed once.
    p_transdiff = p.dot(transdiff)
    for _ in range(max_iter):
        pr_diff_new = pdiff.dot(trans) + p_transdiff
        delta = abs(pr_diff_new - pdiff).sum()
        pdiff = pr_diff_new
        if delta < epsilon:
            break

    return pdiff[0]

In [0]:
# Random-walk scores for the first source node, starting from the uniform distribution.
# NOTE(review): `A` and `T_matrices` are only assigned at notebook level in a later
# cell, so this cell fails on a fresh top-to-bottom run.
pp = np.repeat(1.0/A.shape[0], A.shape[0]) # may want to initialize pagerank with p from last iteration?
pgrank = iterPageRank(pp, T_matrices[0])


In [0]:
# Derivative of the first source node's scores w.r.t. the first weight.
# NOTE(review): `diffQ` is only assigned at notebook level in a later cell.
iterPageDiff(pgrank[0], T_matrices[0], diffQ[0])

In [0]:
def forward_pass(weights, D_Set, L_set, features, train_nodes, lmbda, alpha, offset):
  """Evaluate the training loss for `weights`.

  `weights` comes first so the function can be passed directly as `func` to
  `fmin_l_bfgs_b`, with the remaining parameters supplied through `args`.

  Args:
    weights: one weight per feature.
    D_Set, L_set: dicts source node -> positive / negative node indices.
    features: dict feature name -> sparse matrix.
    train_nodes: source node indices.
    lmbda: regularisation strength.
    alpha: restart probability.
    offset: width of the logistic cost.

  Returns:
    The scalar loss from `get_loss`.
  """
  print("calculating EdgeStrength")
  A, dw = features2EdgeStrength(features, weights)
  print("calculating TransitionMatrices")
  T_matrices = EdgeStrengthToTransitionMatrices(A, alpha, train_nodes)
  print("calculating loss")
  # Use the D_Set argument; the body previously read the notebook global `D_set`.
  loss = get_loss(D_Set, L_set, weights, A, T_matrices, train_nodes, lmbda, offset)
  print("finished forward, loss:", loss)
  return loss

In [0]:
def costDiff(pl, pd, offset):
    """Derivative of `costFunc` with respect to the margin `pl - pd`.

    Computed as (1 / offset) * exp(-(pl - pd) / offset) * costFunc(pl, pd, offset) ** 2.
    """
    decay = np.exp(-1.0 * (pl - pd)/offset)
    squared_cost = costFunc(pl, pd, offset) ** 2
    return (1.0 / offset) * decay * squared_cost

In [0]:
def lossDiff(T_matrices, diffQ, trian_nodes, offset, D_set, L_set):
  """Gradient of the pairwise cost with respect to every feature weight.

  Args:
    T_matrices: transition matrices aligned with `trian_nodes`.
    diffQ: list with the derivative of the transition matrix per weight.
    trian_nodes: source node indices (parameter name kept, misspelling
      included, so existing keyword callers keep working).
    offset: width of the logistic cost.
    D_set, L_set: dicts source node -> positive / negative node indices.

  Returns:
    float64 array with one gradient entry per element of `diffQ`
    (regularisation term not included).
  """
  # Float accumulator: an integer array silently truncates every update.
  diffWeights = np.zeros(len(diffQ), dtype=float)

  nnodes = T_matrices[0].shape[0]
  start = np.repeat(1.0/nnodes, nnodes) # may want to initialize pagerank with p from last iteration?
  for i, ind in tqdm(enumerate(trian_nodes)):
      pgrank = iterPageRank(start, T_matrices[i])
      pgrank_vec = pgrank.toarray()[0]

      d_idx = np.array(list(D_set[ind]), dtype=int)
      l_idx = np.array(list(L_set[ind]), dtype=int)
      if d_idx.size == 0 or l_idx.size == 0:
          continue

      # |D| x |L| grid of cost derivatives, shared by all weights.
      lossdiffs = costDiff(pgrank_vec[l_idx][np.newaxis, :],
                           pgrank_vec[d_idx][:, np.newaxis], offset)

      for k in range(len(diffQ)):
          pdiff_vec = iterPageDiff(pgrank[0], T_matrices[i], diffQ[k]).toarray()[0]
          # d(p_l - p_d)/dw_k for every (d, l) pair.
          margin_diffs = pdiff_vec[l_idx][np.newaxis, :] - pdiff_vec[d_idx][:, np.newaxis]
          diffWeights[k] += np.sum(lossdiffs * margin_diffs)

  return diffWeights


In [0]:
def backward_pass(weights, D_Set, L_set, features, train_nodes, lmbda, alpha, offset):
  """Gradient of the regularised loss with respect to `weights`.

  The signature mirrors `forward_pass` so the function can be used as
  `fprime` in `fmin_l_bfgs_b`.

  Args:
    weights: one weight per feature.
    D_Set, L_set: dicts source node -> positive / negative node indices.
    features: dict feature name -> sparse matrix.
    train_nodes: source node indices.
    lmbda: regularisation strength.
    alpha: restart probability.
    offset: width of the logistic cost.

  Returns:
    float64 array: pairwise-cost gradient plus `2 * lmbda * weights`.
  """
  print("calculating EdgeStrength")
  A, dw = features2EdgeStrength(features, weights)
  print("calculating TransitionMatrices")
  T_matrices = EdgeStrengthToTransitionMatrices(A, alpha, train_nodes)

  print("calculating diffTransition")
  # Forward alpha so the derivative uses the same restart probability as T_matrices.
  diffQ = diffTransition(A, dw, alpha)
  print("calculating lossDiff")
  # Use the D_Set argument; the body previously read the notebook global `D_set`.
  cost_grad = np.asarray(lossDiff(T_matrices, diffQ, train_nodes, offset, D_Set, L_set),
                         dtype='float64')

  # Regularisation gradient, added in floating point.
  diffWeights = cost_grad + 2.0 * lmbda * np.asarray(weights, dtype='float64')

  print("finished backward")
  return diffWeights
    



In [0]:
# Gradient at the current trained weights (lmbda=0.2, alpha=0.3, offset=0.1).
# backward_pass takes the weights as its FIRST argument.
# NOTE(review): `wht` is assigned in the gradient-descent cell further down.
backward_pass(wht, D_set, L_set, features, train_nodes, 0.2, 0.3, 0.1)

In [0]:
# Compare the latest gradient with the initial weights.
# NOTE(review): `diffWeights` is only assigned at notebook level in later cells.
print( diffWeights, weights)

In [0]:
# Display the current weights of the gradient-descent loop (assigned in the next cell).
wht

In [0]:
# Plain gradient descent on the SRW loss.
# NOTE(review): `lmb`, `alpha` and `offset` are assigned in a later cell.
max_iter = 500
lr = 1e-5
wht = weights
for i in range(max_iter):
  # Both passes take the weights as their first argument.
  loss = forward_pass(wht, D_set, L_set, features, train_nodes, lmb, alpha, offset)
  diffWeights = backward_pass(wht, D_set, L_set, features, train_nodes, lmb, alpha, offset)
  wht = [w_k - g_k * lr for w_k, g_k in zip(wht, diffWeights)]
  print("new weights", wht)
  print(("iteration %d, loss %f") % (i, loss))
  



In [0]:
# Build the intermediate matrices by hand for step-by-step inspection.
alpha = 0.3  # was `fprintalpha = 0.3`, which left `alpha` unset for the lines below
A, dw = features2EdgeStrength(features, weights)
T_matrices = EdgeStrengthToTransitionMatrices(A, alpha, train_nodes)
# Pass alpha explicitly so the derivative matches the transition matrices.
diffQ = diffTransition(A, dw, alpha)

In [0]:
# Cost gradient for the matrices built in the previous cell (no regularisation term).
offset = 0.1
diffWeights = lossDiff(T_matrices, diffQ, train_nodes, offset, D_set, L_set)

In [0]:

# Loss at the initial weights; forward_pass takes the weights as its FIRST argument.
# NOTE(review): `lmb` is assigned in a later cell.
forward_pass(weights, D_set, L_set, features, train_nodes, lmb, alpha, offset)

In [0]:
# Display the initial feature weights.
weights

In [0]:
# Gradient at the initial weights (lmbda=0.2, alpha=0.3, offset=0.1).
# backward_pass takes the weights as its FIRST argument.
backward_pass(weights, D_set, L_set, features, train_nodes, 0.2, 0.3, 0.1)

In [0]:
# Hyper-parameters for the L-BFGS run.
lmb = 1        # regularisation strength
alpha = 0.2    # restart probability
offset = 0.1   # width of the logistic cost
# L-BFGS-B optimises over real vectors, so the start point must be real (was complex).
w = np.array(weights, dtype='float64', order='F')

In [0]:
# Fit the feature weights with L-BFGS-B: forward_pass supplies the loss and
# backward_pass the gradient; both receive the weights followed by `args`.
# A later cell reads the optimised weights from beta_Opt[0].
beta_Opt = fmin_l_bfgs_b(func = forward_pass,
                         x0 = w, 
                         fprime=backward_pass,
                         args = (D_set, L_set, features, train_nodes, lmb, alpha, offset))
                        # approx_grad=True)


In [0]:
# Inverse of the test node mapping: node index -> raw id, split by node type.
# `test_node_mapping` was never bound at notebook level; take it from `data`.
test_node_mapping = data['test_node_mapping']

ind2map = {'User': {},
           'PR': {},
           }
for (ID, typ), node_idx in test_node_mapping.items():
  # Anything that is not a 'User' node is treated as a PR.
  bucket = 'User' if typ == 'User' else 'PR'
  ind2map[bucket][node_idx] = ID

In [0]:
# Display the test comments frame.
# NOTE(review): `test_comments` is first assigned at notebook level in the next cell.
test_comments

In [0]:
# For every labelled test PR, collect the commenters that are NOT part of its
# label set (commenters of the PR minus the labelled ones).
# NOTE(review): `new_labels` is built in a later cell, so this cell depends on
# out-of-order execution.
test_comments = data['test_comments']
seen_prs = {}
for pr, group in test_comments.groupby('pr_id'):
  # PRs without labels are skipped.
  if pr not in new_labels:
    continue
  cur_commenters = set(group.commenter_id.unique())
  cur_dset = set(new_labels[pr])

  seen_prs[pr] = cur_commenters - cur_dset

In [0]:
# Node indices of every test PR, looked up with (pr_id, 'PR') in the test node mapping.
test_nodes = data['unique_test_prs']
test_nodes = [test_node_mapping[(k, 'PR')] for k in test_nodes]


In [0]:
# Display the test D sets (keyed by PR node index).
data['test_D_sets']

In [0]:
# Raw user id for node index `ind`.
# NOTE(review): `ind` is not assigned at notebook level in any visible cell.
ind2map['User'][ind]

In [0]:
# Ground-truth labels for evaluation: PR node index -> set of user node indices.
labels = data['test_D_sets']
# Preview a few entries (the original `labels[]` was a SyntaxError).
list(labels.items())[:3]

In [0]:
# PR node indices that have labels.
labels.keys()

In [0]:
# Score every user for each labelled test PR with the optimised weights.
# NOTE(review): `getPredictions` is defined in a later cell of this notebook.
test_features = data['test_features']
test_nodes = labels.keys()#[test_node_mapping[(k, 'PR')] for k in data['unique_test_prs']]
preds = getPredictions(test_features, beta_Opt[0], ind2map, test_nodes)

In [0]:
# Sort each PR's (score, user_id) tuples in descending order (score first, ties by user id).
for i in preds:
  preds[i] = sorted(preds[i], reverse = True)

In [0]:
# Raw PR ids that received predictions.
preds.keys()

In [0]:
# Print the ten highest-scoring candidates for the first PRs in `preds`.
# The original counter loop printed 11 PRs (it broke after printing when the
# counter reached 10); that count is kept, but without rebinding the builtin `iter`.
for pred in itertools.islice(preds.values(), 11):
    print([cand for score, cand in sorted(pred, key=lambda x: x[0], reverse=True)][:10])

In [0]:
# First 20 (score, user_id) entries for PR 65549.
preds[65549][:20]

In [0]:
# Translate the labels from node indices back to raw ids:
# raw PR id -> list of raw user ids.
new_labels = {
    ind2map['PR'][pr_idx]: [ind2map['User'][user_idx] for user_idx in user_indices]
    for pr_idx, user_indices in labels.items()
}


In [0]:
# Labelled raw user ids for PR 65558.
new_labels[65558]

In [0]:
# Node indices of all test PRs, printed in ascending order.
test_nodes = [test_node_mapping[(k, 'PR')] for k in data['unique_test_prs']]
print(sorted(test_nodes))

In [0]:
# Labelled PR node indices in ascending order, for comparison with the list above.
print(sorted(labels.keys()))

In [0]:
# NOTE(review): `p` is not assigned at notebook level in any visible cell; this
# looks like a leftover scratch expression.
p


In [0]:
# First two raw PR ids appearing in the test comments.
data['test_comments'].pr_id.unique()[:2]

In [0]:
# Persist the predictions and the raw-id labels to the working directory.
joblib.dump(preds, "SRW_predictions2.joblib")
joblib.dump(new_labels, "SRW_labels.joblib")

In [0]:
# Persist the already-seen commenters per PR (the original dumped `preds` here,
# duplicating the predictions file under the wrong name).
joblib.dump(seen_prs, "seen_prs.joblib")


In [0]:
# Install PyDrive and create an authenticated Drive client for uploads.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which the first cell imported from google.colab.
drive = GoogleDrive(gauth)

In [0]:
# Upload the local seen_prs.joblib file to Drive and report the new file's id.
uploaded = drive.CreateFile({'title': 'seen_prs.joblib'})
uploaded.SetContentFile('seen_prs.joblib')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

In [0]:
# Ten highest-scoring (score, user_id) entries for PR 67084.
sorted(preds[67084], reverse = True)[:10]

In [0]:
def getPredictions(features, weights, node_mappings, nodes, restart_alpha=None):
  """Score every user node for each PR node in `nodes`.

  Args:
    features: dict feature name -> sparse matrix.
    weights: one weight per feature.
    node_mappings: dict with 'PR' and 'User' sub-dicts mapping node index to raw id.
    nodes: iterable of PR node indices to predict for.
    restart_alpha: restart probability. When None, the notebook-level `alpha`
      is used, which is what the function always did before this parameter
      existed.

  Returns:
    dict raw PR id -> list of (score, raw user id) tuples, one per user node
    other than the PR node itself, in node-index order (unsorted).
  """
  if restart_alpha is None:
    restart_alpha = alpha  # notebook-level global, kept for backward compatibility

  # Materialise once: `nodes` is walked twice and may be a one-shot iterable.
  nodes = list(nodes)

  A, _ = features2EdgeStrength(features, weights)
  nnodes = A.shape[0]
  T_matrices = EdgeStrengthToTransitionMatrices(A, restart_alpha, nodes)

  user_ids = node_mappings['User']
  start = np.repeat(1.0/nnodes, nnodes)
  pred_dict = {}

  for i, row in tqdm(enumerate(nodes)):
    curid = node_mappings['PR'][row]
    scores = iterPageRank(start, T_matrices[i]).toarray()[0]
    pred_dict[curid] = [
        (s, user_ids[ind])
        for ind, s in enumerate(scores)
        if ind in user_ids and ind != row
    ]

  return pred_dict




In [0]:
# Dump the whole test node mapping ((raw id, node type) -> node index).
# NOTE(review): this prints every entry; consider previewing a slice instead.
print(test_node_mapping)