<a href="https://colab.research.google.com/github/qwiksilva/cs224w-github-rec/blob/master/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import joblib

from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
from absl import logging
# Reduce logging output.
logging.set_verbosity(logging.ERROR)

%matplotlib inline
sns.set(color_codes=True)

home = "/gdrive/My Drive/Colab Notebooks/cs224w-data/final-data-12062019"

In [0]:
train_comments_df = joblib.load(os.path.join(home, 'train_comments_df_v4_2018_only.joblib'))
test_comments_df = joblib.load(os.path.join(home, 'test_comments_df_v4_2018_only.joblib'))
filepath_summary_df = joblib.load(os.path.join(home, 'filepath_feat_df_v4_2018_only.joblib'))
train_dict = joblib.load(os.path.join(home, 'train_dict_v4_2018_only.joblib'))
test_dict = joblib.load(os.path.join(home, 'test_dict_v4_2018_only.joblib'))

In [0]:
train_user_list = set(train_comments_df.commenter_id.unique().tolist())
test_user_list = set(test_comments_df.commenter_id.unique().tolist())
train_pr_list = set(train_comments_df.pr_id.unique().tolist())
test_pr_list = set(test_comments_df.pr_id.unique().tolist())
print(len(train_user_list), len(test_user_list), len(train_pr_list), len(test_pr_list))

690 366 2719 3837


In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

def generate_pr_feat_matrix(pr_id_list, filepath_summary_df, train=True, scaler_dict={}):
    col_list = ['average_additions', 'average_changes', 'average_deletions', 'filepath_count', 'c2v_title', 'average_c2v_filepath']
    extract = filepath_summary_df.loc[pr_id_list, col_list[-2:]]
    collection = []
    for feat_name in extract.columns:
        column = extract.loc[:, feat_name].values
        if 'c2v' in feat_name:
            column = np.vstack(column.tolist())
        else:
            if train:
                scaler = StandardScaler()
                column = scaler.fit_transform(column.reshape(-1, 1)) # Potential leakage
                scaler_dict[feat_name] = scaler
            else:
                column = scaler_dict[feat_name].transform(column.reshape(-1, 1))
        # else:
        #     column = column.reshape(-1, 1)
        collection.append(column)
    pr_feat_matrix = np.hstack(collection)
    pr_list = extract.index.values.tolist() # Match up with cosine
    pr_cosine_index_map = dict(zip(pr_list, range(len(pr_list))))
    return pr_feat_matrix, pr_cosine_index_map, scaler_dict

In [0]:
%%time
train_pr_feat_matrix, train_pr_cosine_index_map, train_scaler_dict = generate_pr_feat_matrix(train_pr_list, filepath_summary_df)
print(train_pr_feat_matrix.shape)
train_cosine_sim_matrix = cosine_similarity(train_pr_feat_matrix, train_pr_feat_matrix)
# train_cosine_sim_matrix = - euclidean_distances(train_pr_feat_matrix, train_pr_feat_matrix)

(2719, 200)
CPU times: user 113 ms, sys: 38.1 ms, total: 151 ms
Wall time: 93.2 ms


In [0]:
%%time
test_pr_feat_matrix, test_pr_cosine_index_map, _ = generate_pr_feat_matrix(test_pr_list, filepath_summary_df, train=False, scaler_dict=train_scaler_dict)
test_cosine_sim_matrix = cosine_similarity(test_pr_feat_matrix, train_pr_feat_matrix)
# test_cosine_sim_matrix = - euclidean_distances(test_pr_feat_matrix, train_pr_feat_matrix)

CPU times: user 166 ms, sys: 8.85 ms, total: 174 ms
Wall time: 110 ms


In [0]:
print(train_cosine_sim_matrix.shape, test_cosine_sim_matrix.shape)

(2719, 2719) (3837, 2719)


In [0]:
def generate_pr_user_feat_df(core_df):
    combine = [core_df.groupby(['commenter_id', 'pr_id'])['comment_created_at'].min(),
               core_df.groupby(['commenter_id', 'pr_id'])['comment_created_at'].max(),
               core_df.groupby(['commenter_id', 'pr_id'])['comment'].count()]
    colnames = ['first_comment', 'last_comment', 'num_comment']
    pr_user_feat_df = pd.concat(combine, axis=1)
    pr_user_feat_df.columns = colnames
    user_gb_list = [core_df.groupby(['user_id', 'pr_id'])['username'].nunique().groupby('user_id').count(),
                    core_df.groupby(['commenter_id', 'pr_id'])['commenter_login'].nunique().groupby('commenter_id').count()]
    user_df = pd.concat(user_gb_list, axis=1)
    user_df.columns = ['num_pr_submitted', 'num_pr_commented']
    user_df.rename_axis('commenter_id')
    pr_user_feat_df = pr_user_feat_df.join(user_df, on='commenter_id').fillna(value=0)
    pr_user_feat_df = pr_user_feat_df.join(core_df.groupby('pr_id')['pr_created_at'].max(), on='pr_id')
    pr_user_feat_df['first_comment_age'] = (pr_user_feat_df['first_comment'] - pr_user_feat_df['pr_created_at']).apply(lambda x: x.total_seconds()/86400)
    pr_user_feat_df['last_comment_age'] = (pr_user_feat_df['last_comment'] - pr_user_feat_df['pr_created_at']).apply(lambda x: x.total_seconds()/86400)
    return pr_user_feat_df

train_pr_user_df = generate_pr_user_feat_df(train_comments_df)
test_pr_user_df = generate_pr_user_feat_df(test_comments_df)

In [0]:
train_pr_user_df

Unnamed: 0_level_0,Unnamed: 1_level_0,first_comment,last_comment,num_comment,num_pr_submitted,num_pr_commented,pr_created_at,first_comment_age,last_comment_age
commenter_id,pr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
278,59050,2018-02-02 14:18:54+00:00,2018-02-02 14:18:54+00:00,1,12.0,56.0,2018-01-30 16:47:51+00:00,2.896562,2.896562
278,59097,2018-02-02 20:44:55+00:00,2018-02-02 20:45:04+00:00,2,12.0,56.0,2018-01-31 07:48:15+00:00,2.539352,2.539456
278,59170,2018-02-02 14:25:34+00:00,2018-02-09 17:25:18+00:00,6,12.0,56.0,2018-02-01 05:56:32+00:00,1.353495,8.478310
278,59350,2018-02-06 01:30:04+00:00,2018-02-06 01:30:04+00:00,1,12.0,56.0,2018-02-05 14:46:12+00:00,0.447130,0.447130
278,59535,2018-03-02 19:25:37+00:00,2018-03-02 19:25:37+00:00,1,12.0,56.0,2018-02-08 06:48:20+00:00,22.525891,22.525891
...,...,...,...,...,...,...,...,...,...
37849960,62774,2018-05-02 17:26:18+00:00,2018-05-02 17:26:18+00:00,1,0.0,1.0,2018-04-18 09:20:41+00:00,14.337234,14.337234
38797363,62655,2018-04-27 21:48:47+00:00,2018-04-27 21:48:47+00:00,1,0.0,1.0,2018-04-16 15:08:33+00:00,11.277940,11.277940
38913308,62903,2018-05-03 07:53:16+00:00,2018-05-03 07:54:10+00:00,2,0.0,2.0,2018-04-20 09:36:27+00:00,12.928345,12.928970
38913308,63392,2018-05-03 08:50:55+00:00,2018-05-03 08:50:55+00:00,1,0.0,2.0,2018-05-03 08:30:26+00:00,0.014225,0.014225


In [0]:
train_index2pr_cosine_map = {}
for train_pr_id, row_index in train_pr_cosine_index_map.items():
    train_index2pr_cosine_map[row_index] = train_pr_id

expertise_result = {}
# Expertise Score
for test_pr_id, gt_test_reviewers in tqdm(test_dict.items(), desc="Expertise"):
    row_index = test_pr_cosine_index_map[test_pr_id]
    sorted_idx = np.argsort(-test_cosine_sim_matrix[row_index,])
    candidates = defaultdict(int)
    for idx in sorted_idx[:100]:
        neighbor_train_pr_id = train_index2pr_cosine_map[idx]
        train_reviewers = train_dict[neighbor_train_pr_id]
        for reviewer in train_reviewers:
            candidates[reviewer] += 1
    candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)
    expertise_result[test_pr_id] = candidates

HBox(children=(IntProgress(value=0, description='Expertise', max=3837, style=ProgressStyle(description_width='…




In [0]:
expertise_result_max = {}
# Expertise Score
for test_pr_id, gt_test_reviewers in tqdm(test_dict.items(), desc="Expertise"):
    row_index = test_pr_cosine_index_map[test_pr_id]
    nearest_idx = np.argmax(test_cosine_sim_matrix[row_index,])
    nearest_train_pr_id = train_index2pr_cosine_map[nearest_idx]
    train_reviewers = train_dict[neighbor_train_pr_id]
    expertise_result_max[test_pr_id] = list(zip(train_reviewers, range(len(train_reviewers), 0, -1)))

HBox(children=(IntProgress(value=0, description='Expertise', max=3837, style=ProgressStyle(description_width='…




In [0]:
def generate_results(pred_dict, test_dict, top_k=10):
    ap = 0.0
    ar = 0.0
    amrr = 0.0
    af1 = 0.0
    for test_pr_id, gt_test_reviewers in test_dict.items():
        pred = pred_dict[test_pr_id]
        ranked_candidates = np.array([cand for cand, score in sorted(pred, key=lambda x: x[1], reverse=True)])
        pred_set = set(ranked_candidates[:top_k])
        actual_set = set(gt_test_reviewers)
        precision = len(pred_set & actual_set) / len(pred_set)
        recall = len(pred_set & actual_set) / len(actual_set)
        f1 = (2 * precision * recall) / (precision + recall + 10**(-10))
        gt_rank = []
        mrr = 0.0
        for gt_reviewer in gt_test_reviewers:
            lookup = np.argwhere(ranked_candidates == gt_reviewer).tolist()
            if len(lookup) == 0:
                rank = 0
            else:
                rank = lookup[0][0] + 1
                mrr += 1/rank
            gt_rank.append(rank)
        amrr += mrr/len(gt_rank)
        ap += precision
        ar += recall
        af1 += f1
    return ap/len(test_dict), ar/len(test_dict), af1/len(test_dict), amrr/len(test_dict)

In [0]:
generate_results(expertise_result_max, test_dict, top_k=10)

(0.15011727912431588,
 0.061646745950889884,
 0.08135952021244604,
 0.061646745950889884)

In [0]:
generate_results(expertise_result, test_dict, top_k=10)

(0.09499609069585853,
 0.3968054272198131,
 0.14588371602821426,
 0.18496958860953322)

In [0]:
def find_username(user_id_list, comments_df, commenter_id='commenter_id', commenter_login='commenter_login'):
    return comments_df.loc[comments_df[commenter_id].isin(user_id_list), [commenter_id, commenter_login]].drop_duplicates().values

In [0]:
find_username([cand for cand, _ in popular_commenters_pred_dict[70431][:20]], test_comments_df)

array([[1431969, 'cblecker'],
       [5449021, 'dixudx'],
       [23304, 'dims'],
       [169553, 'timothysc'],
       [647318, 'lavalamp'],
       [730123, 'sttts'],
       [980082, 'liggitt'],
       [5595220, 'thockin'],
       [8225098, 'deads2k'],
       [10743879, 'wojtek-t']], dtype=object)

In [0]:
find_username([cand for cand, _ in expertise_result[70431][:20]], test_comments_df)

array([[1431969, 'cblecker'],
       [5449021, 'dixudx'],
       [23304, 'dims'],
       [917931, 'BenTheElder'],
       [1613024, 'mbohlool'],
       [169553, 'timothysc'],
       [576341, 'soltysh'],
       [647318, 'lavalamp'],
       [730123, 'sttts'],
       [980082, 'liggitt'],
       [1745006, 'jsafrane'],
       [1787169, 'mikedanese'],
       [5595220, 'thockin'],
       [8061296, 'ixdy'],
       [8225098, 'deads2k'],
       [10052848, 'saad-ali'],
       [10743879, 'wojtek-t'],
       [14308438, 'bsalamat'],
       [24448061, 'msau42'],
       [29742491, 'tallclair']], dtype=object)

In [0]:
find_username([cand for cand, _ in expertise_result[70431][:20]], test_comments_df)

array([[1431969, 'cblecker'],
       [5449021, 'dixudx'],
       [23304, 'dims'],
       [917931, 'BenTheElder'],
       [1613024, 'mbohlool'],
       [169553, 'timothysc'],
       [576341, 'soltysh'],
       [647318, 'lavalamp'],
       [730123, 'sttts'],
       [980082, 'liggitt'],
       [1745006, 'jsafrane'],
       [1787169, 'mikedanese'],
       [5595220, 'thockin'],
       [8061296, 'ixdy'],
       [8225098, 'deads2k'],
       [10052848, 'saad-ali'],
       [10743879, 'wojtek-t'],
       [14308438, 'bsalamat'],
       [24448061, 'msau42'],
       [29742491, 'tallclair']], dtype=object)

In [0]:
find_username(test_dict[70431], test_comments_df)

array([[11345431, 'PatrickLang'],
       [169553, 'timothysc'],
       [10524058, 'yujuhong']], dtype=object)

In [0]:
freq = []
for pr_id, user_list in train_dict.items():
    freq.extend(user_list)

In [0]:
bincount = np.bincount(freq)

In [0]:
popular_commenters = np.argsort(-bincount)

In [0]:
from collections import Counter
popular_commenters = Counter(freq)
popular_commenters = sorted(popular_commenters.items(), key=lambda x: x[1], reverse=True)

In [0]:
popular_commenters_pred_dict = {}
for test_pr_id, score in test_dict.items():
    popular_commenters_pred_dict[test_pr_id] = popular_commenters

In [0]:
generate_results(popular_commenters_pred_dict, test_dict)

(0.057701329163407304,
 0.22552379786547172,
 0.08714895429038405,
 0.11723791009130718)

In [0]:
for idx in np.argsort(-test_cosine_sim_matrix[test_pr_cosine_index_map[70431],])[:3]:
    print(idx, train_index2pr_cosine_map[idx])

2511 64130
808 60552
1255 61405


In [0]:
test_cosine_sim_matrix[test_pr_cosine_index_map[70431], 644]

0.9998688120513776

In [0]:
from scipy.spatial.distance import cosine
1 - cosine(train_pr_feat_matrix[644], test_pr_feat_matrix[test_pr_cosine_index_map[70431]])

0.9998688120513776

In [0]:
train_pr_feat_matrix[644], test_pr_feat_matrix[test_pr_cosine_index_map[70431]]

(array([1.        , 1.33333333, 0.33333333, 3.        ]),
 array([ 9.78571429, 12.64285714,  2.85714286, 28.        ]))

In [0]:
from scipy.spatial.distance import cosine
1 - cosine(train_pr_feat_matrix[644], test_pr_feat_matrix[test_pr_cosine_index_map[70431]])

0.6528006792068481

In [0]:
test_cosine_sim_matrix[test_pr_cosine_index_map[70431]]

array([-33.94136168,  -9.88490765, -87.03782666, ..., -27.68080362,
       -23.97092407, -29.72221164])

In [0]:
def dump_stuff(obj, fname):
    return joblib.dump(obj, os.path.join(home, '%s_v4_2018_only.joblib'%fname))

In [0]:
dump_stuff(train_cosine_sim_matrix, 'train_cosine_sim_matrix')
dump_stuff(train_pr_cosine_index_map, 'train_pr_cosine_index_map')

['/gdrive/My Drive/Colab Notebooks/cs224w-data/final-data-12062019/train_pr_cosine_index_map_v4_2018_only.joblib']

In [0]:
dump_stuff(test_cosine_sim_matrix, 'test_cosine_sim_matrix')
dump_stuff(test_pr_cosine_index_map, 'test_pr_cosine_index_map')

['/gdrive/My Drive/Colab Notebooks/cs224w-data/final-data-12062019/test_pr_cosine_index_map_v4_2018_only.joblib']