<a href="https://colab.research.google.com/github/qwiksilva/cs224w-github-rec/blob/master/evaluation_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /gdrive; the `home` data directory configured below lives under it.
# Colab-specific: this cell relies on the google.colab package.
from google.colab import drive
drive.mount('/gdrive')

In [0]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import joblib

import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
from absl import logging
# Reduce logging output.
logging.set_verbosity(logging.ERROR)

%matplotlib inline
sns.set(color_codes=True)

# Directory (on the mounted Drive) that every joblib artifact in this notebook is read from.
home = "/gdrive/My Drive/Colab Notebooks/cs224w-data/final-data-12062019"

In [0]:
# Load the train/test comment frames, the file-path feature frame and the
# train/test {PR: reviewers} dictionaries, in that order, from `home`.
(train_comments_df,
 test_comments_df,
 filepath_summary_df,
 train_dict,
 test_dict) = (
    joblib.load(os.path.join(home, artifact_file))
    for artifact_file in (
        'train_comments_df_v4_2018_only.joblib',
        'test_comments_df_v4_2018_only.joblib',
        'filepath_feat_df_v4_2018_only.joblib',
        'train_dict_v4_2018_only.joblib',
        'test_dict_v4_2018_only.joblib',
    )
)

In [0]:
def find_username(user_id_list, comments_df, commenter_id='commenter_id', commenter_login='commenter_login'):
    """Look up the login names that belong to a set of user ids.

    Args:
        user_id_list: ids to search for in the ``commenter_id`` column.
        comments_df: DataFrame containing the id and login columns.
        commenter_id: name of the id column.
        commenter_login: name of the login column.

    Returns:
        2-D array of unique ``[id, login]`` rows, in order of first appearance.
    """
    wanted_rows = comments_df[commenter_id].isin(user_id_list)
    id_login_pairs = comments_df.loc[wanted_rows, [commenter_id, commenter_login]]
    return id_login_pairs.drop_duplicates().values

def dump_stuff(obj, fname):
    """Persist ``obj`` with joblib as ``<home>/<fname>_v4_2018_only.joblib``.

    Returns whatever ``joblib.dump`` returns.
    """
    target_name = '%s_v4_2018_only.joblib' % fname
    target_path = os.path.join(home, target_name)
    return joblib.dump(obj, target_path)

def load_stuff(fname):
    """Load the joblib artifact stored at ``<home>/<fname>_v4_2018_only.joblib``."""
    source_name = '%s_v4_2018_only.joblib' % fname
    source_path = os.path.join(home, source_name)
    return joblib.load(source_path)

def generate_results(pred_dict, test_dict, top_k=10, verbose=False):
    """Average top-k precision, recall, F1 and MRR over every PR in ``test_dict``.

    Args:
        pred_dict: ``{pr_id: iterable of (candidate, score)}``; candidates are
            ranked by descending score (ties keep their input order).
        test_dict: ``{pr_id: iterable of ground-truth reviewers}``.
        top_k: number of top-ranked candidates used for precision/recall/F1.
        verbose: when True, print the four averaged metrics.

    Returns:
        ``(avg_precision, avg_recall, avg_f1, avg_mrr)``. All four are 0.0 when
        ``test_dict`` is empty.

    Raises:
        KeyError: if a PR in ``test_dict`` has no entry in ``pred_dict``.

    Notes:
        * Precision is divided by the number of candidates actually returned
          (at most ``top_k``), duplicates included.
        * The reciprocal rank uses the full ranking, not only the top ``top_k``;
          a reviewer missing from the ranking contributes 0.
        * A PR with no predictions or no ground truth scores 0 for the affected
          metric instead of raising ZeroDivisionError.
    """
    num_prs = len(test_dict)
    if num_prs == 0:
        # Nothing to average over; avoid dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0

    ap = 0.0
    ar = 0.0
    amrr = 0.0
    af1 = 0.0
    for test_pr_id, gt_test_reviewers in test_dict.items():
        pred = pred_dict[test_pr_id]
        ranked_candidates = [cand for cand, score in sorted(pred, key=lambda x: x[1], reverse=True)]
        pred_set = ranked_candidates[:top_k]
        actual_set = set(gt_test_reviewers)
        num_hits = len(set(pred_set) & actual_set)
        precision = num_hits / len(pred_set) if pred_set else 0.0
        recall = num_hits / len(actual_set) if actual_set else 0.0
        # The tiny epsilon keeps F1 defined when precision and recall are both 0.
        f1 = (2 * precision * recall) / (precision + recall + 10**(-10))

        # 1-based rank of the first occurrence of each candidate.
        first_rank = {}
        for position, cand in enumerate(ranked_candidates, start=1):
            first_rank.setdefault(cand, position)

        reviewers = list(gt_test_reviewers)
        mrr = 0.0
        for gt_reviewer in reviewers:
            if gt_reviewer in first_rank:
                mrr += 1 / first_rank[gt_reviewer]
        if reviewers:
            amrr += mrr / len(reviewers)
        ap += precision
        ar += recall
        af1 += f1
    if verbose:
        print("Top-%d-Average-Precision = %0.3f\nTop-%d-Average-Recall = %0.3f\nTop-%d-Average-F1 = %0.3f\nAverage-MRR = %0.3f" 
            %(top_k, ap/num_prs, top_k, ar/num_prs, top_k, af1/num_prs, amrr/num_prs))
    return ap/num_prs, ar/num_prs, af1/num_prs, amrr/num_prs

In [0]:
# Popularity baseline: for every test PR, draw 50 (commenter, count) pairs at
# random, weighted by how often each user appears in the training reviewer lists.
from collections import Counter
import random
freq = []
for pr_id, user_list in train_dict.items():
    freq.extend(user_list)
popular_commenters = Counter(freq)
popular_commenters = sorted(popular_commenters.items(), key=lambda x: x[1], reverse=True)
# The weights are the same for every test PR, so build them once.
popularity_weights = [count for _, count in popular_commenters]
# Dedicated seeded generator: without a fixed seed the sampled candidates (and
# every metric computed from them) change on each run.
popularity_rng = random.Random(0)
popular_commenters_pred_dict = {} # {PR-id: [(cand1, score1), (cand2, score2), ]}
for test_pr_id in test_dict:
    # random.choices samples WITH replacement, so a list may contain the same
    # commenter more than once and never holds more than 50 entries.
    popular_commenters_pred_dict[test_pr_id] = popularity_rng.choices(popular_commenters, k=50, weights=popularity_weights)

In [0]:
# Evaluate the popularity baseline at top_k = 10, 20, ..., 100 and keep one
# list per metric, aligned with `hypeparam`.
precs_pop, recall_pop, f1_pop, mrr_pop = [], [], [], []
hypeparam = range(10, 101, 10)

for n in hypeparam:
    p, r, f, m = generate_results(popular_commenters_pred_dict, test_dict, top_k=n, verbose=False)
    for history, value in zip((precs_pop, recall_pop, f1_pop, mrr_pop), (p, r, f, m)):
        history.append(value)

In [0]:
# Activeness baseline: for each test PR, rank commenters by the number of
# distinct PRs they commented on among PRs created strictly before that PR.
core_df = pd.concat([train_comments_df, test_comments_df])
creation_date_lookup = core_df.groupby('pr_id')['pr_created_at'].max()
activeness_candidate = {}
for test_pr_id in test_dict:
    created_before_test_pr = core_df.pr_created_at < creation_date_lookup[test_pr_id]
    past_df = core_df.loc[created_before_test_pr]
    pr_contributed_count = past_df.groupby('commenter_id')['pr_id'].nunique().to_dict()
    ranked_by_activity = list(pr_contributed_count.items())
    # Stable descending sort on the PR count, same ordering as sorted(..., reverse=True).
    ranked_by_activity.sort(key=lambda pair: pair[1], reverse=True)
    activeness_candidate[test_pr_id] = ranked_by_activity

In [0]:
# Print the top-10 metrics for the activeness baseline; the returned tuple is discarded.
_ = generate_results(activeness_candidate, test_dict, top_k=10, verbose=True)

In [0]:
# Evaluate the activeness baseline at top_k = 10, 20, ..., 100 and keep one
# list per metric, aligned with `hypeparam`.
precs_activeness, recall_activeness, f1_activeness, mrr_activeness = [], [], [], []
hypeparam = range(10, 101, 10)

for n in hypeparam:
    p, r, f, m = generate_results(activeness_candidate, test_dict, top_k=n, verbose=False)
    for history, value in zip((precs_activeness, recall_activeness, f1_activeness, mrr_activeness), (p, r, f, m)):
        history.append(value)

In [0]:
# Expertise baseline
# Similarity matrix between test and train PRs plus the {pr_id: index} maps for each side.
train_pr_cosine_index_map = load_stuff('train_pr_cosine_index_map')
test_cosine_sim_matrix = load_stuff('test_cosine_sim_matrix')
test_pr_cosine_index_map = load_stuff('test_pr_cosine_index_map')

# Inverse of train_pr_cosine_index_map: index -> train PR id.
train_index2pr_cosine_map = {
    row_index: train_pr_id
    for train_pr_id, row_index in train_pr_cosine_index_map.items()
}

def generate_expertise_recset(num_neighbors=100):
    """Recommend reviewers from the most similar training PRs.

    For every PR in ``test_dict`` the ``num_neighbors`` training PRs with the
    highest similarity in ``test_cosine_sim_matrix`` are selected, and each
    reviewer of those PRs (from ``train_dict``) gets one vote per PR.

    Returns:
        ``{test_pr_id: [(reviewer, votes), ...]}`` sorted by descending votes.
    """
    expertise_result = {}
    for test_pr_id, _ in tqdm(test_dict.items(), desc="Expertise"):
        similarity_row = test_cosine_sim_matrix[test_pr_cosine_index_map[test_pr_id],]
        # Negating the row makes argsort return the most similar train PRs first.
        nearest_first = np.argsort(-similarity_row)
        votes = defaultdict(int)
        for neighbor_idx in nearest_first[:num_neighbors]:
            for reviewer in train_dict[train_index2pr_cosine_map[neighbor_idx]]:
                votes[reviewer] += 1
        expertise_result[test_pr_id] = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    return expertise_result

In [0]:
# Evaluate the expertise baseline (50 nearest training PRs) at top_k = 10..100.
precs_exp = []
recall_exp = []
f1_exp = []
mrr_exp = []
hypeparam = range(10, 101, 10)
# The recommendation set does not depend on top_k, so build it once instead of
# recomputing it on every iteration of the sweep.
expertise_recset = generate_expertise_recset(50)
for n in hypeparam:
    p, r, f, m = generate_results(expertise_recset, test_dict, top_k=n, verbose=False)
    precs_exp.append(p)
    recall_exp.append(r)
    f1_exp.append(f)
    mrr_exp.append(m)

In [0]:
# Precision-recall curves (one point per top_k value) for three models.
# NOTE(review): recall_earec / precs_earec are assigned in the "EARec Baseline"
# cells that appear later in this notebook; those must run before this cell.
fig, ax = plt.subplots(1, 1)
# x and y are passed by keyword: current seaborn releases reject positional x/y.
sns.lineplot(x=recall_exp, y=precs_exp, ax=ax, legend='full', label='Expertise')
sns.lineplot(x=recall_pop, y=precs_pop, ax=ax, legend='full', label='Popularity')
sns.lineplot(x=recall_earec, y=precs_earec, ax=ax, legend='full', label='EARec')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_title('Precision-Recall')
fig.savefig('prec_recall_plot.png')

In [0]:
# Precision-recall curves (one point per top_k value) for four models.
# NOTE(review): the *_earec and *_srw lists are assigned in cells that appear
# later in this notebook; those must run before this cell.
fig, ax = plt.subplots(1, 1)
# x and y are passed by keyword: current seaborn releases reject positional x/y.
sns.lineplot(x=recall_exp, y=precs_exp, ax=ax, legend='full', label='Expertise')
sns.lineplot(x=recall_activeness, y=precs_activeness, ax=ax, legend='full', label='Activeness Model')
sns.lineplot(x=recall_earec, y=precs_earec, ax=ax, legend='full', label='EARec + RWR')
sns.lineplot(x=recall_srw, y=precs_srw, ax=ax, legend='full', label='Bipartie SRW')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_title('Precision-Recall')
fig.savefig('prec_recall_plot_activeness.png')

In [0]:
# Largest MRR recorded across the top_k sweep, one line per model.
# NOTE(review): mrr_earec and mrr_srw are assigned in cells that appear later in
# this notebook; on a fresh top-to-bottom run they are undefined at this point.
print(np.max(mrr_pop))
print(np.max(mrr_activeness))
print(np.max(mrr_exp))
print(np.max(mrr_earec))
print(np.max(mrr_srw))

In [0]:
# F1 at the first sweep point (top_k = 10), one line per model.
# NOTE(review): f1_earec and f1_srw are assigned in cells that appear later in
# this notebook; on a fresh top-to-bottom run they are undefined at this point.
print(f1_pop[0])
print(f1_activeness[0])
print(f1_exp[0])
print(f1_earec[0])
print(f1_srw[0])

In [0]:
# Recall at the first sweep point (top_k = 10), one line per model.
# NOTE(review): recall_earec and recall_srw are assigned in cells that appear
# later in this notebook; on a fresh top-to-bottom run they are undefined here.
print(recall_pop[0])
print(recall_activeness[0])
print(recall_exp[0])
print(recall_earec[0])
print(recall_srw[0])

In [0]:
# Precision at the first sweep point (top_k = 10), one line per model.
# NOTE(review): precs_earec and precs_srw are assigned in cells that appear
# later in this notebook; on a fresh top-to-bottom run they are undefined here.
print(precs_pop[0])
print(precs_activeness[0])
print(precs_exp[0])
print(precs_earec[0])
print(precs_srw[0])

In [0]:
# F1@k against k for each model.
# NOTE(review): f1_earec and f1_srw are assigned in cells that appear later in
# this notebook; those must run before this cell.
fig, ax = plt.subplots(1, 1)
# x and y are passed by keyword (current seaborn releases reject positional
# x/y); the range is materialised so it is treated as plain data.
k_values = list(hypeparam)
sns.lineplot(x=k_values, y=f1_exp, ax=ax, legend='full', label='Expertise')
# sns.lineplot(x=k_values, y=f1_pop, ax=ax, legend='full', label='Popularity')
sns.lineplot(x=k_values, y=f1_activeness, ax=ax, legend='full', label='Activeness')
sns.lineplot(x=k_values, y=f1_earec, ax=ax, legend='full', label='EARec + RWR')
sns.lineplot(x=k_values, y=f1_srw, ax=ax, legend='full', label='Bipartie SRW')

ax.set_xlabel('k')
ax.set_ylabel('F1@k')
ax.set_title('F1 vs. Number of predictions k')
fig.savefig('f1_plot_activeness.png')

In [0]:
    \text{Expertise} & .149 & .194 & .096 & .405 \\ \hline

In [0]:
    \text{\textbf{Bipartite Supervised Random Walk}} & \textbf{.149} & \textbf{.194} & \textbf{.096} & \textbf{.405} \\ \hline

In [0]:
# Precision@k against k for the expertise, popularity and EARec models.
# NOTE(review): precs_earec is assigned in a cell that appears later in this
# notebook; it must run before this cell.
fig, ax = plt.subplots(1, 1)
# x and y are passed by keyword (current seaborn releases reject positional
# x/y); the range is materialised so it is treated as plain data.
k_values = list(hypeparam)
sns.lineplot(x=k_values, y=precs_exp, ax=ax, label='Expertise')
sns.lineplot(x=k_values, y=precs_pop, ax=ax, label='Popularity')
sns.lineplot(x=k_values, y=precs_earec, ax=ax, label='EARec')
# Labels added so the figure can be read on its own.
ax.set_xlabel('k')
ax.set_ylabel('Precision@k')

In [0]:
# EARec Baseline
# The loaded object is passed to generate_results() as pred_dict below, so it is
# presumably {pr_id: [(candidate, score), ...]} — TODO confirm.
earec = joblib.load(os.path.join(home, 'EARec_best_result_12102019.joblib'))

In [0]:
# Evaluate the EARec predictions at top_k = 10, 20, ..., 100 and keep one
# list per metric, aligned with `hypeparam`.
precs_earec, recall_earec, f1_earec, mrr_earec = [], [], [], []
hypeparam = range(10, 101, 10)
for n in hypeparam:
    p, r, f, m = generate_results(earec, test_dict, top_k=n, verbose=False)
    for history, value in zip((precs_earec, recall_earec, f1_earec, mrr_earec), (p, r, f, m)):
        history.append(value)

In [0]:
# srw Baseline
# NOTE(review): `seen` is not referenced by any other cell visible in this notebook.
seen = joblib.load(os.path.join(home, 'preds', 'seen_prs.joblib'))

In [0]:
# Ground truth for the SRW model; passed as the test_dict argument of generate_results_srw() below.
srw_labels = joblib.load(os.path.join(home, 'preds', 'SRW_labels.joblib'))

In [0]:
# Peek at the first ten stored predictions for PR 65545.
# NOTE(review): srw_preds is loaded in a cell that appears later in this notebook;
# on a fresh top-to-bottom run it is undefined here.
srw_preds[65545][:10]

In [0]:
# Ground-truth entry for the same PR (65545), for comparison with its predictions.
srw_labels[65545]

In [0]:
# SRW predictions keyed by PR id. generate_results_srw() unpacks each entry as
# (score, candidate), so presumably that is the stored order — TODO confirm.
srw_preds = joblib.load(os.path.join(home, 'preds', 'SRW_predictions2.joblib'))

In [0]:
def generate_results_srw(pred_dict, test_dict, top_k=10, verbose=False):
    """Average top-k precision, recall, F1 and MRR for ``(score, candidate)`` predictions.

    Same metrics as ``generate_results``, but each prediction tuple is ordered
    ``(score, candidate)`` instead of ``(candidate, score)``.

    Args:
        pred_dict: ``{pr_id: iterable of (score, candidate)}``; candidates are
            ranked by descending score (ties keep their input order).
        test_dict: ``{pr_id: iterable of ground-truth reviewers}``.
        top_k: number of top-ranked candidates used for precision/recall/F1.
        verbose: when True, print the four averaged metrics.

    Returns:
        ``(avg_precision, avg_recall, avg_f1, avg_mrr)``. All four are 0.0 when
        ``test_dict`` is empty.

    Raises:
        KeyError: if a PR in ``test_dict`` has no entry in ``pred_dict``.

    Notes:
        A PR with no predictions or no ground truth scores 0 for the affected
        metric instead of raising ZeroDivisionError. The reciprocal rank uses
        the full ranking, not only the top ``top_k``.
    """
    num_prs = len(test_dict)
    if num_prs == 0:
        # Nothing to average over; avoid dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0

    ap = 0.0
    ar = 0.0
    amrr = 0.0
    af1 = 0.0
    for test_pr_id, gt_test_reviewers in test_dict.items():
        pred = pred_dict[test_pr_id]
        ranked_candidates = [cand for score, cand in sorted(pred, key=lambda x: x[0], reverse=True)]
        pred_set = ranked_candidates[:top_k]
        actual_set = set(gt_test_reviewers)
        num_hits = len(set(pred_set) & actual_set)
        precision = num_hits / len(pred_set) if pred_set else 0.0
        recall = num_hits / len(actual_set) if actual_set else 0.0
        # The tiny epsilon keeps F1 defined when precision and recall are both 0.
        f1 = (2 * precision * recall) / (precision + recall + 10**(-10))

        # 1-based rank of the first occurrence of each candidate.
        first_rank = {}
        for position, cand in enumerate(ranked_candidates, start=1):
            first_rank.setdefault(cand, position)

        reviewers = list(gt_test_reviewers)
        mrr = 0.0
        for gt_reviewer in reviewers:
            if gt_reviewer in first_rank:
                mrr += 1 / first_rank[gt_reviewer]
        if reviewers:
            amrr += mrr / len(reviewers)
        ap += precision
        ar += recall
        af1 += f1
    if verbose:
        print("Top-%d-Average-Precision = %0.3f\nTop-%d-Average-Recall = %0.3f\nTop-%d-Average-F1 = %0.3f\nAverage-MRR = %0.3f" 
            %(top_k, ap/num_prs, top_k, ar/num_prs, top_k, af1/num_prs, amrr/num_prs))
    return ap/num_prs, ar/num_prs, af1/num_prs, amrr/num_prs

In [0]:
# Print the ten highest-scored candidates for the first 11 PRs in srw_preds.
# enumerate() replaces a manual counter named `iter`, which shadowed the builtin.
for shown, (pr_id, pred) in enumerate(srw_preds.items()):
    print([cand for score, cand in sorted(pred, key=lambda x: x[0], reverse=True)][:10])
    if shown == 10:
        break

In [0]:
# Evaluate the SRW predictions against srw_labels at top_k = 10, 20, ..., 100
# and keep one list per metric, aligned with `hypeparam`.
precs_srw, recall_srw, f1_srw, mrr_srw = [], [], [], []
hypeparam = range(10, 101, 10)

for n in hypeparam:
    p, r, f, m = generate_results_srw(srw_preds, srw_labels, top_k=n)
    for history, value in zip((precs_srw, recall_srw, f1_srw, mrr_srw), (p, r, f, m)):
        history.append(value)

In [0]:
# Print the SRW metrics for every top_k in the sweep.
# The loop variable `n` stays defined after this cell; a later cell reads it as top_k.
for n in hypeparam:
    _ = generate_results_srw(srw_preds, srw_labels, top_k=n, verbose=True)

In [0]:
# Re-order each (score, candidate) pair to (candidate, score) so the predictions
# can be fed to generate_results().
# NOTE(review): `srw` is not assigned in any visible cell — presumably it is the
# same object as srw_preds; confirm before relying on this cell.
proper_srw = defaultdict(list)
for pr_id, pred in srw.items():
    for score, cand in pred:
        proper_srw[pr_id].append((cand, score))

In [0]:
# NOTE(review): top_k=n relies on `n` left over from an earlier `for n in ...` loop,
# so the value used depends on which loop cell was executed last.
_ = generate_results(proper_srw, test_dict, top_k=n, verbose=True)

In [0]:
# Ten highest-scored candidates for PR 65793. proper_srw is a defaultdict, so an
# unknown PR id yields an empty list (and inserts the key) rather than raising.
[cand for cand, score in sorted(proper_srw[65793], key=lambda x: x[1], reverse=True)][:10]

In [0]:
# For the first 51 test PRs, print the PR id, its top-10 SRW candidates and the
# ground-truth reviewer set side by side.
# enumerate() replaces a manual counter named `iter`, which shadowed the builtin.
for shown, (pr_id, gt_test_reviewers) in enumerate(test_dict.items()):
    pred = proper_srw[pr_id]
    ranked_candidates = [cand for cand, score in sorted(pred, key=lambda x: x[1], reverse=True)]
    pred_set = ranked_candidates[:10]
    actual_set = set(gt_test_reviewers)
    print(pr_id, pred_set, actual_set)
    if shown == 50:
        break

In [0]:
# Print the metrics of the re-ordered SRW predictions against test_dict for top_k = 10..100.
for n in range(10, 101, 10):
    _ = generate_results(proper_srw, test_dict, top_k=n, verbose=True)

In [0]:
# Raw predictions stored for PR 67084.
# NOTE(review): `srw` is not assigned in any visible cell — presumably srw_preds; confirm.
srw[67084]

In [0]:
# Print the id and the score-sorted predictions of the first 11 PRs in `srw`.
# enumerate() replaces a manual counter named `iter`, which shadowed the builtin.
# NOTE(review): `srw` is not assigned in any visible cell — presumably srw_preds; confirm.
for shown, (pr_id, pred) in enumerate(srw.items()):
    print(pr_id)
    print(sorted(pred, key=lambda x: x[0], reverse=True))
    if shown == 10:
        break

In [0]:
# NOTE(review): re-definition of find_username from the helper cell near the top of the notebook.
def find_username(user_id_list, comments_df, commenter_id='commenter_id', commenter_login='commenter_login'):
    """Return the unique ``[id, login]`` rows of ``comments_df`` whose id is in ``user_id_list``.

    ``commenter_id`` and ``commenter_login`` name the id and login columns.
    """
    selected_columns = [commenter_id, commenter_login]
    id_matches = comments_df[commenter_id].isin(user_id_list)
    return comments_df.loc[id_matches, selected_columns].drop_duplicates().values

# NOTE(review): re-definition of dump_stuff from the helper cell near the top of the notebook.
def dump_stuff(obj, fname):
    """Write ``obj`` with joblib to ``<home>/<fname>_v4_2018_only.joblib`` and return joblib's result."""
    destination = os.path.join(home, '%s_v4_2018_only.joblib' % fname)
    return joblib.dump(obj, destination)

# NOTE(review): re-definition of load_stuff from the helper cell near the top of the notebook.
def load_stuff(fname):
    """Read and return the joblib artifact at ``<home>/<fname>_v4_2018_only.joblib``."""
    origin = os.path.join(home, '%s_v4_2018_only.joblib' % fname)
    return joblib.load(origin)

In [0]:
# Logins of the first 20 popularity-baseline candidates for PR 70431.
find_username([cand for cand, _ in popular_commenters_pred_dict[70431][:20]], train_comments_df)

In [0]:
# Login for commenter id 980082, looked up in the training comments.
find_username([980082], train_comments_df)

In [0]:
# (rows, columns) of the training comments frame.
train_comments_df.shape

# Plots

In [0]:
def generate_pr_user_feat_df(core_df):
    """Build one feature row per (commenter_id, pr_id) pair.

    Args:
        core_df: comment-level DataFrame with the columns ``commenter_id``,
            ``commenter_login``, ``pr_id``, ``comment``, ``comment_created_at``,
            ``user_id``, ``username`` and ``pr_created_at``.

    Returns:
        DataFrame indexed by (commenter_id, pr_id) with columns:
        ``first_comment`` / ``last_comment`` (min / max ``comment_created_at``),
        ``num_comment`` (non-null comments in the pair),
        ``num_pr_submitted`` (distinct PRs where the commenter is the ``user_id``,
        0 if none), ``num_pr_commented`` (distinct PRs commented on),
        ``pr_created_at`` (max per PR), and ``first_comment_age`` /
        ``last_comment_age`` (days between PR creation and the comment).
    """
    # One grouping object reused for every per-(commenter, PR) aggregate.
    by_commenter_pr = core_df.groupby(['commenter_id', 'pr_id'])
    combine = [by_commenter_pr['comment_created_at'].min(),
               by_commenter_pr['comment_created_at'].max(),
               by_commenter_pr['comment'].count()]
    colnames = ['first_comment', 'last_comment', 'num_comment']
    pr_user_feat_df = pd.concat(combine, axis=1)
    pr_user_feat_df.columns = colnames
    user_gb_list = [core_df.groupby(['user_id', 'pr_id'])['username'].nunique().groupby('user_id').count(),
                    by_commenter_pr['commenter_login'].nunique().groupby('commenter_id').count()]
    user_df = pd.concat(user_gb_list, axis=1)
    user_df.columns = ['num_pr_submitted', 'num_pr_commented']
    # rename_axis returns a new frame; the result has to be assigned to take effect.
    user_df = user_df.rename_axis('commenter_id')
    # Users who never submitted a PR have no num_pr_submitted value; fill with 0.
    pr_user_feat_df = pr_user_feat_df.join(user_df, on='commenter_id').fillna(value=0)
    pr_user_feat_df = pr_user_feat_df.join(core_df.groupby('pr_id')['pr_created_at'].max(), on='pr_id')
    # 86400 seconds per day: ages are expressed in days.
    pr_user_feat_df['first_comment_age'] = (pr_user_feat_df['first_comment'] - pr_user_feat_df['pr_created_at']).apply(lambda x: x.total_seconds()/86400)
    pr_user_feat_df['last_comment_age'] = (pr_user_feat_df['last_comment'] - pr_user_feat_df['pr_created_at']).apply(lambda x: x.total_seconds()/86400)
    return pr_user_feat_df

# Per-(commenter, PR) feature frames, built separately for the train and test comments.
train_pr_user_df = generate_pr_user_feat_df(train_comments_df)
test_pr_user_df = generate_pr_user_feat_df(test_comments_df)

In [0]:
# Display the training feature frame (the notebook truncates long frames when rendering).
train_pr_user_df

In [0]:
# PR comment KDE Plot
# Left: density of total comments per PR. Right: density of distinct commenters per PR.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.kdeplot(train_pr_user_df.groupby('pr_id')['num_comment'].sum(), ax=axes[0], label='Train Jan-June 2018')
sns.kdeplot(test_pr_user_df.groupby('pr_id')['num_comment'].sum(), ax=axes[0], label='Test June-Dec 2018', linestyle='--')
axes[0].set_xlabel('Number of Comments')
axes[0].set_ylabel('Number of PR Density')
axes[0].set_title('PR vs. Comments KDE Plot')
# Explicit legend so the train/test labels are shown whether or not kdeplot draws one itself.
axes[0].legend()

sns.kdeplot(train_pr_user_df.reset_index().groupby('pr_id')['commenter_id'].nunique(), ax=axes[1], label='Train Jan-June 2018')
sns.kdeplot(test_pr_user_df.reset_index().groupby('pr_id')['commenter_id'].nunique(), ax=axes[1], label='Test June-Dec 2018', linestyle='--')
axes[1].set_xlabel('Number of Users')
axes[1].set_title('PR vs. Users KDE Plot')
axes[1].legend()

plt.tight_layout()
fig.savefig('./pr_comments_distplots.png')

In [0]:
# PR comment distribution
# Left: density of PRs submitted per user. Right: density of PRs commented on per user.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.kdeplot(train_pr_user_df.groupby('commenter_id')['num_pr_submitted'].max(), ax=axes[0], label='Train Jan-June 2018')
sns.kdeplot(test_pr_user_df.groupby('commenter_id')['num_pr_submitted'].max(), ax=axes[0], label='Test June-Dec 2018', linestyle='--')
axes[0].set_xlabel('Number of PRs Submitted')
axes[0].set_ylabel('Number of Users Density')
axes[0].set_title('Users vs. PR Submitted KDE Plot')
# Explicit legend so the train/test labels are shown whether or not kdeplot draws one itself.
axes[0].legend()

sns.kdeplot(train_pr_user_df.groupby('commenter_id')['num_pr_commented'].max(), ax=axes[1], label='Train Jan-June 2018')
sns.kdeplot(test_pr_user_df.groupby('commenter_id')['num_pr_commented'].max(), ax=axes[1], label='Test June-Dec 2018', linestyle='--')
axes[1].set_xlabel('Number of PRs Commented')
axes[1].set_title('Users vs. PR Commented KDE Plot')
axes[1].legend()

plt.tight_layout()

fig.savefig('./pr_user_distplots.png')

In [0]:
# Add the calendar month of each PR's creation date as a new column (mutates train_comments_df).
train_comments_df['month_created'] = train_comments_df['pr_created_at'].map(lambda created: created.month)

In [0]:
# Number of distinct training PRs created in each month.
train_comments_df.groupby('month_created')['pr_id'].nunique()

In [0]:
# Add the PR creation month to the test frame (mutates test_comments_df), then
# show the number of distinct test PRs created in each month.
test_comments_df['month_created'] = test_comments_df['pr_created_at'].map(lambda created: created.month)
test_comments_df.groupby('month_created')['pr_id'].nunique()

In [0]:
# Bar chart of distinct PRs per creation month, train and test counts stacked into one frame.
# NOTE(review): if a month occurs in both train and test, the frame holds two rows for it and
# barplot will aggregate them into a single bar — confirm this is intended.
monthly_pr = pd.concat([train_comments_df.groupby('month_created')['pr_id'].nunique(), test_comments_df.groupby('month_created')['pr_id'].nunique()]).reset_index()
sns.barplot(x='month_created', y='pr_id', data=monthly_pr)

In [0]:
# One row per (PR, commenter) with the timestamp of that commenter's first
# comment on the PR, plus the calendar month of that timestamp.
first_comment_per_pair = train_pr_user_df.groupby(['pr_id', 'commenter_id'])['first_comment'].min()
monthly_user = first_comment_per_pair.reset_index()
monthly_user['first_comment_month'] = monthly_user['first_comment'].map(lambda stamp: stamp.month)

In [0]:
# Distinct commenters per month of first comment (a commenter can be counted in several months).
monthly_user_count = monthly_user.groupby(['first_comment_month'])['commenter_id'].nunique()

In [0]:
# Display the per-month commenter counts.
monthly_user_count

In [0]:
# Density of the number of distinct training PRs each commenter commented on.
sns.kdeplot(train_comments_df.groupby(['commenter_id', 'pr_id'])['commenter_login'].nunique().groupby('commenter_id').count())

In [0]:
# Density of the number of distinct training PRs per user_id (the PR-side user column).
sns.kdeplot(train_comments_df.groupby(['user_id', 'pr_id'])['username'].nunique().groupby('user_id').count())

In [0]:
import matplotlib.pyplot as plt
# Scatter of distinct training PRs per user_id.
# scatter() requires both x and y; the original call passed only the counts and
# raised TypeError. NOTE(review): user_id on the x axis is an assumption about
# the intended plot — confirm.
prs_per_user = train_comments_df.groupby(['user_id', 'pr_id'])['username'].nunique().groupby('user_id').count()
fig, ax = plt.subplots(1, 1)
ax.scatter(prs_per_user.index, prs_per_user.values)
ax.set_xlabel('user_id')
ax.set_ylabel('Number of PRs')

In [0]:
# (rows, columns) of the training comments frame; includes the month_created column added above.
train_comments_df.shape

In [0]:
# Number of test PRs being evaluated.
len(test_dict)

In [0]:
# Number of distinct commenters in the test comments.
test_comments_df.commenter_id.nunique()