In [1]:
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN
from surprise import Dataset, Reader, accuracy
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from datetime import datetime as dt
import pickle

In [56]:
def get_top_n(predictions, n=10):
    top_n_recommendations = defaultdict(list)
    for ruid, riid, true_r, est_r, _ in predictions:
        top_n_recommendations[ruid].append((riid, est_r))

    for ruid, user_ratings in top_n_recommendations.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n_recommendations[ruid] = [riid for riid, _ in user_ratings[:n]]

    return top_n_recommendations

def get_mae(predictions):
    errors = defaultdict(list)
    for ruid, _, true_r, est_r, _ in predictions:
        errors[ruid].append(np.abs(true_r - est_r))
    return {ruid: np.mean(errors[ruid]) for ruid in errors.keys()}

def measure_influence_top_n(base_top_n, top_n):  
    jdists = []
    for ruid in top_n.keys():
        jsim = len(set(top_n[ruid]).intersection(base_top_n[ruid])) / len(set(top_n[ruid]).union(base_top_n[ruid]))
        jdists.append(1 - jsim)
    return np.mean(jdists)

def measure_influence_mae(base_mae, mae):
    dists = [mae[ruid] - base_mae[ruid] for ruid in mae.keys()]
    return np.mean(dists)

def get_top_n_mentors(model, n=10):
    nr_of_students = [(model.trainset.to_raw_uid(iuid), len(students)) for iuid, students in model.students.items()]
    top_n_mentors = sorted(nr_of_students, key=lambda t: t[1])[::-1][:n]
    top_n_mentors = [ruid for ruid, _ in top_n_mentors]
    
    return top_n_mentors

# Dataset
## Read dataset

In [90]:
data_df = pd.read_csv("data/ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"], usecols=["user_id", "item_id", "rating"])
data_df["user_id"] = data_df["user_id"].map({b: a for a, b in enumerate(data_df["user_id"].unique())})
data_df["item_id"] = data_df["item_id"].map({b: a for a, b in enumerate(data_df["item_id"].unique())})

## Train- and testset

In [91]:
reader = Reader(rating_scale=(1, 5))
train_df, test_df = train_test_split(data_df, test_size=0.2)
dataset = Dataset.load_from_df(data_df, reader=reader)
raw_trainset = [(ruid, riid, r, None) for ruid, riid, r in train_df.to_records(index=False)]
raw_testset = [(ruid, riid, r, None) for ruid, riid, r in test_df.to_records(index=False)]
trainset = Dataset.construct_trainset(dataset, raw_trainset)
testset = Dataset.construct_testset(dataset, raw_testset)

# Model

In [92]:
Ks = [5, 10, 30]

In [132]:
sim = UserKNN().compute_similarities(trainset, min_support=1)
pop = UserKNN().compute_popularities(trainset)
gain = UserKNN().compute_gain(trainset)

## Baseline models

In [133]:
base_top_n = defaultdict(list)
base_mae = defaultdict(list)
top_n_mentors = defaultdict(list)
for k in Ks:
    # UserKNN
    model = UserKNN(k=k, precomputed_sim=sim)
    model.fit(trainset)
    predictions = model.test(testset)
    base_top_n["UserKNN"].append(get_top_n(predictions))
    base_mae["UserKNN"].append(get_mae(predictions))
    top_n_mentors["UserKNN"].append(get_top_n_mentors(model))
    
    # UserKNN + reuse
    model = UserKNN(k=k, precomputed_sim=sim, reuse=True)
    model.fit(trainset)
    predictions = model.test(testset)
    base_top_n["UserKNN + Reuse"].append(get_top_n(predictions))
    base_mae["UserKNN + Reuse"].append(get_mae(predictions))
    top_n_mentors["UserKNN + Reuse"].append(get_top_n_mentors(model))
    
    # Popularity
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, tau_2=0.5)
    model.fit(trainset)
    predictions = model.test(testset)
    base_top_n["Popularity"].append(get_top_n(predictions))
    base_mae["Popularity"].append(get_mae(predictions))
    top_n_mentors["Popularity"].append(get_top_n_mentors(model))
    
    # Popularity + Reuse
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, tau_2=0.5, reuse=True)
    model.fit(trainset)
    predictions = model.test(testset)
    base_top_n["Popularity + Reuse"].append(get_top_n(predictions))
    base_mae["Popularity + Reuse"].append(get_mae(predictions))
    top_n_mentors["Popularity + Reuse"].append(get_top_n_mentors(model))
    
    # Gain
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, precomputed_gain=gain, tau_4=0.5)
    model.fit(trainset)
    predictions = model.test(testset)
    base_top_n["Gain"].append(get_top_n(predictions))
    base_mae["Gain"].append(get_mae(predictions))
    top_n_mentors["Gain"].append(get_top_n_mentors(model))
    
    # Gain + reuse
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, precomputed_gain=gain, tau_4=0.5, reuse=True)
    model.fit(trainset)
    predictions = model.test(testset)
    base_top_n["Gain + Reuse"].append(get_top_n(predictions))
    base_mae["Gain + Reuse"].append(get_mae(predictions))
    top_n_mentors["Gain + Reuse"].append(get_top_n_mentors(model))

In [134]:
all_mentors = set()
for mentors in top_n_mentors.values():
    all_mentors = all_mentors.union(set(np.ravel(mentors)))

In [137]:
influence_top_n = defaultdict(list)
influence_mae = defaultdict(list)

all_users = set([trainset.to_raw_uid(iuid) for iuid in trainset.all_users()])
no_mentors = set(np.random.choice(list(all_users.difference(all_mentors)), replace=False, size=100))

starttime = dt.now()
for ruid in no_mentors.union(all_mentors):
    train_wo_df = train_df[train_df["user_id"] != ruid]
    raw_trainset_wo = [(ruid, riid, r, None) for ruid, riid, r in train_wo_df.to_records(index=False)]
    trainset_wo = Dataset.construct_trainset(dataset, raw_trainset_wo)
    
    sim = UserKNN().compute_similarities(trainset_wo, min_support=1)
    pop = UserKNN().compute_popularities(trainset_wo)
    gain = UserKNN().compute_gain(trainset_wo)
    
    for k in Ks:
        # UserKNN
        if ruid in top_n_mentors["UserKNN"][Ks.index(k)] or ruid in no_mentors:
            model = UserKNN(k=k, precomputed_sim=sim)
            model.fit(trainset_wo)
            predictions = model.test(testset)
        
            ruid_influence_top_n = measure_influence_top_n(base_top_n["UserKNN"][Ks.index(k)], get_top_n(predictions))
            if len(influence_top_n["UserKNN"]) < len(Ks):
                influence_top_n["UserKNN"] = [[] for _ in Ks]
                influence_top_n["UserKNN"][Ks.index(k)] = [(ruid, ruid_influence_top_n)]
            else:
                influence_top_n["UserKNN"][Ks.index(k)].append((ruid, ruid_influence_top_n))
                
            ruid_influence_mae = measure_influence_mae(base_mae["UserKNN"][Ks.index(k)], get_mae(predictions))
            if len(influence_mae["UserKNN"]) < len(Ks):
                influence_mae["UserKNN"] = [[] for _ in Ks]
                influence_mae["UserKNN"][Ks.index(k)] = [(ruid, ruid_influence_mae)]
            else:
                influence_mae["UserKNN"][Ks.index(k)].append((ruid, ruid_influence_mae))
        
        # UserKNN + Reuse
        if ruid in top_n_mentors["UserKNN + Reuse"][Ks.index(k)] or ruid in no_mentors:
            model = UserKNN(k=k, precomputed_sim=sim, reuse=True)
            model.fit(trainset_wo)
            predictions = model.test(testset)
        
            ruid_influence_top_n = measure_influence_top_n(base_top_n["UserKNN + Reuse"][Ks.index(k)], get_top_n(predictions))
            if len(influence_top_n["UserKNN + Reuse"]) < len(Ks):
                influence_top_n["UserKNN + Reuse"] = [[] for _ in Ks]
                influence_top_n["UserKNN + Reuse"][Ks.index(k)] = [(ruid, ruid_influence_top_n)]
            else:
                influence_top_n["UserKNN + Reuse"][Ks.index(k)].append((ruid, ruid_influence_top_n))

            ruid_influence_mae = measure_influence_mae(base_mae["UserKNN + Reuse"][Ks.index(k)], get_mae(predictions))
            if len(influence_mae["UserKNN + Reuse"]) < len(Ks):
                influence_mae["UserKNN + Reuse"] = [[] for _ in Ks]
                influence_mae["UserKNN + Reuse"][Ks.index(k)] = [(ruid, ruid_influence_mae)]
            else:
                influence_mae["UserKNN + Reuse"][Ks.index(k)].append((ruid, ruid_influence_mae))
        
        # Popularity
        if ruid in top_n_mentors["Popularity"][Ks.index(k)] or ruid in no_mentors:
            model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, tau_2=0.5)
            model.fit(trainset_wo)
            predictions = model.test(testset)
        
            ruid_influence_top_n = measure_influence_top_n(base_top_n["Popularity"][Ks.index(k)], get_top_n(predictions))
            if len(influence_top_n["Popularity"]) < len(Ks):
                influence_top_n["Popularity"] = [[] for _ in Ks]
                influence_top_n["Popularity"][Ks.index(k)] = [(ruid, ruid_influence_top_n)]
            else:
                influence_top_n["Popularity"][Ks.index(k)].append((ruid, ruid_influence_top_n))

            ruid_influence_mae = measure_influence_mae(base_mae["Popularity"][Ks.index(k)], get_mae(predictions))
            if len(influence_mae["Popularity"]) < len(Ks):
                influence_mae["Popularity"] = [[] for _ in Ks]
                influence_mae["Popularity"][Ks.index(k)] = [(ruid, ruid_influence_mae)]
            else:
                influence_mae["Popularity"][Ks.index(k)].append((ruid, ruid_influence_mae))
        
        # Popularity + Reuse
        if ruid in top_n_mentors["Popularity + Reuse"][Ks.index(k)] or ruid in no_mentors:
            model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, tau_2=0.5, reuse=True)
            model.fit(trainset_wo)
            predictions = model.test(testset)
        
            ruid_influence_top_n = measure_influence_top_n(base_top_n["Popularity + Reuse"][Ks.index(k)], get_top_n(predictions))
            if len(influence_top_n["Popularity + Reuse"]) < len(Ks):
                influence_top_n["Popularity + Reuse"] = [[] for _ in Ks]
                influence_top_n["Popularity + Reuse"][Ks.index(k)] = [(ruid, ruid_influence_top_n)]
            else:
                influence_top_n["Popularity + Reuse"][Ks.index(k)].append((ruid, ruid_influence_top_n))

            ruid_influence_mae = measure_influence_mae(base_mae["Popularity + Reuse"][Ks.index(k)], get_mae(predictions))
            if len(influence_mae["Popularity + Reuse"]) < len(Ks):
                influence_mae["Popularity + Reuse"] = [[] for _ in Ks]
                influence_mae["Popularity + Reuse"][Ks.index(k)] = [(ruid, ruid_influence_mae)]
            else:
                influence_mae["Popularity + Reuse"][Ks.index(k)].append((ruid, ruid_influence_mae))
        
        # Gain
        if ruid in top_n_mentors["Gain"][Ks.index(k)] or ruid in no_mentors:
            model = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain, tau_4=0.5)
            model.fit(trainset_wo)
            predictions = model.test(testset)
        
            ruid_influence_top_n = measure_influence_top_n(base_top_n["Gain"][Ks.index(k)], get_top_n(predictions))
            if len(influence_top_n["Gain"]) < len(Ks):
                influence_top_n["Gain"] = [[] for _ in Ks]
                influence_top_n["Gain"][Ks.index(k)] = [(ruid, ruid_influence_top_n)]
            else:
                influence_top_n["Gain"][Ks.index(k)].append((ruid, ruid_influence_top_n))

            ruid_influence_mae = measure_influence_mae(base_mae["Gain"][Ks.index(k)], get_mae(predictions))
            if len(influence_mae["Gain"]) < len(Ks):
                influence_mae["Gain"] = [[] for _ in Ks]
                influence_mae["Gain"][Ks.index(k)] = [(ruid, ruid_influence_mae)]
            else:
                influence_mae["Gain"][Ks.index(k)].append((ruid, ruid_influence_mae))
        
        # Gain + Reuse
        if ruid in top_n_mentors["Gain + Reuse"][Ks.index(k)] or ruid in no_mentors:
            model = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain, tau_4=0.5, reuse=True)
            model.fit(trainset_wo)
            predictions = model.test(testset)
        
            ruid_influence_top_n = measure_influence_top_n(base_top_n["Gain + Reuse"][Ks.index(k)], get_top_n(predictions))
            if len(influence_top_n["Gain + Reuse"]) < len(Ks):
                influence_top_n["Gain + Reuse"] = [[] for _ in Ks]
                influence_top_n["Gain + Reuse"][Ks.index(k)] = [(ruid, ruid_influence_top_n)]
            else:
                influence_top_n["Gain + Reuse"][Ks.index(k)].append((ruid, ruid_influence_top_n))

            ruid_influence_mae = measure_influence_mae(base_mae["Gain + Reuse"][Ks.index(k)], get_mae(predictions))
            if len(influence_mae["Gain + Reuse"]) < len(Ks):
                influence_mae["Gain + Reuse"] = [[] for _ in Ks]
                influence_mae["Gain + Reuse"][Ks.index(k)] = [(ruid, ruid_influence_mae)]
            else:
                influence_mae["Gain + Reuse"][Ks.index(k)].append((ruid, ruid_influence_mae))
        
    print("User id: %d, Time elapsed: %s" % (ruid, dt.now() - starttime))

User id: 8, Time elapsed: 0:02:25.560089
User id: 777, Time elapsed: 0:04:51.775546
User id: 10, Time elapsed: 0:05:26.596507
User id: 267, Time elapsed: 0:05:52.115092
User id: 525, Time elapsed: 0:08:39.629677
User id: 766, Time elapsed: 0:11:16.504791
User id: 13, Time elapsed: 0:11:40.083069
User id: 17, Time elapsed: 0:13:51.766741
User id: 19, Time elapsed: 0:16:07.075688
User id: 276, Time elapsed: 0:18:16.962943
User id: 532, Time elapsed: 0:19:24.547771
User id: 795, Time elapsed: 0:21:38.675902
User id: 31, Time elapsed: 0:24:08.923954
User id: 544, Time elapsed: 0:26:39.861004
User id: 33, Time elapsed: 0:29:13.617194
User id: 288, Time elapsed: 0:31:49.754727
User id: 37, Time elapsed: 0:34:32.867067
User id: 39, Time elapsed: 0:37:13.686184
User id: 808, Time elapsed: 0:40:01.248195
User id: 554, Time elapsed: 0:42:43.429205
User id: 555, Time elapsed: 0:45:15.818427
User id: 301, Time elapsed: 0:47:30.684874
User id: 815, Time elapsed: 0:49:48.869245
User id: 303, Time el

In [251]:
influence_mentors = []
influence_nomentors = []
for ruid, i in influence_mae["UserKNN"][Ks.index(5)]:
    if ruid in top_n_mentors["UserKNN"][Ks.index(5)]:
        influence_mentors.append(i)
    else:
        influence_nomentors.append(i)
np.mean(influence_mentors), np.mean(influence_nomentors)

(0.002654839122794252, 0.001636723241004774)

In [252]:
influence_mentors = []
influence_nomentors = []
for ruid, i in influence_mae["Gain + Reuse"][Ks.index(5)]:
    if ruid in top_n_mentors["Gain + Reuse"][Ks.index(5)]:
        influence_mentors.append(i)
    else:
        influence_nomentors.append(i)
np.mean(influence_mentors), np.mean(influence_nomentors)

(0.00033861781227871617, 5.731552720721594e-05)

In [9]:
influence_mae = pickle.load(open("results/ml-100k/influence/influence_mae.pkl", "rb"))
top_n_mentors = pickle.load(open("results/ml-100k/influence/top_10_mentors.pkl", "rb"))
influence_top_n = pickle.load(open("results/ml-100k/influence/influence_top_n.pkl", "rb"))  
Ks = pickle.load(open("results/ml-100k/influence/k.pkl", "rb"))  

In [10]:
def f_mae(method, k):
    influence_mentors = []
    influence_nomentors = []
    for ruid, i in influence_mae[method][Ks.index(k)]:
        if ruid in top_n_mentors[method][Ks.index(k)]:
            influence_mentors.append(i)
        else:
            influence_nomentors.append(i)
            
    return np.mean(influence_mentors), np.mean(influence_nomentors)

def f_top_n(method, k):
    influence_mentors = []
    influence_nomentors = []
    for ruid, i in influence_top_n[method][Ks.index(k)]:
        if ruid in top_n_mentors[method][Ks.index(k)]:
            influence_mentors.append(i)
        else:
            influence_nomentors.append(i)
            
    return np.mean(influence_mentors), np.mean(influence_nomentors)

In [11]:
avg_influence_mentors_mae = defaultdict(list)
avg_influence_nomentors_mae = defaultdict(list)

for k in Ks:
    influence_mentors, influence_nomentors = f_mae("UserKNN", k=k)
    avg_influence_mentors_mae["UserKNN"].append(np.mean(influence_mentors))
    avg_influence_nomentors_mae["UserKNN"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_mae("UserKNN + Reuse", k=k)
    avg_influence_mentors_mae["UserKNN + Reuse"].append(np.mean(influence_mentors))
    avg_influence_nomentors_mae["UserKNN + Reuse"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_mae("Popularity", k=k)
    avg_influence_mentors_mae["Popularity"].append(np.mean(influence_mentors))
    avg_influence_nomentors_mae["Popularity"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_mae("Popularity + Reuse", k=k)
    avg_influence_mentors_mae["Popularity + Reuse"].append(np.mean(influence_mentors))
    avg_influence_nomentors_mae["Popularity + Reuse"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_mae("Gain", k=k)
    avg_influence_mentors_mae["Gain"].append(np.mean(influence_mentors))
    avg_influence_nomentors_mae["Gain"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_mae("Gain + Reuse", k=k)
    avg_influence_mentors_mae["Gain + Reuse"].append(np.mean(influence_mentors))
    avg_influence_nomentors_mae["Gain + Reuse"].append(np.mean(influence_nomentors))
    
    
avg_influence_mentors_top_n = defaultdict(list)
avg_influence_nomentors_top_n = defaultdict(list)

for k in Ks:
    influence_mentors, influence_nomentors = f_top_n("UserKNN", k=k)
    avg_influence_mentors_top_n["UserKNN"].append(np.mean(influence_mentors))
    avg_influence_nomentors_top_n["UserKNN"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_top_n("UserKNN + Reuse", k=k)
    avg_influence_mentors_top_n["UserKNN + Reuse"].append(np.mean(influence_mentors))
    avg_influence_nomentors_top_n["UserKNN + Reuse"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_top_n("Popularity", k=k)
    avg_influence_mentors_top_n["Popularity"].append(np.mean(influence_mentors))
    avg_influence_nomentors_top_n["Popularity"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_top_n("Popularity + Reuse", k=k)
    avg_influence_mentors_top_n["Popularity + Reuse"].append(np.mean(influence_mentors))
    avg_influence_nomentors_top_n["Popularity + Reuse"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_top_n("Gain", k=k)
    avg_influence_mentors_top_n["Gain"].append(np.mean(influence_mentors))
    avg_influence_nomentors_top_n["Gain"].append(np.mean(influence_nomentors))
    
    influence_mentors, influence_nomentors = f_top_n("Gain + Reuse", k=k)
    avg_influence_mentors_top_n["Gain + Reuse"].append(np.mean(influence_mentors))
    avg_influence_nomentors_top_n["Gain + Reuse"].append(np.mean(influence_nomentors))

In [12]:
%matplotlib qt

In [15]:
print(Ks)

[5, 10, 15, 30]


In [22]:
barWidth = 0.1
 
bars1 = avg_influence_mentors_mae["UserKNN"]
bars2 = avg_influence_mentors_mae["UserKNN + Reuse"]
bars3 = avg_influence_mentors_mae["Popularity"]
bars4 = avg_influence_mentors_mae["Popularity + Reuse"]
bars5 = avg_influence_mentors_mae["Gain"]
bars6 = avg_influence_mentors_mae["Gain + Reuse"]
 
# The x position of bars
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]
r5 = [x + barWidth for x in r4]
r6 = [x + barWidth for x in r5]
 
plt.bar(r1, bars1, width = barWidth, color = 'C0', alpha=0.5, label='UserKNN')
plt.bar(r3, bars3, width = barWidth, color = 'C1', alpha=0.5, label='Popularity')
plt.bar(r5, bars5, width = barWidth, color = 'C2', alpha=0.5, label='Gain')
plt.bar(r2, bars2, width = barWidth, color = 'C0', label='UserKNN + Reuse')
plt.bar(r4, bars4, width = barWidth, color = 'C1', label='Popularity + Reuse')
plt.bar(r6, bars6, width = barWidth, color = 'C2', label='Gain + Reuse')

# general layout
plt.axhline(y=0, linestyle="dashed", color="grey")
plt.xticks(r3 + np.array(barWidth/2), [r"$k=5$", r"$k=10$", r"$k=15$", r"$k=30$"])
plt.ylabel("Change in MAE")
plt.legend(ncol=2)
plt.title("Influence of top-10 mentors")
 
# Show graphic
plt.show()

In [23]:
barWidth = 0.1
 
bars1 = avg_influence_nomentors_mae["UserKNN"]
bars2 = avg_influence_nomentors_mae["UserKNN + Reuse"]
bars3 = avg_influence_nomentors_mae["Popularity"]
bars4 = avg_influence_nomentors_mae["Popularity + Reuse"]
bars5 = avg_influence_nomentors_mae["Gain"]
bars6 = avg_influence_nomentors_mae["Gain + Reuse"]
 
# The x position of bars
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]
r5 = [x + barWidth for x in r4]
r6 = [x + barWidth for x in r5]
 
plt.bar(r1, bars1, width = barWidth, color = 'C0', alpha=0.5, label='UserKNN')
plt.bar(r3, bars3, width = barWidth, color = 'C1', alpha=0.5, label='Popularity')
plt.bar(r5, bars5, width = barWidth, color = 'C2', alpha=0.5, label='Gain')
plt.bar(r2, bars2, width = barWidth, color = 'C0', label='UserKNN + Reuse')
plt.bar(r4, bars4, width = barWidth, color = 'C1', label='Popularity + Reuse')
plt.bar(r6, bars6, width = barWidth, color = 'C2', label='Gain + Reuse')

# general layout
plt.axhline(y=0, linestyle="dashed", color="grey")
plt.xticks(r3 + np.array(barWidth/2), [r"$k=5$", r"$k=10$", r"$k=15$", r"$k=30$"])
plt.ylabel("Change in MAE")
plt.legend(ncol=2)
plt.title("Influence of all users")
 
# Show graphic
plt.show()

In [20]:
barWidth = 0.1
 
bars1 = avg_influence_mentors_top_n["UserKNN"]
bars2 = avg_influence_mentors_top_n["UserKNN + Reuse"]
bars3 = avg_influence_mentors_top_n["Popularity"]
bars4 = avg_influence_mentors_top_n["Popularity + Reuse"]
bars5 = avg_influence_mentors_top_n["Gain"]
bars6 = avg_influence_mentors_top_n["Gain + Reuse"]
 
# The x position of bars
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]
r5 = [x + barWidth for x in r4]
r6 = [x + barWidth for x in r5]
 
plt.bar(r1, bars1, width = barWidth, color = 'C0', alpha=0.5, label='UserKNN')
plt.bar(r3, bars3, width = barWidth, color = 'C1', alpha=0.5, label='Popularity')
plt.bar(r5, bars5, width = barWidth, color = 'C2', alpha=0.5, label='Gain')
plt.bar(r2, bars2, width = barWidth, color = 'C0', label='UserKNN + Reuse')
plt.bar(r4, bars4, width = barWidth, color = 'C1', label='Popularity + Reuse')
plt.bar(r6, bars6, width = barWidth, color = 'C2', label='Gain + Reuse')

# general layout
plt.axhline(y=0, linestyle="dashed", color="grey")
plt.xticks(r3 + np.array(barWidth/2), [r"$k=5$", r"$k=10$", r"$k=15$", r"$k=30$"])
plt.ylabel("Jaccard Distance")
plt.legend(ncol=2)
plt.title("Influence of top-10 mentors")
 
# Show graphic
plt.show()

In [21]:
barWidth = 0.1
 
bars1 = avg_influence_nomentors_top_n["UserKNN"]
bars2 = avg_influence_nomentors_top_n["UserKNN + Reuse"]
bars3 = avg_influence_nomentors_top_n["Popularity"]
bars4 = avg_influence_nomentors_top_n["Popularity + Reuse"]
bars5 = avg_influence_nomentors_top_n["Gain"]
bars6 = avg_influence_nomentors_top_n["Gain + Reuse"]
 
# The x position of bars
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]
r5 = [x + barWidth for x in r4]
r6 = [x + barWidth for x in r5]
 
plt.bar(r1, bars1, width = barWidth, color = 'C0', alpha=0.5, label='UserKNN')
plt.bar(r3, bars3, width = barWidth, color = 'C1', alpha=0.5, label='Popularity')
plt.bar(r5, bars5, width = barWidth, color = 'C2', alpha=0.5, label='Gain')
plt.bar(r2, bars2, width = barWidth, color = 'C0', label='UserKNN + Reuse')
plt.bar(r4, bars4, width = barWidth, color = 'C1', label='Popularity + Reuse')
plt.bar(r6, bars6, width = barWidth, color = 'C2', label='Gain + Reuse')

# general layout
plt.axhline(y=0, linestyle="dashed", color="grey")
plt.xticks(r3 + np.array(barWidth/2), [r"$k=5$", r"$k=10$", r"$k=15$", r"$k=30$"])
plt.ylabel("Jaccard Distance")
plt.legend(ncol=2)
plt.title("Influence of all users")
 
# Show graphic
plt.show()