In [1]:
import glob
import pandas as pd
# %matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
import seaborn as sns
sns.set_style("darkgrid")
import scipy as sp

In [3]:
user_groups = "005"
results_path = "results/" + user_groups + "/"
datasets = ['Epinion', 'MovieLens', 'BookCrossing', 'AmazonOffice', 'AmazonToy', 'Gowalla', 'LastFM', 'Foursquare']

In [4]:
for dataset in datasets:
    print(dataset)
    ds_user_act = pd.read_csv(f'datasets/{dataset}/groups/users/{user_groups}/active_ids.txt', names=['uid'])
    ds_user_inact = pd.read_csv(f'datasets/{dataset}/groups/users/{user_groups}/inactive_ids.txt')

    # print(f"{dataset} > adv. users: {ds_user_act.shape[0]}, disadv. users: {ds_user_inact.shape[0]} ")
    print(f"& {ds_user_act.shape[0]} & {ds_user_inact.shape[0]}")


Epinion
& 134 & 2542
MovieLens
& 47 & 895
BookCrossing
& 57 & 1078
AmazonOffice
& 122 & 2325
AmazonToy
& 108 & 2061
Gowalla
& 56 & 1073
LastFM
& 90 & 1706
Foursquare
& 78 & 1489


In [None]:
results_files = []

for dataset in datasets:
    for file in glob.glob(results_path + dataset + "/*.csv"):
        results_files.append(file)
print(f"No. of files: {len(results_files)}")

In [None]:
all_dfs = []

for result_file in results_files:
    result_df = pd.read_csv(result_file, sep=",")
    all_dfs.append(result_df)

# concat all of the dataframes
results_df = pd.concat(all_dfs)

In [None]:
# sort results dataframe
results_df = results_df.sort_values(["Dataset", "Model", 'Type'], ascending = (False, True, False))

In [None]:
results_df.head()

In [None]:
def UGF(row):
    return round(row['ndcg_ACT'] - row['ndcg_INACT'], 4)

In [None]:
results_df['UGF'] = results_df.apply(lambda row: UGF(row), axis=1)

In [None]:
results_df.head()

In [None]:
ds_group_results_df = results_df.groupby('Dataset')

def main_results_extraction(dataset='Epinion', model='NeuMF'):
    global heatmap_df
    global boxplot_df
    global boxplot_UGF_df
    ml_group_results_df = ds_group_results_df.get_group(dataset).groupby('Model')
    model_df = ml_group_results_df.get_group(model)
    N_df = model_df[(model_df.Type == 'N')]
    # print(model)
    C_df = model_df[(model_df.Type == 'C') & (model_df.UGF > 0)]
    # print(C_df)
    if C_df.empty == True:
        # all UGfs are negative
        C_df = model_df[(model_df.Type == 'C') & (model_df.UGF == model_df.UGF.max())]
    else:
        C_df = C_df[C_df.UGF == C_df.UGF.min()]

    _UGF_N = round(N_df.UGF.values[0], 4)
    _UGF_C = round(C_df.UGF.values[0], 4)

    _NDCG_N = round(N_df.ndcg_ALL.values[0], 4)
    _NDCG_C = round(C_df.ndcg_ALL.values[0], 4)

    _Nov_N = round(N_df.Nov_ALL.values[0], 4)
    _Nov_C = round(C_df.Nov_ALL.values[0], 4)

    _DeltaGAP_N = round(abs(N_df.Active_GAP.values[0] + N_df.Inactive_Gap.values[0]), 4) / 2
    _DeltaGAP_C = round(abs(C_df.Active_GAP.values[0] + C_df.Inactive_Gap.values[0]), 4) / 2

    # _DeltaGAP_N_Imp = _DeltaGAP_N

    print(f"\multirow{{2}}{{*}}{{{N_df.Model.values[0]}}} & Org. & {round(N_df.ndcg_ALL.values[0], 4)} & {round(N_df.ndcg_ACT.values[0], 4)} & {round(N_df.ndcg_INACT.values[0], 4)} & {round(N_df.UGF.values[0], 4)} & {round(abs((_UGF_N - _UGF_N) / _UGF_N) * 100, 2)} && {round(N_df.Nov_ALL.values[0], 4)} & {N_df.Cov_ALL.values[0]} & {round(N_df.Short_Items.values[0], 4)} & {round(N_df.Long_Items.values[0], 4)} & {round(N_df.Active_GAP.values[0], 4)} & {round(N_df.Inactive_Gap.values[0], 4)} \\\\")
    print(f"                       & Fair & {round(C_df.ndcg_ALL.values[0], 4)} & {round(C_df.ndcg_ACT.values[0], 4)} & {round(C_df.ndcg_INACT.values[0], 4)} & {round(C_df.UGF.values[0], 4)} & {round(abs((_UGF_C - _UGF_N) / _UGF_N) * 100, 2)} && {round(C_df.Nov_ALL.values[0], 4)} & {C_df.Cov_ALL.values[0]} & {round(C_df.Short_Items.values[0], 4)} & {round(C_df.Long_Items.values[0], 4)} & {round(C_df.Active_GAP.values[0], 4)} & {round(C_df.Inactive_Gap.values[0], 4)} \\\ \hline")
    result_table.write(f"{N_df.Model.values[0]},Org.,{round(N_df.ndcg_ALL.values[0], 4)},{round(N_df.ndcg_ACT.values[0], 4)},{round(N_df.ndcg_INACT.values[0], 4)},{round(N_df.UGF.values[0], 4)},{round(abs((_UGF_N - _UGF_N) / _UGF_N) * 100, 2)},{round(N_df.Nov_ALL.values[0], 4)},{N_df.Cov_ALL.values[0]},{round(N_df.Short_Items.values[0], 4)},{round(N_df.Long_Items.values[0], 4)},{round(N_df.Active_GAP.values[0], 4)},{round(N_df.Inactive_Gap.values[0], 4)} \n")
    result_table.write(f"{C_df.Model.values[0]},Fair,{round(C_df.ndcg_ALL.values[0], 4)},{round(C_df.ndcg_ACT.values[0], 4)},{round(C_df.ndcg_INACT.values[0], 4)},{round(C_df.UGF.values[0], 4)},{round(abs((_UGF_C - _UGF_N) / _UGF_N) * 100, 2)},{round(C_df.Nov_ALL.values[0], 4)},{C_df.Cov_ALL.values[0]},{round(C_df.Short_Items.values[0], 4)},{round(C_df.Long_Items.values[0], 4)},{round(C_df.Active_GAP.values[0], 4)},{round(C_df.Inactive_Gap.values[0], 4)} \n")
    # heatmap_df = heatmap_df.append({'Dataset': model_df.Dataset.values[0], 'Group': 'All', 'Model': model_df.Model.values[0], 'Precision': model_df.Pre_ALL.values[0]}, ignore_index=True)
    # N
    heatmap_df = heatmap_df.append({'Dataset': model_df.Dataset.values[0],'Group': 'Org.', 'Model': model_df.Model.values[0], 'UGF': (N_df.Pre_ACT.values[0] - N_df.Pre_INACT.values[0])}, ignore_index=True)
    heatmap_df = heatmap_df.append({'Dataset': model_df.Dataset.values[0],'Group': 'Fair', 'Model': model_df.Model.values[0], 'UGF': (C_df.Pre_ACT.values[0] - C_df.Pre_INACT.values[0])}, ignore_index=True)
    # heatmap_df = heatmap_df.append({'Dataset': model_df.Dataset.values[0],'Group': 'Fair (Act.)', 'Model': model_df.Model.values[0], 'Precision': C_df.Pre_ACT.values[0]}, ignore_index=True)
    # heatmap_df = heatmap_df.append({'Dataset': model_df.Dataset.values[0],'Group': 'Fair (Inact.)', 'Model': model_df.Model.values[0], 'Precision': C_df.Pre_INACT.values[0]}, ignore_index=True)

    boxplot_df = boxplot_df.append({'Dataset': model_df.Dataset.values[0], 'Model': model_df.Model.values[0], 'Type': 'Original', 'UGF': N_df.UGF.values[0], 'impNDCG': round(abs((_NDCG_N - _NDCG_N) / _NDCG_N) * 100, 2), 'impNov': round(abs((_Nov_N - _Nov_N) / _Nov_N) * 100, 2), 'Cov': N_df.Cov_ALL.values[0], 'DeltaGAP': round(N_df.Active_GAP.values[0] - N_df.Inactive_Gap.values[0], 4), 'avgDeltaGap': _DeltaGAP_N, 'impDeltaGap':round((_DeltaGAP_N - _DeltaGAP_N) / _DeltaGAP_N, 5), 'impUGF': round(abs((_UGF_N - _UGF_N) / _UGF_N) * 100, 2), 'cntLongItems': round(N_df.Long_Items.values[0], 4)}, ignore_index=True)
    boxplot_df = boxplot_df.append({'Dataset': model_df.Dataset.values[0], 'Model': model_df.Model.values[0], 'Type': 'Fair', 'UGF': C_df.UGF.values[0], 'impNDCG': round(abs((_NDCG_C - _NDCG_N) / _NDCG_N) * 100, 2), 'impNov': round(abs((_Nov_C - _Nov_N) / _Nov_N) * 100, 2), 'Cov': C_df.Cov_ALL.values[0], 'DeltaGAP': round(C_df.Active_GAP.values[0] - C_df.Inactive_Gap.values[0], 4), 'avgDeltaGap': _DeltaGAP_C, 'impDeltaGap': round((_DeltaGAP_C - _DeltaGAP_N) / _DeltaGAP_N, 5), 'impUGF': round(abs((_UGF_C - _UGF_N) / _UGF_N) * 100, 2), 'cntLongItems': round(C_df.Long_Items.values[0], 4)}, ignore_index=True)

    boxplot_UGF_df = boxplot_UGF_df.append({'Dataset': model_df.Dataset.values[0], 'Model': model_df.Model.values[0], 'UGF-Org': N_df.UGF.values[0], 'UGF-Fair': round(abs((_UGF_C - _UGF_N)), 2)}, ignore_index=True)

In [None]:
heatmap_df = pd.DataFrame(columns=['Dataset', 'Group', 'Model', 'UGF', ])
boxplot_df = pd.DataFrame(columns=['Dataset', 'Model','Type', 'UGF', 'impNDCG', 'impNov', 'Cov', 'DeltaGAP', 'avgDeltaGap','impDeltaGap', 'impUGF', 'cntLongItems'])
boxplot_UGF_df = pd.DataFrame(columns=['Dataset', 'Model', 'UGF-Org', 'UGF-Fair'])

Models = ['MostPop', 'BPR', 'PF', 'WMF', 'NeuMF', 'VAECF']
for dataset in datasets:
    result_table = open(f"tabels/{user_groups}/result_table_{dataset}.csv", 'w')
    result_table.write("Model,Type,All,Act,Inact,UGF,Improv,Nov,Cov,Short,Long,GAPAct,GAPInact \n")
    print(f"> Dataset: {dataset}")
    for model in Models:
        main_results_extraction(dataset=dataset, model=model)
    result_table.close()

In [None]:
boxplot_df[(boxplot_df.Type == 'Original') & (boxplot_df.Model == 'MostPop')]

In [None]:
for model in Models:
    # print(model)
    longitems_df = boxplot_df[(boxplot_df.Type == 'Original') & (boxplot_df.Model == model)]['cntLongItems']
    # print(longitems_df)
    print(f"{model}: {np.mean(longitems_df)}")

In [None]:
impUGF_fair_df = boxplot_df[boxplot_df.Type == 'Fair']['impUGF']
UGF_org_df = boxplot_df[boxplot_df.Type == 'Original']['UGF']

In [None]:
# impUGF_fair_df.merge(UGF_org_df)
sns.regplot(x=impUGF_fair_df, y=UGF_org_df)

v, p = sp.stats.pearsonr(impUGF_fair_df, UGF_org_df)
print(round(v, 5) , round(p, 7))

In [None]:
avg_df = boxplot_df[boxplot_df.Type == 'Fair']

In [None]:
np.mean(avg_df.impNDCG), np.mean(avg_df.impNov), np.mean(avg_df.impUGF), np.mean(avg_df.impDeltaGap), np.mean(avg_df.avgDeltaGap)

In [None]:
# %improvmenet for each mode on original and fair type

def get_model_average_UGF(type='Original'):
    boxplot_type_df = boxplot_df[boxplot_df.Type == type].groupby('Model')
    print(f"The average of UGF on each model for the {type} version ...")
    Models = ['MostPop', 'BPR', 'PF', 'WMF', 'NeuMF', 'VAECF']
    for eachmodel in Models:
        print(f"{eachmodel}: {round(np.mean(boxplot_type_df.get_group(eachmodel)['impUGF']), 4)}")

for type in ['Original', 'Fair']:
    get_model_average_UGF(type=type)

In [None]:
sns.regplot(x="UGF-Org", y='UGF-Fair', data=boxplot_UGF_df[boxplot_UGF_df.Dataset == 'LastFM'])

In [None]:
# sort results dataframe
heatmap_df = heatmap_df.sort_values(["Dataset", "Model", 'Group'], ascending = (False, True, True))
group_heatmap_df = heatmap_df.groupby('Model')

In [None]:
df_Pop = group_heatmap_df.get_group('MostPop').pivot("Dataset", "Group", "UGF")
df_BPR = group_heatmap_df.get_group('BPR').pivot("Dataset", "Group", "UGF")
df_PF = group_heatmap_df.get_group('PF').pivot("Dataset", "Group", "UGF")
df_WMF = group_heatmap_df.get_group('WMF').pivot("Dataset", "Group", "UGF")
df_NeuMF = group_heatmap_df.get_group('NeuMF').pivot("Dataset", "Group", "UGF")
df_VAECF = group_heatmap_df.get_group('VAECF').pivot("Dataset", "Group", "UGF")

f,(ax0,ax1,ax2,ax3,ax4,ax5,axcb) = plt.subplots(1,7, gridspec_kw={'width_ratios':[1,1,1,1,1,1,0.06]}, figsize=(10, 5))

g0 = sns.heatmap(data=df_Pop.reindex(['Org.', 'Fair'], axis=1), annot=True, cmap="YlGnBu", cbar=False, ax=ax0)
g0.set_ylabel('')
g0.set_xlabel('MostPop', fontsize=14)

g1 = sns.heatmap(data=df_BPR.reindex(['Org.', 'Fair'], axis=1),annot=True, cmap="YlGnBu",cbar=False,ax=ax1)
g1.set_ylabel('')
g1.set_xlabel('BPR', fontsize=14)
g1.set_yticks([])

g2 = sns.heatmap(data=df_PF.reindex(['Org.', 'Fair'], axis=1),annot=True, cmap="YlGnBu",cbar=False,ax=ax2)
g2.set_ylabel('')
g2.set_xlabel('PF', fontsize=14)
g2.set_yticks([])

g3 = sns.heatmap(data=df_WMF.reindex(['Org.', 'Fair'], axis=1),annot=True, cmap="YlGnBu",cbar=False,ax=ax3)
g3.set_ylabel('')
g3.set_xlabel('WMF', fontsize=14)
g3.set_yticks([])

g4 = sns.heatmap(data=df_NeuMF.reindex(['Org.', 'Fair'], axis=1),annot=True, cmap="YlGnBu",cbar=False,ax=ax4)
g4.set_ylabel('')
g4.set_xlabel('NeuMF', fontsize=14)
g4.set_yticks([])

g5 = sns.heatmap(data=df_VAECF.reindex(['Org.', 'Fair'], axis=1),annot=True, cmap="YlGnBu",ax=ax5, cbar_ax=axcb)
g5.set_ylabel('')
g5.set_xlabel('VAECF', fontsize=14)
g5.set_yticks([])

# may be needed to rotate the ticklabels correctly:
for ax in [g0,g1,g2,g3,g4,g5]:
    tl = ax.get_xticklabels()
    ax.set_xticklabels(tl, rotation=40)
    tly = ax.get_yticklabels()
    ax.set_yticklabels(tly, rotation=0)


plt.savefig(f'plots/{user_groups}/heatmap_{user_groups}.pdf', bbox_inches='tight')

In [None]:
group_boxplot_df = boxplot_df.groupby('Dataset')

In [None]:
# for dataset in datasets:
#     print(f"Dataset: {dataset}")
#     box_plt = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group(dataset))
#     plt.show()

In [None]:
import seaborn  as sns
import matplotlib.pyplot as plt

# plt.xticks(fontsize=14, rotation=90)

f,(ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8) = plt.subplots(1,8, gridspec_kw={'width_ratios':[1,1,1,1,1,1,1,1]}, figsize=(35, 6))
# ax1.get_shared_y_axes().join(ax2,ax3)

g1 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('MovieLens'), ax=ax1)
g1.set_ylabel('UGF', fontsize=16)
g1.set_xlabel('MovieLens100K', fontsize=16)
g1.set_xticklabels(g1.get_xticklabels(), fontsize=14)
g1.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
g1.set_yticklabels(np.round(g1.get_yticks(), 2), fontsize=16)

g2 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('Epinion'), ax=ax2)
g2.set_ylabel('')
g2.set_xlabel('Epinion', fontsize=16)
g2.set_xticklabels(g2.get_xticklabels(), fontsize=14)
g2.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
g2.set_yticklabels(np.round(g2.get_yticks(), 2), fontsize=2, color='w')
# g2.set_yticks([])

g3 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('BookCrossing'), ax=ax3)
g3.set_ylabel('')
g3.set_xlabel('BookCrossing', fontsize=16)
g3.set_xticklabels(g3.get_xticklabels(), fontsize=14)
g3.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
# g3.set_yticks([])
g3.set_yticklabels(np.round(g3.get_yticks(), 2), fontsize=2, color='w')

g4 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('AmazonOffice'), ax=ax4)
g4.set_ylabel('')
g4.set_xlabel('AmazonOffice', fontsize=16)
g4.set_xticklabels(g4.get_xticklabels(), fontsize=14)
g4.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
# g4.set_yticks([])
g4.set_yticklabels(np.round(g4.get_yticks(), 2), fontsize=2, color='w')

g5 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('AmazonToy'), ax=ax5)
g5.set_ylabel('')
g5.set_xlabel('AmazonToy', fontsize=16)
g5.set_xticklabels(g5.get_xticklabels(), fontsize=14)
g5.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
# g5.set_yticks([])
g5.set_yticklabels(np.round(g5.get_yticks(), 2), fontsize=2, color='w')

g6 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('LastFM'), ax=ax6)
g6.set_ylabel('')
g6.set_xlabel('LastFM', fontsize=16)
g6.set_xticklabels(g6.get_xticklabels(), fontsize=14)
g6.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
# g6.set_yticks([])
g6.set_yticklabels(np.round(g6.get_yticks(), 2), fontsize=2, color='w')

g7 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('Gowalla'), ax=ax7)
g7.set_ylabel('')
g7.set_xlabel('Gowalla', fontsize=16)
g7.set_xticklabels(g7.get_xticklabels(), fontsize=14)
g7.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
# g7.set_yticks([])
g7.set_yticklabels(np.round(g7.get_yticks(), 2), fontsize=2, color='w')

g8 = sns.boxplot(x="Type", y="UGF", data=group_boxplot_df.get_group('Foursquare'), ax=ax8)
g8.set_ylabel('')
g8.set_xlabel('Foursquare', fontsize=16)
g8.set_xticklabels(g8.get_xticklabels(), fontsize=14)
g8.set_ylim(min(boxplot_df.UGF), round(max(boxplot_df.UGF), 2))
# g8.set_yticks([])
g8.set_yticklabels(np.round(g8.get_yticks(), 2), fontsize=2, color='w')

plt.savefig(f'plots/{user_groups}/boxplot_{user_groups}.pdf', bbox_inches='tight')

In [None]:
ax, fig = plt.subplots(figsize=[9,4])
p = sns.boxplot(x="Model", y="ImpPrecent", data=boxplot_df[boxplot_df.Type == 'Fair'])
p.set_yticklabels(np.round(p.get_yticks(), 4), fontsize=14)
p.set_xticklabels(p.get_xticklabels(), fontsize=14)
p.set_ylabel(r'Δ% (UGF)', fontsize=16)
p.set_xlabel('Dataset', fontsize=16)
p.set_xticklabels(p.get_xticklabels(), rotation=30)

plt.savefig(f'plots/{user_groups}/boxplot_ds_{user_groups}.pdf', bbox_inches='tight')

In [None]:
ax, fig = plt.subplots(figsize=[9,4])
p = sns.boxplot(x="Model", y="ImpPrecent", data=boxplot_df[boxplot_df.Type == 'Fair'])
p.set_yticklabels(np.round(p.get_yticks(), 4), fontsize=14)
p.set_xticklabels(p.get_xticklabels(), fontsize=14)
p.set_ylabel(r'Δ% (UGF)', fontsize=16)
p.set_xlabel('Models', fontsize=16)
p.set_xticklabels(p.get_xticklabels(), rotation=30)

plt.savefig(f'plots/{user_groups}/boxplot_ml_{user_groups}.pdf', bbox_inches='tight')

In [None]:
# plots to show the correlation between the improvement of UGF and the other metrics
# ΔGAP (Act. - Inact.)

labels = {'impNDCG': r'Δ% (NDCG)', 'impNov': r'Δ% (Nov)', 'impDeltaGap': r'Δ% (ΔGAP)'}
for metric in ['impNDCG', 'impNov', 'impDeltaGap']:
    fig, ax = plt.subplots()
    fig.set_size_inches(3, 2)
    r = sns.regplot(x="impUGF", y=metric, data=boxplot_df[(boxplot_df.Type == 'Fair')], ax=ax)
    x = boxplot_df[(boxplot_df.Type == 'Fair')]
    v, p = sp.stats.pearsonr(x['impUGF'], x[metric])
    print(round(v, 5) , round(p, 7))
    r.set_yticklabels(np.round(r.get_yticks(), 4), fontsize=10)
    r.set_xticklabels(np.round(r.get_xticks(), 4), fontsize=10)
    
    r.set_ylabel(f"{labels[metric]}", fontsize=10)
    r.set_xlabel(r'Δ% (UGF)', fontsize=10)

    fig.savefig(f'plots/{user_groups}/boxplot_corr_{metric}_{user_groups}.pdf', bbox_inches='tight')

In [None]:
boxplot_df[(boxplot_df.Type == 'Fair')]['impDeltaGap']

In [None]:
a = sns.regplot(x="UGF", y=metric, data=boxplot_df[boxplot_df.Type == 'Fair'])

In [None]:
# user profile dict -> uid: profile size
# active users list
# inactive users list

from collections import defaultdict
from tqdm import tqdm
import numpy as np

def user_interactions(dataset='LastFM', is_implicit=True):
    # loading the Book-Crossing dataset
    train_data = pd.read_csv(f"datasets/{dataset}/{dataset}_train.txt", sep="\t", names=['uid', 'iid', 'count'])
    user_profiles = defaultdict(list)
    for eachline in tqdm(train_data.itertuples(index=True)):
        uid, iid, count = int(eachline.uid), int(eachline.iid), int(eachline.count)
        uid, iid, count = int(uid), int(iid), int(count)
        if is_implicit:
            for i in range(count):
                if uid in user_profiles.keys():
                    user_profiles[uid].append(iid)
                else:
                    user_profiles[uid] = [iid]
        else:
            if uid in user_profiles.keys():
                user_profiles[uid].append(iid)
            else:
                user_profiles[uid] = [iid]
    return user_profiles

def avg_profile_group(dataset='LastFM', is_implicit=True):
    global user_profile_df
    global user_size_df
    users_profile = user_interactions(dataset=dataset, is_implicit=is_implicit)
    active_users = open(f"datasets/{dataset}/groups/users/{user_groups}/active_ids.txt").readlines()
    active_users = [int(uid.strip()) for uid in active_users]
    inactive_users = open(f"datasets/{dataset}/groups/users/{user_groups}/inactive_ids.txt").readlines()
    inactive_users = [int(uid.strip()) for uid in inactive_users]
    active_user_profile_size = [len(users_profile[uid]) for uid in active_users if uid in users_profile]
    for active_user_size in active_user_profile_size:
        user_size_df = user_size_df.append({'Dataset': dataset, 'group': 'advantaged', 'size': active_user_size}, ignore_index=True)
    inactive_user_profile_size = [len(users_profile[uid]) for uid in inactive_users if uid in users_profile]
    for inactive_user_size in inactive_user_profile_size:
        user_size_df = user_size_df.append({'Dataset': dataset, 'group': 'disadvantaged', 'size': inactive_user_size}, ignore_index=True)
    user_profile_df = user_profile_df.append({'Dataset': dataset, 'group': 'advantaged', 'avg': np.mean(active_user_profile_size)}, ignore_index=True)
    user_profile_df = user_profile_df.append({'Dataset': dataset, 'group': 'disadvantaged', 'avg': np.mean(inactive_user_profile_size)}, ignore_index=True)

In [None]:
user_profile_df = pd.DataFrame(columns=['Dataset', 'group', 'avg'])
user_size_df = pd.DataFrame(columns=['Dataset', 'group', 'size'])
# datasets = ['Epinion-Flase', 'MovieLens-False', 'BookCrossing-False', 'AmazonOffice-False', 'AmazonToy-False', 'Gowalla-Ture', 'LastFM-', 'Foursquare-True']
datasets = ['Epinion-Flase']
for datasetIsimplicit in datasets:
    dataset, Isimplicit = datasetIsimplicit.split('-')
    if Isimplicit == 'True':
        avg_profile_group(dataset=dataset, is_implicit=True)
    else:
        avg_profile_group(dataset=dataset, is_implicit=False)

In [None]:
user_profile_df

In [None]:
ax = sns.barplot(x="Dataset", y="avg", hue="group", data=user_profile_df)

In [None]:
sns.boxplot(x="Dataset", y="size",
            hue="group", data=user_size_df)

In [None]:
import scipy as sp

In [None]:
for dataset in datasets:
    fig = plt.figure()
    sns.regplot(x="ImpPrecent", y='NDCG', data=boxplot_df[(boxplot_df.Type == 'Fair') & (boxplot_df.Dataset == dataset)])
    x = boxplot_df[(boxplot_df.Type == 'Fair') & (boxplot_df.Dataset == dataset)]
    r, p = sp.stats.pearsonr(x['ImpPrecent'], x['NDCG'])
    print(r , p)