In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx
import networkx as nx
import numpy as np
import sys
import pandas as pd
sys.path.insert(0,'..')
from src.data.data_loader import GraphDataset
import pickle
from src.evaluation.network_split import NetworkSplitShchur

In [2]:
# Names of the datasets processed by the cells below; each name is also
# interpolated into the data file paths.
datasets = [
    'cora',
    'citeseer',
    'pubmed',
    'cora_full',
    'twitter',
    'webkb',
]

In [3]:
def load_communities(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in binary mode and passed to ``pickle.load``; the
    unpickled object is returned unchanged.
    """
    # NOTE(review): pickle.load can run arbitrary code while unpickling,
    # so this should only ever be pointed at trusted files.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def load_labels(path):
    """Read a whitespace-delimited text file into a ``{first: last}`` dict.

    Every non-empty line is split on whitespace; its first token becomes the
    key and its last token the value. If a key occurs on several lines, the
    last occurrence wins.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    dict
        Mapping from the first token of each line to its last token
        (both kept as strings).
    """
    label = {}
    with open(path, 'r') as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line (e.g. a trailing newline at
                # the end of the file): indexing tokens[0] would raise
                # IndexError, so there is simply nothing to record.
                continue
            label[tokens[0]] = tokens[-1]
    return label
def agg(x):
    """Return how many distinct values ``x`` holds (via ``x.unique()``)."""
    distinct_values = x.unique()
    return len(distinct_values)

In [4]:
def calc_uncertainty(df_community,dataset_name,labeled=False,seed=0):
    """Return the normalised information gain of labels given communities.

    Computes ``(H(L) - H(L|C)) / H(L)`` with base-2 entropies, where ``L`` is
    the ``label`` column and ``C`` the ``community`` column of
    ``df_community``.

    Parameters
    ----------
    df_community : pandas.DataFrame
        Must have ``node``, ``community`` and ``label`` columns, plus a
        boolean ``labeled{seed}`` column when ``labeled`` is true. The frame
        passed in is not modified.
    dataset_name : str
        When equal to ``'cora'`` every label is first reduced to its
        upper-case characters.
    labeled : bool
        If true, only rows whose ``labeled{seed}`` entry is true are used.
    seed : int
        Selects which ``labeled{seed}`` column is used for that filter.

    Returns
    -------
    float
        The ratio described above. ``nan`` is returned when no rows remain
        after filtering or when ``H(L)`` is zero (only one distinct label),
        because the ratio is undefined in both cases.
    """
    if dataset_name == 'cora':
        # assign() builds a new frame, so the caller's label column is left
        # untouched (assigning to df_community.label would overwrite it).
        # NOTE(review): keeping only upper-case characters maps labels that
        # share initials (e.g. 'Reinforcement_Learning' and 'Rule_Learning',
        # if those occur in the label file) to the same string - confirm
        # that merging such classes is intended.
        df_community = df_community.assign(
            label=df_community.label.apply(
                lambda x: ''.join(c for c in x if c.isupper())
            )
        )

    if labeled:
        df_community = df_community[df_community[f'labeled{seed}']]

    total = len(df_community)
    if total == 0:
        # Nothing selected: every probability below would be 0/0.
        return float('nan')

    communities = df_community.community.unique()
    labels = df_community.label.unique()

    # joint.loc[c, l]: number of distinct nodes in community c carrying
    # label l, divided by the number of rows.
    joint = df_community.pivot_table(
        index='community', columns='label', values='node',
        aggfunc=lambda nodes: len(nodes.unique()),
    ).fillna(0) / total

    # Marginals are computed once here instead of re-filtering the whole
    # frame for every (community, label) pair.
    p_community = df_community.community.value_counts() / total
    p_label = df_community.label.value_counts() / total

    # Conditional entropy H(L|C) = sum_c P(c) * sum_l P(l|c) * log2(1 / P(l|c)).
    H = 0
    for c in communities:
        p_c = p_community.loc[c]
        h = 0
        for lab in labels:
            p_cond = joint.loc[c, lab] / p_c
            if p_cond == 0:
                # 0 * log(1/0) is taken to be 0: the term contributes nothing.
                continue
            h += p_cond * np.log2(1. / p_cond)
        H += h * p_c

    # Label entropy H(L); every label in `labels` occurs at least once, so
    # its probability is strictly positive.
    Hl = 0
    for lab in labels:
        p_l = p_label.loc[lab]
        Hl += p_l * np.log2(1. / p_l)

    if Hl == 0:
        # Only one distinct label: the normalisation would be 0/0.
        return float('nan')

    IG = Hl - H
    return IG / Hl


In [5]:
# For every dataset: load the graph, the per-node community ids and labels,
# draw `splits` differently seeded splits, and store one calc_uncertainty
# score per seed (computed on that split's training nodes) in relIG.
relIG = {}
isDirected = False
isReversed = False
splits = 20
# The cache-path f-string below used `directionality`, which was never
# defined, so setting isDirected = True raised NameError.
# NOTE(review): the tag values chosen here are a guess - confirm them against
# the naming used for the directed caches elsewhere in the project.
directionality = 'reversed' if isReversed else 'directed'
for dataset_name in datasets:
    print(dataset_name)
    root_suffix = ("_" + directionality) if isDirected else ""
    dataset = GraphDataset(f'../data/tmp/{dataset_name}{root_suffix}-', dataset_name,
                           f'../data/graphs/processed/{dataset_name}/{dataset_name}.cites',
                           f'../data/graphs/processed/{dataset_name}/{dataset_name}.content',
                           directed=isDirected, reverse=isReversed)
    data = dataset[0]
    community = load_communities(f'../data/community_id_dicts/{dataset_name}/{dataset_name}_louvain.pickle')
    # presumably maps node names to integer node indices - it is used below
    # to index split.train_mask; verify against GraphDataset
    mapping = data.node_name_mapping[0]
    label = load_labels(f'../data/graphs/processed/{dataset_name}/{dataset_name}.content')

    # One row per node that has a community id.
    df_community = pd.DataFrame({'dataset':dataset_name, 'node':node, 'community':community[node], 'label':label[node]} for node in community)
    df_community['node_id'] = df_community.node.apply(lambda x:mapping[x])

    for seed in range(splits):
        split = NetworkSplitShchur(dataset, train_examples_per_class=20,early_examples_per_class=0,
             val_examples_per_class=30, split_seed=seed)
        # Store plain bools (not 0-d numpy arrays) so the column is a proper
        # boolean mask when calc_uncertainty filters on it.
        df_community[f'labeled{seed}'] = df_community.node_id.apply(lambda x: bool(split.train_mask[x]))

    # One score per seed, restricted to the rows flagged in labeled{seed}.
    ulc = [calc_uncertainty(df_community, dataset_name, labeled=True, seed=seed) for seed in range(splits)]
    relIG[dataset_name] = ulc

cora
citeseer
pubmed
cora_full
twitter
webkb


In [14]:
# Print mean and standard deviation of the per-seed scores for each dataset.
# The loop variable is `dataset_name` rather than `dataset` so that it does
# not overwrite the GraphDataset object bound to `dataset` by the previous
# cell with a plain string.
for dataset_name in datasets:
    scores = relIG[dataset_name]
    print(f'{dataset_name}: U(L|C)={np.mean(scores):.3f}' + u"\u00B1" + f'{np.std(scores):.2f}')

cora: U(L|C)=0.691±0.03
citeseer: U(L|C)=0.647±0.04
pubmed: U(L|C)=0.673±0.09
cora_full: U(L|C)=0.479±0.01
twitter: U(L|C)=0.538±0.13
webkb: U(L|C)=0.320±0.07
