# Wikivitals datasets

In [1]:
import os
import pandas as pd
import pickle
import plotly.express as px
import plotly.graph_objs as go

In [30]:
def load_dict(path: str, filename: str) -> dict:
    """Unpickle and return the object stored in `filename` inside `path`.

    Parameters
    ----------
    path : str
        Directory containing the file.
    filename : str
        Name of the pickled file within `path`.

    Returns
    -------
    dict
        Whatever object was pickled in the file (annotated as a dictionary).

    Notes
    -----
    `pickle.load` can execute arbitrary code; only use it on trusted files.
    """
    full_path = os.path.join(path, filename)
    with open(full_path, 'rb') as handle:
        return pickle.load(handle)

def _matches_filters(filename: str, filters: dict) -> bool:
    """Return True when `filename` satisfies every `key=value` filter.

    A filter matches when the concatenation of the key and the lower-cased
    string form of the value (e.g. ``'penalizedtrue'``) occurs in `filename`.
    The single exception is ``use_features=False``: such a filter accepts any
    file that does not explicitly contain ``'use_featurestrue'``.
    """
    for key, value in filters.items():
        token = str(value).lower()
        if str(key) == 'use_features' and token == 'false':
            if 'use_featurestrue' in filename:
                return False
        elif (str(key) + token) not in filename:
            return False
    return True


def data2df(path: str, **kwargs) -> pd.DataFrame:
    """Load every matching run file in `path` into a single DataFrame.

    Each file in `path` whose name satisfies all `kwargs` filters is loaded
    with `load_dict`. The loaded dictionary is expected to provide a ``'meta'``
    object (attributes ``k``, ``model``, ``dataset`` and optionally
    ``embedding_method``) and a ``'results'`` mapping with the keys
    ``'train acc'``, ``'test acc'`` and ``'elapsed_time'``.

    Parameters
    ----------
    path : str
        Directory containing the pickled run files.
    **kwargs
        Filename filters, e.g. ``undirected=False, penalized=True``; see
        `_matches_filters` for the matching rule.

    Returns
    -------
    pd.DataFrame
        One row per result entry with columns ``dataset``, ``model``, ``k``,
        ``train_acc``, ``test_acc``, ``computation_time``, ``train_size`` and
        ``test_size``. The same eight columns are returned (with no rows)
        when no file matches, so downstream code can rely on the schema.
    """
    columns = ['dataset', 'model', 'k', 'train_acc', 'test_acc',
               'computation_time', 'train_size', 'test_size']

    print(kwargs.items())

    # Collect per-file frames and concatenate once: this avoids quadratic
    # re-copying and never concatenates onto an empty object-dtype frame.
    frames = []
    for file in os.listdir(path):
        if not _matches_filters(file, kwargs):
            continue

        data = load_dict(path, file)
        meta = data.get('meta')
        results = data.get('results')
        k = meta.k

        model = meta.model
        try:
            if meta.embedding_method is not None:
                model = meta.model + '_embedding_method' + meta.embedding_method
        except (AttributeError, TypeError):
            # No `embedding_method` attribute, or one that cannot be appended
            # to the model name: fall back to the bare model name.
            model = meta.model

        # `test_size` is constant within a file, so no per-file sorting is needed.
        frames.append(pd.DataFrame({
            'dataset': [meta.dataset] * k,
            'model': [model] * k,
            'k': [k] * k,
            'train_acc': results.get('train acc'),
            'test_acc': results.get('test acc'),
            'computation_time': results.get('elapsed_time'),
            'train_size': [1 - (2 * (1 / k))] * k,
            'test_size': [2 * (1 / k)] * k,  # test + val
        }))

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

# From Stack Overflow
def line(error_y_mode=None, **kwargs):
    """Wrap `plotly.express.line`, optionally rendering errors as shaded bands.

    Parameters
    ----------
    error_y_mode : {'bar', 'bars', 'band', 'bands', None}
        With 'bar', 'bars' or None the call is forwarded unchanged to
        `px.line`. With 'band' or 'bands' the figure is drawn without error
        bars and a filled polygon spanning ``y + error`` / ``y - error`` is
        added for every line.
    **kwargs
        Forwarded to `px.line`. Must contain ``error_y`` in band mode.

    Returns
    -------
    The plotly figure.

    Raises
    ------
    ValueError
        If `error_y_mode` is not an accepted value, or if band mode is
        requested without an ``error_y`` argument.
    """
    ERROR_MODES = {'bar','band','bars','bands',None}
    if error_y_mode not in ERROR_MODES:
        raise ValueError(f"'error_y_mode' must be one of {ERROR_MODES}, received {repr(error_y_mode)}.")

    # Everything except the band modes is plain plotly express behaviour.
    if error_y_mode not in {'band', 'bands'}:
        return px.line(**kwargs)

    if 'error_y' not in kwargs:
        raise ValueError("If you provide argument 'error_y_mode' you must also provide 'error_y'.")

    # The figure with error bars is only used to read back the error arrays.
    with_bars = px.line(**kwargs)
    plain_kwargs = {name: value for name, value in kwargs.items() if name != 'error_y'}
    fig = px.line(**plain_kwargs)

    for trace in with_bars.data:
        xs = list(trace['x'])
        errors = trace['error_y']
        upper = list(trace['y'] + errors['array'])
        # Symmetric errors unless an explicit lower error array is present.
        minus = errors['array'] if errors['arrayminus'] is None else errors['arrayminus']
        lower = list(trace['y'] - minus)

        # Reuse the line's '#rrggbb' colour with 0.3 opacity for the band.
        hex_code = trace['line']['color'].lstrip('#')
        red, green, blue = (int(hex_code[pos:pos + 2], 16) for pos in (0, 2, 4))
        band_color = f"rgba({red},{green},{blue},.3)"

        band = go.Scatter(
            # Closed polygon: upper edge left-to-right, lower edge back.
            x=xs + xs[::-1],
            y=upper + lower[::-1],
            fill='toself',
            fillcolor=band_color,
            line=dict(color='rgba(255,255,255,0)'),
            hoverinfo="skip",
            showlegend=False,
            legendgroup=trace['legendgroup'],
            xaxis=trace['xaxis'],
            yaxis=trace['yaxis'],
        )
        fig.add_trace(band)

    # Reorder data as said here: https://stackoverflow.com/a/66854398/8849755
    # (each band trace is placed immediately before its line trace).
    traces = fig.data
    half = len(traces) // 2
    interleaved = []
    for band_trace, line_trace in zip(traces[half:2 * half], traces[:half]):
        interleaved.extend((band_trace, line_trace))
    fig.data = tuple(interleaved)
    return fig

## 1. Graph structure only for baselines

In [26]:
# Run files are read from the `runs` directory located in the parent of the
# current working directory.
parent_dir = os.path.dirname(os.getcwd())
RUNPATH = os.path.join(parent_dir, 'runs')

# Collect the runs matching these filename filters into one DataFrame.
baseline_filters = {'undirected': False, 'penalized': True, 'use_features': False}
df = data2df(RUNPATH, **baseline_filters)
print(df.shape)

dict_items([('undirected', False), ('penalized', True), ('use_features', False)])
(959, 8)


In [27]:
# Per-fold results of the Diffusion model on wikivitals with k = 3.
df[df['dataset'].eq('wikivitals') & df['model'].eq('Diffusion') & df['k'].eq(3)]

Unnamed: 0,dataset,model,k,train_acc,test_acc,computation_time,train_size,test_size
729,wikivitals,Diffusion,3,1.0,0.683099,0.067102,0.333333,0.666667
730,wikivitals,Diffusion,3,1.0,0.697782,0.066515,0.333333,0.666667
731,wikivitals,Diffusion,3,1.0,0.693587,0.066292,0.333333,0.666667


In [28]:
# Distinct datasets and models present in the loaded runs.
df['dataset'].unique(), df['model'].unique()

(array(['wikivitals-fr', 'wikivitals', 'wikischools'], dtype=object),
 array(['KNN_embedding_methodtrue', 'LabelPropagation', 'GAT', 'KNN',
        'Diffusion', 'PageRank', 'SGC', 'GCN', 'GraphSage'], dtype=object))

`Wikivitals`

In [33]:
# Mean and standard deviation of the train/test accuracies for every
# (dataset, model, k, train_size, test_size) group. Named aggregation labels
# each output column explicitly instead of renaming columns by position.
grouped = (
    df.groupby(['dataset', 'model', 'k', 'train_size', 'test_size'])
    .agg(
        train_acc_avg=('train_acc', 'mean'),
        train_acc_std=('train_acc', 'std'),
        test_acc_avg=('test_acc', 'mean'),
        test_acc_std=('test_acc', 'std'),
    )
    .reset_index()
)

# Test split: average test accuracy against test size, one line per model,
# with the standard deviation drawn as a shaded band.
fig = line(
    data_frame=grouped[grouped['dataset'] == 'wikivitals'],
    x='test_size',
    y='test_acc_avg',
    error_y='test_acc_std',
    error_y_mode='band',
    color='model',
    title='Test accuracy',
    markers=True,
)
fig.show()

`Wikivitals-fr`

In [35]:
# Mean and standard deviation of the train/test accuracies for every
# (dataset, model, k, train_size, test_size) group. Named aggregation labels
# each output column explicitly instead of renaming columns by position.
grouped = (
    df.groupby(['dataset', 'model', 'k', 'train_size', 'test_size'])
    .agg(
        train_acc_avg=('train_acc', 'mean'),
        train_acc_std=('train_acc', 'std'),
        test_acc_avg=('test_acc', 'mean'),
        test_acc_std=('test_acc', 'std'),
    )
    .reset_index()
)

# Test split: average test accuracy against test size, one line per model,
# with the standard deviation drawn as a shaded band.
fig = line(
    data_frame=grouped[grouped['dataset'] == 'wikivitals-fr'],
    x='test_size',
    y='test_acc_avg',
    error_y='test_acc_std',
    error_y_mode='band',
    color='model',
    title='Test accuracy',
    markers=True,
)
fig.show()

`Wikischools`

In [37]:
# Mean and standard deviation of the train/test accuracies for every
# (dataset, model, k, train_size, test_size) group. Named aggregation labels
# each output column explicitly instead of renaming columns by position.
grouped = (
    df.groupby(['dataset', 'model', 'k', 'train_size', 'test_size'])
    .agg(
        train_acc_avg=('train_acc', 'mean'),
        train_acc_std=('train_acc', 'std'),
        test_acc_avg=('test_acc', 'mean'),
        test_acc_std=('test_acc', 'std'),
    )
    .reset_index()
)

# Test split: average test accuracy against test size, one line per model,
# with the standard deviation drawn as a shaded band.
fig = line(
    data_frame=grouped[grouped['dataset'] == 'wikischools'],
    x='test_size',
    y='test_acc_avg',
    error_y='test_acc_std',
    error_y_mode='band',
    color='model',
    title='Test accuracy',
    markers=True,
)
fig.show()

In [38]:
# Per-fold results of the SGC model on wikischools.
df[df['dataset'].eq('wikischools') & df['model'].eq('SGC')]

Unnamed: 0,dataset,model,k,train_acc,test_acc,computation_time,train_size,test_size
573,wikischools,SGC,9,0.665693,0.62002,7.033724,0.777778,0.222222
574,wikischools,SGC,9,0.706301,0.632653,6.843435,0.777778,0.222222
575,wikischools,SGC,9,0.647975,0.597208,7.037248,0.777778,0.222222
576,wikischools,SGC,9,0.663358,0.612815,6.818525,0.777778,0.222222
577,wikischools,SGC,9,0.694891,0.640082,6.819979,0.777778,0.222222
578,wikischools,SGC,9,0.653236,0.59032,6.848935,0.777778,0.222222
579,wikischools,SGC,9,0.685061,0.620995,6.868691,0.777778,0.222222
580,wikischools,SGC,9,0.663844,0.575324,6.728025,0.777778,0.222222
581,wikischools,SGC,9,0.715036,0.629175,6.756609,0.777778,0.222222


## 2. Features for baselines

In [42]:
# Run files are read from the `runs` directory located in the parent of the
# current working directory.
parent_dir = os.path.dirname(os.getcwd())
RUNPATH = os.path.join(parent_dir, 'runs')

# Runs whose filename carries the `use_featurestrue` flag.
df = data2df(RUNPATH, undirected=False, penalized=True, use_features=True)

# GNN performances are taken from the runs selected with use_features=False
# (no need to use the use_features parameter for them); keep GNN models only.
gnn_models = ['GCN', 'GAT', 'GraphSage', 'SGC']
df_gnns = data2df(RUNPATH, undirected=False, penalized=True, use_features=False)
df_gnns = df_gnns[df_gnns['model'].isin(gnn_models)]

print(df.shape, df_gnns.shape)

dict_items([('undirected', False), ('penalized', True), ('use_features', True)])
dict_items([('undirected', False), ('penalized', True), ('use_features', False)])
(780, 8) (188, 8)


In [43]:
# Distinct models in the feature-based runs and in the GNN runs.
df['model'].unique(), df_gnns['model'].unique()

(array(['KNN', 'PageRank', 'KNN_embedding_methodtrue', 'Diffusion',
        'LabelPropagation'], dtype=object),
 array(['GAT', 'GraphSage', 'SGC', 'GCN'], dtype=object))

In [45]:
# Concatenate results for baselines and GNNs
# Stack baseline and GNN results row-wise; original row labels are kept, so
# the index of the result may contain duplicates.
df_tot = pd.concat(objs=[df, df_gnns], axis=0, ignore_index=False)

In [47]:
# Per-fold results of every model on wikivitals with k = 3.
df_tot[df_tot['dataset'].eq('wikivitals') & df_tot['k'].eq(3)]

Unnamed: 0,dataset,model,k,train_acc,test_acc,computation_time,train_size,test_size
227,wikivitals,LabelPropagation,3,1.0,0.198981,0.206391,0.333333,0.666667
228,wikivitals,LabelPropagation,3,1.0,0.198831,0.203279,0.333333,0.666667
229,wikivitals,LabelPropagation,3,1.0,0.198831,0.197615,0.333333,0.666667
257,wikivitals,PageRank,3,1.0,0.78184,2.071895,0.333333,0.666667
258,wikivitals,PageRank,3,1.0,0.779293,2.055171,0.333333,0.666667
259,wikivitals,PageRank,3,1.0,0.78199,2.070127,0.333333,0.666667
592,wikivitals,KNN_embedding_methodtrue,3,1.0,0.813156,10.592218,0.333333,0.666667
593,wikivitals,KNN_embedding_methodtrue,3,1.0,0.810009,10.646292,0.333333,0.666667
594,wikivitals,KNN_embedding_methodtrue,3,1.0,0.814804,10.663012,0.333333,0.666667
653,wikivitals,KNN,3,1.0,0.596794,45.833864,0.333333,0.666667


`Wikivitals`

In [48]:
# Mean and standard deviation of the train/test accuracies for every
# (dataset, model, k, train_size, test_size) group. Named aggregation labels
# each output column explicitly instead of renaming columns by position.
grouped = (
    df_tot.groupby(['dataset', 'model', 'k', 'train_size', 'test_size'])
    .agg(
        train_acc_avg=('train_acc', 'mean'),
        train_acc_std=('train_acc', 'std'),
        test_acc_avg=('test_acc', 'mean'),
        test_acc_std=('test_acc', 'std'),
    )
    .reset_index()
)

# Test split: average test accuracy against test size, one line per model,
# with the standard deviation drawn as a shaded band.
fig = line(
    data_frame=grouped[grouped['dataset'] == 'wikivitals'],
    x='test_size',
    y='test_acc_avg',
    error_y='test_acc_std',
    error_y_mode='band',
    color='model',
    title='Test accuracy',
    markers=True,
)
fig.show()

`Wikivitals-fr`

In [49]:
# Mean and standard deviation of the train/test accuracies for every
# (dataset, model, k, train_size, test_size) group. Named aggregation labels
# each output column explicitly instead of renaming columns by position.
grouped = (
    df_tot.groupby(['dataset', 'model', 'k', 'train_size', 'test_size'])
    .agg(
        train_acc_avg=('train_acc', 'mean'),
        train_acc_std=('train_acc', 'std'),
        test_acc_avg=('test_acc', 'mean'),
        test_acc_std=('test_acc', 'std'),
    )
    .reset_index()
)

# Test split: average test accuracy against test size, one line per model,
# with the standard deviation drawn as a shaded band.
fig = line(
    data_frame=grouped[grouped['dataset'] == 'wikivitals-fr'],
    x='test_size',
    y='test_acc_avg',
    error_y='test_acc_std',
    error_y_mode='band',
    color='model',
    title='Test accuracy',
    markers=True,
)
fig.show()

`Wikischools`

In [50]:
# Mean and standard deviation of the train/test accuracies for every
# (dataset, model, k, train_size, test_size) group. Named aggregation labels
# each output column explicitly instead of renaming columns by position.
grouped = (
    df_tot.groupby(['dataset', 'model', 'k', 'train_size', 'test_size'])
    .agg(
        train_acc_avg=('train_acc', 'mean'),
        train_acc_std=('train_acc', 'std'),
        test_acc_avg=('test_acc', 'mean'),
        test_acc_std=('test_acc', 'std'),
    )
    .reset_index()
)

# Test split: average test accuracy against test size, one line per model,
# with the standard deviation drawn as a shaded band.
fig = line(
    data_frame=grouped[grouped['dataset'] == 'wikischools'],
    x='test_size',
    y='test_acc_avg',
    error_y='test_acc_std',
    error_y_mode='band',
    color='model',
    title='Test accuracy',
    markers=True,
)
fig.show()