In [None]:
### If running this on Google Colab, uncomment and run the following lines:
### (installs netin, downloads the helper module imported below, and creates
###  the folder the figures are saved to; all three are safe to re-run)
#%pip install netin
#!wget -nc https://raw.githubusercontent.com/snma-tutorial/www2023/main/exercises/helper.py
#!mkdir -p plots

# Exercise \#4

Ranking inequalities

## Dependencies

In [None]:
from netin import ...
from netin import viz
from netin import stats

## Helpers

In [None]:
# Reload imported modules automatically so that edits to helper.py are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Local module (helper.py) providing plot_edge_type_counts and
# update_name_homophily, both used further down in this exercise.
import helper

``` python
# Returns some metadata for each node in a graph as a DataFrame
Graph.get_node_metadata_as_dataframe(self, 
                                     include_graph_metadata=False, 
                                     n_jobs=1) -> pd.DataFrame
```

``` python
# Plots the edge type counts of a single or a list of graphs
helper.plot_edge_type_counts(data: Graph | list[Graph] | set[Graph], **kwargs)
```

``` python
# Updates the name of a graph (by default it is the model name)
helper.update_name_homophily(data: Graph | pd.DataFrame) -> str
```

``` python
# Plots the probability density function
viz.plot_powerlaw_fit(data: pd.DataFrame | list[pd.DataFrame], 
                      col_name: str | list[str], 
                      kind: str,
                      fn: str = None, 
                      **kwargs)
```

``` python
# Plots the inequity of a rank distribution
viz.plot_fraction_of_minority(data: pd.DataFrame | List[pd.DataFrame], 
                              col_name: str | List,
                              fn: str = None, 
                              **kwargs):
```

``` python 
# Plots the inequality of a rank distribution
viz.plot_gini_coefficient(data: pd.DataFrame | List[pd.DataFrame], 
                          col_name: str | List,
                          fn: str = None, 
                          **kwargs):
```

``` python
# Plots the disparity (inequity vs. inequality) of a rank distribution
viz.plot_disparity(data: pd.DataFrame | List[pd.DataFrame], 
                   col_name: str | List, 
                   fn: str = None, 
                   **kwargs):
```

# Directed graphs

## Comparing the effect of homophily in ranking by pagerank
- Create 9 `DPAH` graphs
- Make sure all of them have the same number of nodes `n`, edge density `d`, fraction of minority `f_m`, activities `plo_M=plo_m`, and random seed `seed`.
- Make sure they have different values of homophily `h_MM` and `h_mm` as follows:
    - Graphs 1-3: `h_MM=0.1` and `h_mm` $\in \{0.1, 0.5, 0.9\}$
    - Graphs 4-6: `h_MM=0.5` and `h_mm` $\in \{0.1, 0.5, 0.9\}$
    - Graphs 7-9: `h_MM=0.9` and `h_mm` $\in \{0.1, 0.5, 0.9\}$
- Plot the edge-type counts
- Plot the probability density function of their `pagerank` distributions
- Plot the `inequality` of the `pagerank`
- Plot the `inequity` of the `pagerank`
- Plot the `disparity` of the `pagerank`

### Bonus: Comparing the effect of preferential attachment and homophily in ranking by pagerank
- Generate 3 directed graphs; one for each model: `DPA`, `DH`, and `DPAH`
- Make sure all of them have the same number of nodes `n`, edge density `d`, fraction of minority `f_m`, activities `plo_M=plo_m`, homophily `h_MM` and `h_mm`, and random seed `seed`, if applicable.

### Task 1. Generating graphs

In [None]:
### Fix some parameters to later compare the effects of homophily
### These values are shared by every graph generated in this exercise.

n = ...     # number of nodes
d = ...     # edge density (fraction of all possible directed edges)
            # Hint: Remember that the final number of edges will be: e = d * n * (n-1)
f_m = ...   # fraction of minority group
plo_M = ... # powerlaw out_degree exponent of the majority group (activity)
plo_m = ... # powerlaw out_degree exponent of the minority group (activity)
seed = ...  # random seed (reproducibility)

In [None]:
### Graphs
### One DPAH graph per (h_MM, h_mm) combination; all other parameters stay fixed.
### `graphs` and `metadata` are filled in the same order, so index i in one
### corresponds to index i in the other.

homophily_values = [...]
graphs = []
metadata = []
for h_MM in homophily_values: # homophily within majority nodes
    for h_mm in homophily_values: # homophily within minority nodes

        # generating graph
        g = DPAH(...)
        g.generate()
        
        # updating name to include homophily values
        # (by default the name is just the model name, so all graphs would share it)
        g.set_model_name(helper.update_name_homophily(...)) 
        graphs.append(g)

        # generating node metadata dataframe
        # NOTE(review): include_graph_metadata=True presumably adds graph-level
        # attributes to the frame so the plots can label each panel - confirm.
        df = g.get_node_metadata_as_dataframe(include_graph_metadata=True)
        metadata.append(df)


### Task 2. Getting to know the data

In [None]:
### Setting the look & feel
### Run this once before any plotting cell.
### NOTE(review): reset_style() presumably restores the defaults so that the
### paper style is applied on a clean slate - confirm against the netin docs.

viz.reset_style()
viz.set_paper_style()

In [None]:
### Plotting all graphs at once
### Showing 3 graphs per row
### NOTE: `fn` points into plots/, so that folder presumably has to exist
### before running this cell (see the setup cell at the top).

viz.plot_graph(..., 
               nc = 3, 
               cell_size = 2.0,
               wspace = 0.1,
               fn = 'plots/4_all_graphs.pdf')

### Task 3. Plotting edge-type counts

In [None]:
### Plot edge counts for each graph
### Hint: the first argument is a single graph or a collection of graphs
### (see the signature in the Helpers section).

helper.plot_edge_type_counts(..., 
                             figsize = (12,5),
                             width_bar = 0.08,
                             nc_legend = 3,
                             loc = 'best',
                             fn = 'plots/4_edge_counts.pdf')

### Task 4. Plotting PDFs

In [None]:
### Plot in_degree distribution of the whole graph
### Hint: Check out the dataframe. Which column has the in_degree of the node?

col_name = ...
kind = ...

viz.plot_powerlaw_fit(data = ..,
                      col_name = col_name,
                      kind = kind,
                      sharex = True, 
                      sharey = True,
                      cell_size = (2.5,2.5),
                      wspace = 0.1,
                      loc = 3,
                      nc = 3,
                      fn = f'plots/4_dpah_{col_name}.pdf')

In [None]:
### Plot in_degree distribution of each group
### Hint: Check out the dataframe. Which column has the class of the node?
### M for majority, and m for minority.

hue = 'class_label'

viz.plot_powerlaw_fit(data = ...,
                      col_name = col_name,
                      kind = kind,
                      hue = hue,
                      sharex = True, 
                      sharey = True,
                      cell_size = (2.5,2.5),
                      wspace = 0.1,
                      loc = 1,
                      nc = 3,
                      fontsize = 9,
                      fn = f'plots/4_dpah_{col_name}_groups.pdf')

### Task 5. Plot Inequity

In [None]:
### Plot the inequity of the 'pagerank' distribution (ME: mean error)
### It shows the fraction of minoritiy nodes (y-axis) at each top-k rank (x-axis)
### Then, ME is computed as the difference between the fraction of minority nodes in each top-k 
### and the actual fraction of minorities.

viz.plot_...(..., 
                              col_name=col_name, 
                              sharex=True, sharey=True,
                              cell_size = (2.5,2.5),
                              wspace = 0.1,
                              nc = 3,
                              fn = f'plots/4_dpah_{col_name}_inequality.pdf')

### Task 6. Plot Inequality

In [None]:
### Plot the inequality of the 'pagerank' distribution
### It shows the Gini coefficient in each top-k.
### Also, the global gini refers to the Gini at top-100% 

viz.plot_...(metadata, 
                          col_name = col_name, 
                          sharex = True, sharey = True,
                          nc = 3, 
                          wspace = 0.08, 
                          cell_size = (1.9,2.2),
                          fn = f'plots/4_dpah_{col_name}_inequity.pdf')

### Task 7. Plot Disparity

In [None]:
### Plot the disparity of the 'pagerank' distribution
### It shows the inequity (ME) vs. inequality (Gini)
### Hint: pick the plotting function for disparity from the Helpers section.
### `metadata` is the list of per-graph DataFrames built in Task 1.

viz.plot_...(metadata, 
                   col_name = col_name, 
                   sharex = True, sharey = True,
                   nc = 3, 
                   wspace = 0.08, 
                   cell_size = (1.9,2.2),
                   fn = f'plots/4_dpah_{col_name}_disparity.pdf')

# Bonus: Disentangling the effect of PA and H in ranking disparities

In [None]:
### Parameters
### Homophily values shared by the models that use homophily; the remaining
### parameters (n, d, f_m, plo_M, plo_m, seed) come from Task 1.
h_mm = ...
h_MM = ...

### Graphs
### One graph per model: DPA, DH and DPAH
g_dpa = DPA(...)
g_dpa.generate()

g_dh = DH(...)
g_dh.generate()

g_dpah = DPAH(...)
g_dpah.generate()

### Get node metadata
### Kept in the same order as the graphs above and as the output file name
### (DPA, DH, DPAH), so the panels appear in that order.
metadata = [g_dpa.get_node_metadata_as_dataframe(include_graph_metadata=True),
            g_dh.get_node_metadata_as_dataframe(include_graph_metadata=True),
            g_dpah.get_node_metadata_as_dataframe(include_graph_metadata=True)]

### Visualize
### `col_name` is set here so the plotted column and the file name cannot diverge.
col_name = ...
viz.plot_disparity(..., 
                   col_name = col_name, 
                   sharex = True, sharey = True,
                   nc = 3, 
                   wspace = 0.08, 
                   cell_size = (1.9,2.2),
                   fn = f'plots/4_dpa_dh_dpah_{col_name}_disparity.pdf')