In [None]:
### If running this on Google Colab, uncomment and run the following lines.
### They install `netin`, download `helper.py` (imported below), and create the
### `plots/` directory that the plotting cells save their figures into.
#!pip install netin
#!wget -nc https://raw.githubusercontent.com/snma-tutorial/www2023/main/exercises/helper.py
#!mkdir -p plots

# Exercise \#3

Sampling biases

## Dependencies

In [None]:
import netin
from netin import *
from netin import viz
from netin import sampling
from netin import stats

## Helpers

In [None]:
# Reload imported modules automatically, so edits to `helper.py` are picked up live
%load_ext autoreload
%autoreload 2

# Tutorial helper functions used throughout this notebook
import helper

``` python
# Updates the name of a graph (by default it is the model name)
helper.update_name_homophily(data: Graph | pd.DataFrame) -> str
```


``` python
# Plots the original graphs (row=0) and the samples (row>=1)
helper.plot_samples(originals: List[netin.Graph], samples: List[List[netin.Graph]], fn: str = None, **kwargs)
```

``` python
# Loads the Facebook friends graph
helper.load_fb_data(path: str = 'data/fb_friends')
``` 

# Directed networks

## Comparing the effects of homophily in sampling
- Create 3 `DPAH` graphs
- Make sure all of them have the same number of nodes `n`, edge density `d`, fraction of minority `f_m`, activities `plo_M` and `plo_m`, and random seed `seed`.
- Make sure they have the same level of homophily within the majority group (e.g., `h_MM=0.5`) and vary only the homophily within the minority group, for example:
  -  Graph 1: `h_MM=0.5` and `h_mm=0.1`
  -  Graph 2: `h_MM=0.5` and `h_mm=0.5`
  -  Graph 3: `h_MM=0.5` and `h_mm=0.9`
- Make 6 random samples using the sampling techniques from `netin.sampling.*`
  - Make sure they all have the same `pseeds` (sample size)
- Plot the graphs and the samples.
- Plot the representation of groups for each sample
  - The CDF of the `in_degree` distribution and the CCDF of the `pagerank` distribution. 
  - Which sample looks closest to the full data? Does it depend on the minority homophily `h_mm`?

## Bonus: Fit the undirected models to a given real network
- Load the `fb_friends` network, and make sure it is a `netin` graph.
- Get to know the data (`.info()`)
- Fit the `PA`, `PAH`, and the `PATC` models to the graph.
- Visualize the graphs.
- Compare their `degree` and `pagerank` distributions (plot the `pdf` and `cdf`).
- Choose four sampling techniques and extract a sample for each network using the same sample size ``pseeds=0.2``. 
  - What properties were preserved? ``f_m``? ``similarity``?

### Task 1. Generating graphs

In [None]:
### Fix the parameters shared by all graphs, so that only the minority
### homophily (`h_mm`) varies when comparing the effects of homophily

n = ...     # number of nodes
d = ...   # edge density
f_m = ...   # fraction of minority nodes
plo_M = ... # power-law of the out-degree (activity) distribution, majority group
plo_m = ... # power-law of the out-degree (activity) distribution, minority group
h_MM = ...  # homophily within the majority group
seed = ... # random seed (reproducibility); also reused by the sampling and plotting cells

In [None]:
### Graphs: one DPAH graph per minority-homophily value

homophily_values = [...]  # values of h_mm to compare (h_MM stays fixed)
graphs = []

for h_mm in homophily_values: # homophily within the minority group
    # generating graph
    g = DPAH(...)
    g.generate()

    # updating name to include homophily values
    g.set_model_name(helper.update_name_homophily(g))
    graphs.append(g)
    

### Task 2. Sampling

In [None]:
### Fix the sample size (passed as `pseeds` to the sampling methods below)
sample_size = 0.1

#### 2.1 Random Nodes

In [None]:
### Generating subgraphs
### via sampling (one sample per graph, in the same order as `graphs`)

samples_nodes = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=..., random_seed=seed)
    gs.sampling()
    samples_nodes.append(gs.sample)
    

#### 2.2 Random Edges

In [None]:
### Generating subgraphs
### via sampling (one sample per graph, in the same order as `graphs`)

samples_edges = []
for g in graphs:
    gs = sampling.<...>(..., pseeds=sample_size, random_seed=seed)
    gs.sampling()
    samples_edges.append(gs.sample)


#### 2.3 Random Neighbor

In [None]:
### Generating subgraphs
### via sampling (one sample per graph, in the same order as `graphs`)

samples_neighbor = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed)
    gs.sampling()
    samples_neighbor.append(gs.sample)
    

#### 2.4 Degree Rank (desc)

In [None]:
### Generating subgraphs
### via sampling (one sample per graph; the sampler is called with `order='desc'`)

samples_degree = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed, order='desc')
    gs.sampling()
    samples_degree.append(gs.sample)
    

#### 2.5 Degree Group Rank (desc)

In [None]:
### Generating subgraphs
### via sampling (one sample per graph; the sampler is called with `order='desc'`)

samples_group = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed, order='desc')
    gs.sampling()
    samples_group.append(gs.sample)
    

#### 2.6 Partial Crawls

In [None]:
### Generating subgraphs
### via sampling (one sample per graph, in the same order as `graphs`)

samples_crawls = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed)
    gs.sampling()
    samples_crawls.append(gs.sample)
    

### Task 3. Visualize

In [None]:
### Setting the look & feel
viz.reset_style()
viz.set_paper_style()

In [None]:
### Plotting all graphs and samples at once
### Showing 3 graphs per row

# One inner list per sampling method; each inner list follows the order of `graphs`
samples = [samples_nodes, samples_edges, samples_neighbor, samples_degree, samples_group, samples_crawls]
helper.plot_samples(...,
                    ...,
                   figsize = (8, 15),
                   edge_width = 0.1,
                   wspace = 0.3,
                   seed=seed,
                   fn = 'plots/3_all_graphs_and_samples.pdf')


### Task 4. Representation

In [None]:
### Collect the node metadata of every graph: the originals first, then all samples
### (`samples` is the list of per-method sample lists built in Task 3)
all_graphs = graphs + [s for ss in samples for s in ss]
data = []
for g in all_graphs:
    tmp = g.get_node_metadata_as_dataframe(include_graph_metadata=True, n_jobs=1)
    # label each dataframe with the title of its graph
    tmp.name = helper.get_title_graph(g)
    data.append(tmp)

In [None]:
### Plot the CDF of the in_degree

col_name = ...
viz.plot_distribution(..., 
                      col_name=col_name,
                      sharex=False, sharey=True,
                      cell_size=(3,1.8),
                      wspace = 0.15,
                      nc = 3,
                      get_x_y_from_df_fnc=stats.distributions.get_<...>
                     )

In [None]:
### Plot the CCDF of the pagerank for each class

col_name = ...
hue = ...
viz.plot_distribution(data, 
                      col_name=col_name,
                      sharex=False, sharey=True,
                      cell_size=(2.8, 1.8),
                      wspace = 0.15,
                      nc = 3,
                      hue=hue,
                      get_x_y_from_df_fnc=stats.distributions.get_<...>
                     )

# Bonus: Model fitting

In [None]:
### First download the data
### Source:
### Sapiezynski, Piotr; Stopczynski, Arkadiusz; Lassen, David Dreyer; Jørgensen, Sune Lehmann (2019): 
### The Copenhagen Networks Study interaction data. figshare. Dataset. 
### https://doi.org/10.6084/m9.figshare.7267433.v1

!mkdir -p data/fb_friends/ 
!wget -nc https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/13389839/fb_friends.README -P data/fb_friends/
!wget -nc https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/13389320/fb_friends.csv -P data/fb_friends/
!wget -nc https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/13389440/genders.csv -P data/fb_friends/

### Task B1. Load a real-world network

In [None]:
### Load the graph as a `networkx` undirected graph

fb_g_nx = helper.load_<...>()

In [None]:
### Convert the `networkx` undirected graph to a `netin` undirected graph
### (replace each `...` placeholder before running)

fb_g = netin.convert_networkx_to_netin(..., name="fb_friends", class_attribute=...)

### Task B2. Fit the undirected models to this network

In [None]:
### Fit the PA, PAH, and PATC models to the graph `fb_g`

g_pa = PA.fit(fb_g, k=2)
g_pah = PAH.fit(fb_g, k=2)
g_patc = PATC.fit(fb_g, k=2)

In [None]:
### Showing the basic info of the real data

fb_g.info()

In [None]:
### Showing the basic info of the fitted PA graph

g_pa.info()

In [None]:
### Showing the basic info of the fitted PAH graph

g_pah.info()

In [None]:
### Showing the basic info of the fitted PATC graph

g_patc.info()

### Task B3. Visualize them

In [None]:
### Plot the original real graph and its fitted synthetic versions
### (`fb_graphs` is reused by the sampling cells of Task B5)

fb_graphs = [...]
viz.plot_graph(fb_graphs,
               cell_size=2.1,
               wspace=0.05,
               share_pos=False)

### Task B4. Compare their (node) distributions

In [None]:
### Node metadata, one dataframe per graph: real data, PA, PAH, PATC
fb_metadata = [fb_g.get_node_metadata_as_dataframe(), 
               g_pa.get_node_metadata_as_dataframe(),
               g_pah.get_node_metadata_as_dataframe(),
               g_patc.get_node_metadata_as_dataframe()]

In [None]:
### Plot powerlaw fit of degree (PDF)

col_name = ...
kind = ...
viz.plot_powerlaw_fit(fb_metadata, 
                      col_name=col_name, kind=kind, 
                      sharey=True, cell_size=2.5, wspace=0.1)

In [None]:
### Plot powerlaw fit of degree (CDF) for each group

col_name = ...
kind = ...
hue = ...
viz.plot_powerlaw_fit(fb_metadata, 
                      col_name=col_name, kind=kind, ...=hue
                      sharey=True, cell_size=2.5, wspace=0.1)

In [None]:
### Plot powerlaw fit of pagerank (PDF)

col_name = ...
kind = ...
viz.plot_powerlaw_fit(fb_metadata, 
                      col_name=col_name, kind=kind, 
                      sharey=True, cell_size=2.5, wspace=0.1)

In [None]:
### Plot powerlaw fit of pagerank (CDF) for each group

col_name = ...
kind = ...
hue = ...
viz.plot_powerlaw_fit(fb_metadata, 
                      col_name=col_name, kind=kind, ...=hue
                      sharey=True, cell_size=2.5, wspace=0.1)

### Task B5. Compare their samples

In [None]:
### Ignore user warning about "Graph contains more than two classes"
### NOTE: this filter is not message-specific; it silences every `UserWarning` from here on

import warnings
warnings.simplefilter("ignore", UserWarning)

In [None]:
### Create a random sample for each graph and sampling method

fb_samples = []
sample_size = 0.2

for sampling_method in [sampling.<...>, sampling.<...>, sampling.<...>, sampling.<...>]:
    samples = []
    for g in fb_graphs:
        tmp = sampling_method(...)
        tmp.sampling()
        samples.append(tmp.sample)
    fb_samples.append(samples)
    

In [None]:
### Plot the original networks and their samples
### (`seed` is the random seed fixed in Task 1, so that cell must have been run)

helper.plot_samples(fb_graphs, fb_samples,  
                   figsize = (11, 10),
                   edge_width = 0.1,
                   wspace = 0.3,
                   seed=seed,
                   fn = 'plots/3_fb_friends_and_samples.pdf')
