# Exercise \#3

Sampling biases

## Dependencies

In [None]:
from netin import *
from netin import viz
from netin import sampling
from netin import stats

## Helpers

In [None]:
%load_ext autoreload
%autoreload 2

import helper

``` python
# Updates the name of a graph (by default it is the model name)
helper.update_name_homophily(data: Graph | pd.DataFrame) -> str
```


``` python
# Plots the original graphs (row=0) and the samples (row>=1)
helper.plot_samples(originals: List[netin.Graph], samples: List[List[netin.Graph]], fn: str = None, **kwargs)
```

``` python
# Loads the Facebook friends graph
helper.load_fb_data(path: str = 'data/fb_friends')
``` 

# Directed networks

## Comparing the effects of homophily in sampling
- Create 3 `DPAH` graphs
- Make sure all of them have the same number of nodes `n`, edge density `d`, fraction of minority `f_m`, activities `plo_M` and `plo_m`, and random seed `seed`.
- Make sure they have the same level of homophily within the majority group (e.g., `h_MM=0.5`) and vary only the homophily within the minority group, for example:
  -  Graph 1: `h_MM=0.5` and `h_mm=0.1`
  -  Graph 2: `h_MM=0.5` and `h_mm=0.5`
  -  Graph 3: `h_MM=0.5` and `h_mm=0.9`
- Sample each graph with the 6 sampling techniques from `netin.sampling.*` (one sample per technique and graph)
  - Make sure they all have the same `pseeds` (sample size)
- Plot the graphs and the samples.
- Plot the representation of groups for each sample
  - The CDF of the `in_degree` distribution and the CCDF of the `pagerank` distribution. 
  - Which sample looks closest to the full data? Does it depend on h?

### Bonus: Replicate the same results with an empirical network
- Load the `fb_friends` network.
- Get to know the data (`.info()`)
- Make a sample

### Task 1. Generating graphs

In [None]:
### Fix some parameters to later compare the effects of homophily

n = ...     # number of nodes
d = ...   # edge density
f_m = ...   # fraction of minority
plo_M = ... # power-law of out-degree distribution majority 
plo_m = ... # power-law of out-degree distribution minority
h_MM = ...  # homophily majority
seed = ... # random seed (reproducibility)

In [None]:
### Graphs
### Builds one DPAH graph per minority-homophily value and collects them in `graphs`.

homophily_values = [...]
graphs = []

for h_mm in homophily_values: # homophily within the minority group; all other parameters stay fixed
    # generating graph (same n, d, f_m, h_MM, plo_M, plo_m and seed for every graph)
    g = DPAH(n=n, d=d, f_m=f_m, h_MM=h_MM, h_mm=h_mm, plo_M=plo_M, plo_m=plo_m, seed=seed)
    g.generate()

    # updating name to include homophily values
    # (the helper returns the new name as a string; it is then stored as the model name)
    g.set_model_name(helper.update_name_homophily(g))
    graphs.append(g)
    

### Task 2. Sampling

In [None]:
### Fix the sample size
sample_size = ...

#### 2.1 Random Nodes

In [None]:
### Generating subgraphs
### via sampling

samples_nodes = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed)
    gs.sampling()
    samples_nodes.append(gs.sample)
    

#### 2.2 Random Edges

In [None]:
### Generating subgraphs
### via sampling

samples_edges = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed)
    gs.sampling()
    samples_edges.append(gs.sample)


#### 2.3 Random Neighbor

In [None]:
### Generating subgraphs
### via sampling

samples_neighbor = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed)
    gs.sampling()
    samples_neighbor.append(gs.sample)
    

#### 2.4 Degree Rank (desc)

In [None]:
samples_degree = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed, order='desc')
    gs.sampling()
    samples_degree.append(gs.sample)
    

#### 2.5 Degree Group Rank (desc)

In [None]:
samples_group = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed, order='desc')
    gs.sampling()
    samples_group.append(gs.sample)
    

#### 2.6 Partial Crawls

In [None]:
samples_crawls = []
for g in graphs:
    gs = sampling.<...>(g=g, pseeds=sample_size, random_seed=seed)
    gs.sampling()
    samples_crawls.append(gs.sample)
    

### Task 3. Visualize

In [None]:
### Setting the look & feel
viz.reset_style()
viz.set_paper_style()

In [None]:
### Plotting all graphs and samples at once
### Showing 3 graphs per row

# One inner list per sampling technique, in the order of Task 2;
# each inner list holds one sample per original graph.
samples = [samples_nodes, samples_edges, samples_neighbor, samples_degree, samples_group, samples_crawls]
# The two `...` placeholders are the positional arguments of
# helper.plot_samples(originals, samples, fn=None, **kwargs):
# first the original graphs, then the list of sample lists.
helper.plot_samples(..., 
                    ...,  
                   figsize = (8, 15),
                   edge_width = 0.1,
                   wspace = 0.3,
                   seed=seed,
                   fn = 'plots/3_all_graphs_and_samples.pdf')


### Task 4. Representation

In [None]:
# Collect the original graphs followed by every sample, flattening the
# per-technique lists in `samples` into a single list.
all_graphs = graphs + [s for ss in samples for s in ss]
data = []
for g in all_graphs:
    # Node-metadata dataframe for this graph, requested with graph-level metadata included.
    tmp = g.get_node_metadata_as_dataframe(include_graph_metadata=True, n_jobs=1)
    # Tag the dataframe with the graph's title
    # (presumably used as the panel title by viz.plot_distribution -- TODO confirm).
    tmp.name = helper.get_title_graph(g)
    data.append(tmp)

In [None]:
### Plot CDF of the in_degree distribution

col_name = ...
viz.plot_distribution(data, 
                      col_name=col_name,
                      sharex=False, sharey=True,
                      cell_size=(3,1.8),
                      wspace = 0.15,
                      nc = 3,
                      get_x_y_from_df_fnc=stats.distributions.<...>
                     )

In [None]:
### Plot CCDF of the pagerank distribution

col_name = ...
viz.plot_distribution(data, 
                      col_name=col_name,
                      sharex=False, sharey=True,
                      cell_size=(2.8, 1.8),
                      wspace = 0.15,
                      nc = 3,
                      hue='class_label',
                      get_x_y_from_df_fnc=stats.distributions.<...>
                     )

# Bonus 

In [None]:
gfb = helper.<...>()

In [None]:
sample_size = ...
seed = ...
gs = sampling.<...>(g=gfb, pseeds=sample_size, random_seed=seed)
gs.sampling()

In [None]:
viz.plot_graph([gs.g, gs.sample], share_pos=True)