# Assignment

In this assignment, we will use an airport network. While the airport network is directed, we ignore its directionality and treat it as an undirected network.

In [None]:
import pandas as pd
import numpy as np
from scipy import sparse

# Load the node table (one row per node) and the edge list of the network.
node_table = pd.read_csv(
    "https://raw.githubusercontent.com/skojaku/adv-net-sci-course/main/data/airport_network_v2/node_table.csv"
)
edge_table = pd.read_csv(
    "https://raw.githubusercontent.com/skojaku/adv-net-sci-course/main/data/airport_network_v2/edge_table.csv",
    dtype={"src": np.int32, "trg": np.int32},
)
src = edge_table["src"].values
trg = edge_table["trg"].values

# Build the square adjacency matrix; its size is the number of rows in
# node_table, and each (src, trg) pair contributes one unit of weight.
rows, cols = src, trg
nrows = ncols = node_table.shape[0]
A = sparse.csr_matrix(
    (np.ones_like(rows), (rows, cols)),
    shape=(nrows, ncols),
).asfptype()

# Symmetrize and binarize
A = A + A.T
A.data = np.ones_like(A.data)

---

**Question 1: Implement the following function to compute the eigenvector centrality. Then, find the top 10 airports with the highest centrality.**
Hint:
- Use `scipy.sparse.linalg.eigs` to compute the eigenvector. The returned vector might be a complex number. In this case, take its real part by using `np.real`.
- The sign of an eigenvector is not fixed; it is indefinite, meaning that if a matrix $X$ has an eigenvector $v$, then $-v$ is also an eigenvector of $X$ with the same eigenvalue. Since eigencentrality should always be non-negative, flip the sign of the eigenvector if necessary.

In [None]:
def compute_eigencentrality(A):
    """
    Compute the eigenvector centrality of a network.

    The centrality is the eigenvector of ``A`` associated with the eigenvalue
    of largest real part, with its sign fixed so that the entries are
    non-negative.

    Parameters
    ----------
    A : scipy.sparse.csr_matrix
      Adjacency matrix of the network.

    Returns
    -------
    ec : numpy.ndarray
      Eigenvector centrality of the network. Real-valued, non-negative,
      with one entry per node.
    """
    # Imported locally so the function does not depend on
    # `scipy.sparse.linalg` having been loaded by `from scipy import sparse`.
    from scipy.sparse.linalg import eigs

    n_nodes = A.shape[0]
    if n_nodes == 0:
        return np.zeros(0)

    if n_nodes < 3:
        # ARPACK requires k < N - 1, so tiny networks are solved densely.
        dense = A.toarray() if sparse.issparse(A) else np.asarray(A)
        vals, vecs = np.linalg.eig(dense.astype(float))
        leading = vecs[:, np.argmax(np.real(vals))]
    else:
        # which="LR" (largest real part) rather than the default "LM":
        # in a bipartite network -lambda_max has the same magnitude as
        # lambda_max, and "LM" could return its mixed-sign eigenvector.
        _, vecs = eigs(A.astype(float), k=1, which="LR")
        leading = vecs[:, 0]

    # The solver returns a complex array; only the real part is meaningful.
    ec = np.real(leading)

    # An eigenvector is defined up to sign; pick the non-negative orientation.
    if ec.sum() < 0:
        ec = -ec

    # Remove tiny negative round-off so that the result is exactly >= 0.
    return np.clip(ec, 0, None)

In [None]:
# Test
def test_compute_eigencentrality():
    """Check the size, sign and realness of the centrality computed for `A`."""
    centrality = compute_eigencentrality(A)
    assert A.shape[0] == len(centrality)
    assert np.all(centrality >= 0)
    assert not np.any(np.iscomplex(centrality))


test_compute_eigencentrality()

In [None]:
# Take the ten node ids with the largest centrality (largest first) and
# display their rows of the node table (rows appear in table order).
ec = compute_eigencentrality(A)
top_node_ids = np.argsort(ec)[-10:][::-1]
node_table[node_table["node_id"].isin(top_node_ids)]

---
**Question 2: Implement the following function to embed a network by using the spectral embedding of the adjacency matrix. The embedding vectors should be associated with the largest eigenvalues in magnitude. Then, draw the embedding based on the second and third principal eigenvectors. Color the nodes by the `region` of the node.**

In [None]:
def spectral_embedding(A, dim):
    """Spectral embedding based on the adjacency matrix.

    The embedding consists of the ``dim`` eigenvectors of ``A`` whose
    eigenvalues are the largest in magnitude, ordered from the largest
    magnitude to the smallest. ``A`` must be symmetric: a symmetric
    eigen-solver is used, so the result is real-valued.

    Parameters
    ----------
    A : scipy.sparse.csr_matrix
      Adjacency matrix of the network.
    dim : int
      Dimension of the embedding.

    Returns
    -------
    X : numpy.ndarray
      Spectral embedding. The shape is (n_nodes, dim).
    lams: numpy.ndarray
      Eigenvalues associated with each column of X. The shape is (dim,).

    Raises
    ------
    ValueError
      If ``A`` is not square or ``dim`` is outside ``1..n_nodes``.
    """
    # Imported locally so the function does not depend on
    # `scipy.sparse.linalg` having been loaded by `from scipy import sparse`.
    from scipy.sparse.linalg import eigsh

    n_nodes = A.shape[0]
    if A.shape[0] != A.shape[1]:
        raise ValueError("A must be a square matrix.")
    if not 1 <= dim <= n_nodes:
        raise ValueError(f"dim must be between 1 and {n_nodes}, got {dim}.")

    if dim < n_nodes:
        # ARPACK needs a floating-point matrix and k < N.
        lams, X = eigsh(A.astype(float), k=dim, which="LM")
    else:
        # Full spectrum requested: fall back to the dense solver.
        dense = A.toarray() if sparse.issparse(A) else np.asarray(A)
        lams, X = np.linalg.eigh(dense.astype(float))

    # Solvers return eigenvalues in ascending algebraic order; reorder so
    # that column 0 is the principal (largest-magnitude) eigenvector.
    order = np.argsort(-np.abs(lams), kind="stable")
    return X[:, order], lams[order]

In [None]:
# Draw the plot

---
**Question 3: Implement the following function that computes the reconstruction errors for dimensions k = 1, 2, ..., 300. Draw a line plot for the reconstruction error as a function of the dimensions.**
Hint:
- If $A$ is symmetric, $A$ can be approximately reconstructed by $A \approx U \Lambda U^\top$, where $U$ is an $N\times K$ matrix of eigenvectors and $\Lambda$ is the $K\times K$ diagonal matrix of the corresponding eigenvalues, for a network of $N$ nodes and a $K$-dimensional embedding.
- However, reconstructing $A$ for every $k$ is time-consuming. You might want to leverage the relationship between the eigenvalues and the reconstruction errors.

In [None]:
def compute_reconstruction_error(A, kmax=300):
    """Compute reconstruction error for dimensions k = 1, 2, ..., kmax

    The error for dimension ``k`` is the squared Frobenius norm of
    ``A - U_k Lambda_k U_k^T``, where ``U_k`` and ``Lambda_k`` hold the ``k``
    eigenpairs of largest magnitude. For a symmetric ``A`` this equals the
    sum of squared entries of ``A`` minus the sum of the ``k`` largest
    squared eigenvalues, so no reconstruction is ever materialized.
    ``A`` must be symmetric.

    Parameter:
    -----------
    A: scipy.sparse.csr_matrix
      Adjacency matrix
    kmax: int
      Maximum dimension

    Returns:
    --------
    error: np.ndarray
      Reconstruction error for k = 1, 2, ..., kmax. The shape is (kmax,).
      Entries for k larger than the number of nodes are zero.

    Raises:
    -------
    ValueError
      If ``kmax`` is smaller than 1.
    """
    # Imported locally so the function does not depend on
    # `scipy.sparse.linalg` having been loaded by `from scipy import sparse`.
    from scipy.sparse.linalg import eigsh

    if kmax < 1:
        raise ValueError(f"kmax must be at least 1, got {kmax}.")

    n_nodes = A.shape[0]
    n_eigs = min(kmax, n_nodes)  # a matrix has at most n_nodes eigenvalues

    if n_eigs < n_nodes:
        lams = eigsh(
            A.astype(float), k=n_eigs, which="LM", return_eigenvectors=False
        )
    else:
        # ARPACK needs k < N; the full spectrum is computed densely.
        dense = A.toarray() if sparse.issparse(A) else np.asarray(A)
        lams = np.linalg.eigvalsh(dense.astype(float))

    # Squared eigenvalues, largest first; dimensions beyond n_nodes add 0.
    captured = np.zeros(kmax)
    captured[:n_eigs] = np.sort(lams ** 2)[::-1][:n_eigs]

    # Squared Frobenius norm of A = sum of all squared eigenvalues.
    if sparse.issparse(A):
        total = float(A.multiply(A).sum())
    else:
        total = float(np.sum(np.square(np.asarray(A, dtype=float))))

    error = total - np.cumsum(captured)
    # Guard against tiny negative values caused by floating-point round-off.
    return np.clip(error, 0, None)


def test_compute_reconstruction_error():
    """Sanity-check `compute_reconstruction_error` on the global network `A`."""
    error = compute_reconstruction_error(A, kmax=30)

    # Shape, sign and monotonicity of the error curve.
    assert error.shape[0] == 30, "error should have the size of kmax"
    assert np.all(error >= 0), "error should be positive"
    assert np.all(error[:-1] >= error[1:]), "error should be decreasing"

    # Each drop between consecutive errors must cancel a squared eigenvalue
    # (the largest one is skipped: it has no preceding error value).
    eigvals, _ = sparse.linalg.eigs(A, k=30)
    squared_desc = np.sort(eigvals * eigvals)[::-1]
    residual = squared_desc[1:] + np.diff(error)
    assert np.isclose(
        residual, 0, atol=1e-2
    ).all(), "error difference should be the same as the squared eigenvalues."


# Code test
test_compute_reconstruction_error()

In [None]:
# And draw the plot

---
**Question 4: Suppose a classification problem in which we want to classify airports into regions based on the network structure. Construct a classifier that takes a network embedding of airports and predicts its region.**

First, we will use 10% of the data for training and 90% of the data for evaluating the performance.
(notice that we only know a handful of regional 'labels' and want to predict the region of most airports!!!!)

In [None]:
# Shuffle the node table with a fixed seed, then use the first 10% of the
# shuffled rows as the train set and the remaining rows as the test set.
df = node_table.sample(frac=1, random_state=0)
n_train = int(len(df) * 0.1)
train_node_table, test_node_table = df.iloc[:n_train], df.iloc[n_train:]

Then, generate the embedding.

In [None]:
# Embed every node into 30 dimensions.
emb, lams = spectral_embedding(A, dim=30)
# The solver may hand back a complex-typed array; keep the real part only.
emb = emb.real

We will evaluate the classification performance by the accuracy:

In [None]:
def eval_prediction_accuracy(y, yred):
    """Calculate prediction accuracy.

    Parameters
    ----------
    y : array-like
      True labels.
    yred : array-like
      Predicted labels, aligned element-wise with ``y``.

    Returns
    -------
    acc : float
      Fraction of positions at which ``y`` and ``yred`` agree.
      ``nan`` if the inputs are empty.

    Raises
    ------
    ValueError
      If ``y`` and ``yred`` do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise comparison.
    y_true = np.asarray(y)
    y_hat = np.asarray(yred)

    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y and yred must have the same shape, got {y_true.shape} and {y_hat.shape}."
        )
    if y_true.size == 0:
        return float("nan")

    return float(np.mean(y_true == y_hat))

Implement the functions `fit` and `predict` in the following Python class.

In [None]:
class MyClassifier:
    """A model for the node classification task.

    This implementation is a k-nearest-neighbour classifier written with
    NumPy only: a node is assigned the most frequent label among the
    ``n_neighbors`` training nodes closest to it in Euclidean distance.
    Ties are resolved in favour of the label that sorts first.

    You can use any classifier you want. For instance,
    - sklearn.discriminant_analysis.LinearDiscriminantAnalysis,
    - sklearn.svm.SVC,
    - sklearn.linear_model.LogisticRegression,
    - sklearn.ensemble.RandomForestClassifier, etc.
    """

    def __init__(self, n_neighbors=5):
        """Create an unfitted classifier.

        Parameters
        ----------
        n_neighbors : int
          Number of nearest training nodes that vote on each prediction.
          It is capped at the number of training nodes at prediction time.
        """
        if n_neighbors < 1:
            raise ValueError(f"n_neighbors must be at least 1, got {n_neighbors}.")
        self.n_neighbors = n_neighbors
        # Set by `fit`: sorted unique labels, training embedding, and the
        # index into `classes_` of every training label.
        self.classes_ = None
        self._train_X = None
        self._train_codes = None

    def fit(self, X, y):
        """Fit the classifier.

        Parameters
        ----------
        X : numpy.ndarray
          The shape is (n_train_nodes, dim).
        y : numpy.ndarray
          The shape is (n_train_nodes,).

        Returns
        -------
        self : MyClassifier
          The fitted classifier.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-dimensional array.")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-dimensional with one label per row of X.")
        if X.shape[0] == 0:
            raise ValueError("At least one training node is required.")

        # Encode labels as integers so that votes can be counted in an array.
        self.classes_, codes = np.unique(y, return_inverse=True)
        self._train_codes = np.ravel(codes)
        self._train_X = X
        return self

    def predict(self, X):
        """Predict the class of nodes.

        Parameters
        ----------
        X : numpy.ndarray
          The shape is (n_test_nodes, dim).

        Returns
        -------
        y : numpy.ndarray
          The shape is (n_test_nodes,).
        """
        if self._train_X is None:
            raise RuntimeError("fit must be called before predict.")
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self._train_X.shape[1]:
            raise ValueError(
                "X must be 2-dimensional with the same number of columns as the training data."
            )

        n_test = X.shape[0]
        k = min(self.n_neighbors, self._train_X.shape[0])

        # Squared Euclidean distances via ||a||^2 - 2 a.b + ||b||^2; the
        # square root is skipped because it does not change the ranking.
        sq_dist = (
            np.sum(X ** 2, axis=1)[:, None]
            - 2.0 * (X @ self._train_X.T)
            + np.sum(self._train_X ** 2, axis=1)[None, :]
        )

        # Indices of the k closest training nodes of every test node.
        nearest = np.argpartition(sq_dist, k - 1, axis=1)[:, :k]

        # Count the votes per (test node, class) pair.
        votes = np.zeros((n_test, len(self.classes_)), dtype=int)
        voter_rows = np.repeat(np.arange(n_test), k)
        np.add.at(votes, (voter_rows, self._train_codes[nearest].ravel()), 1)

        return self.classes_[np.argmax(votes, axis=1)]

In [None]:
# Train
clf = MyClassifier()
clf.fit(emb[train_node_table["node_id"].values], train_node_table["region"].values)

# Predict
ypred = clf.predict(emb[test_node_table["node_id"].values])

# Evaluation: true labels first, predictions second, matching the signature
# of eval_prediction_accuracy.
accuracy = eval_prediction_accuracy(test_node_table["region"].values, ypred)

print(f"Accuracy: {accuracy:.3f}")

---
**Question 5: Draw a line plot for the accuracy as a function of the embedding dimension, with the maximum dimension being 100.**

---
**Question 6: Why did the accuracy not improve as the embedding dimension increases? Answer in less than 10 words.**

---
**Question 7: Implement the spectral embedding based on the normalized Laplacian. Do not include the trivial eigenvector associated with the eigenvalue of zero.**

In [None]:
def spectral_embedding_normalized_laplacian(A, dim):
    """Perform the spectral embedding based on the normalized Laplacian matrix.

    The normalized Laplacian is ``L = I - D^{-1/2} A D^{-1/2}``, where ``D``
    is the diagonal matrix of node degrees. The embedding consists of the
    eigenvectors of ``L`` with the smallest eigenvalues, excluding the first
    one (the trivial eigenvector, whose eigenvalue is zero). ``A`` must be
    symmetric. Nodes with zero degree get a zero row in the normalized
    adjacency matrix instead of a division by zero.

    NOTE: if the network has several connected components, the eigenvalue
    zero is repeated and only one vector of that eigenspace is removed.

    Parameters
    ----------
    A : scipy.sparse.csr_matrix
      The adjacency matrix of the network.
    dim : int
      Embedding dimension.

    Returns
    -------
    X : numpy.ndarray
      Spectral embedding. The shape is (n_nodes, dim).
    lams : numpy.ndarray
      Eigenvalues of the normalized Laplacian associated with each column
      of X, in ascending order. The shape is (dim,).

    Raises
    ------
    ValueError
      If ``A`` is not square or ``dim`` is outside ``1..n_nodes - 1``.
    """
    # Imported locally so the function does not depend on
    # `scipy.sparse.linalg` having been loaded by `from scipy import sparse`.
    from scipy.sparse.linalg import eigsh

    A = sparse.csr_matrix(A, dtype=float)
    n_nodes = A.shape[0]
    if A.shape[0] != A.shape[1]:
        raise ValueError("A must be a square matrix.")
    if not 1 <= dim <= n_nodes - 1:
        raise ValueError(f"dim must be between 1 and {n_nodes - 1}, got {dim}.")

    # D^{-1/2}, with 0 in place of 1/sqrt(0) for isolated nodes.
    deg = np.asarray(A.sum(axis=1)).ravel()
    inv_sqrt_deg = np.zeros(n_nodes)
    has_edges = deg > 0
    inv_sqrt_deg[has_edges] = 1.0 / np.sqrt(deg[has_edges])
    D_inv_sqrt = sparse.diags(inv_sqrt_deg)
    A_norm = D_inv_sqrt @ A @ D_inv_sqrt

    # The smallest eigenvalues of L = I - A_norm are the largest (algebraic)
    # eigenvalues of A_norm, which ARPACK finds far more reliably than
    # eigenvalues near zero. One extra pair is requested for the trivial one.
    n_pairs = dim + 1
    if n_pairs < n_nodes:
        mu, vecs = eigsh(A_norm, k=n_pairs, which="LA")
    else:
        # ARPACK needs k < N; the full spectrum is computed densely.
        mu, vecs = np.linalg.eigh(A_norm.toarray())

    # Descending mu == ascending Laplacian eigenvalue; position 0 is trivial.
    order = np.argsort(-mu, kind="stable")
    keep = order[1:n_pairs]
    X = vecs[:, keep]
    lams = 1.0 - mu[keep]
    return X, lams

In [None]:
# Evaluate your classifier.
emb, lams = spectral_embedding_normalized_laplacian(A, dim=100)

# Train on the labelled 10% and predict the region of the held-out nodes.
clf = MyClassifier()
clf.fit(emb[train_node_table["node_id"].values], train_node_table["region"].values)
ypred = clf.predict(emb[test_node_table["node_id"].values])

# True labels first, predictions second, matching the signature of
# eval_prediction_accuracy.
accuracy = eval_prediction_accuracy(test_node_table["region"].values, ypred)
print(f"Accuracy: {accuracy:.3f}")