# Network Analysis


### Import libraries

In [None]:
import networkx as nx
import pandas as pd
import json
import matplotlib.pyplot as plt
import random
import numpy as np

## Individual information on ego-nets

### Load graphs

In [None]:
# Location of the edge-list JSON on the mounted Google Drive
json_file_path = '/content/drive/MyDrive/Colab Notebooks/Network machine learning/Assignments/Project/Datasets/twitch_edges.json'

# Mount Google Drive so that the path above is readable from Colab
from google.colab import drive
drive.mount('/content/drive')

# Parse the whole JSON document into `data`
with open(json_file_path, 'r') as json_file:
    data = json.load(json_file)


In [None]:
def _graph_from_edges(edge_list):
    '''Return an undirected networkx graph containing the given edges.'''
    g = nx.Graph()
    g.add_edges_from(edge_list)
    return g


# One graph per top-level key of `data`, stored under that same key
ego_graphs = {
    ego_node: _graph_from_edges(edge_list)
    for ego_node, edge_list in data.items()
}

### Load labels

In [None]:
# Read the targets table from the mounted Drive and echo it
labels_csv_path = r'/content/drive/MyDrive/Colab Notebooks/Network machine learning/Assignments/Project/Datasets/twitch_target.csv'
labels = pd.read_csv(labels_csv_path)
print(labels)

### Separate indexes based on labels

In [None]:
# Row labels of `labels`, split by the value in the 'target' column
is_label0 = labels['target'] == 0
is_label1 = labels['target'] == 1
index0 = labels.index[is_label0].tolist()
index1 = labels.index[is_label1].tolist()

In [None]:
# Print the size of every ego-net, then the number of ego-nets visited.
i = 0  # stays 0 when `ego_graphs` is empty
for i, (ego_node, graph) in enumerate(ego_graphs.items(), start=1):
    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    print(f"Ego-Network: {ego_node}")
    print(f"Number of nodes: {node_count}")
    print(f"Number of edges: {edge_count}")
    print()
print(i)

### Visualize graphs with different labels

In [None]:
# Select one random graph per class. `index0` / `index1` hold row labels of
# `labels`; `ego_graphs` is keyed by the string form of the same ids.
id0 = random.choice(index0)
id1 = random.choice(index1)
graph0 = ego_graphs[str(id0)]
graph1 = ego_graphs[str(id1)]
# Verify that both graphs have the expected label
print('label of first graph:', labels.loc[id0, 'target'])
print('label of second graph:', labels.loc[id1, 'target'])

# Draw the two graphs side by side. Each graph is drawn on the Axes object
# returned by plt.subplots (via `ax=`) and the title is set on that same
# Axes, instead of switching the "current axes" with plt.subplot(), whose
# interaction with pre-existing Axes depends on the matplotlib version.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
panels = (
    (axes[0], graph0, 'Graph with label 0'),
    (axes[1], graph1, 'Graph with label 1'),
)
for panel_ax, panel_graph, panel_title in panels:
    nx.draw(panel_graph, ax=panel_ax, with_labels=True, node_size=200,
            node_color='lightblue', edge_color='gray')
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
# Node count of `graph0`, the label-0 graph picked in the visualisation cell
graph0.number_of_nodes()

In [None]:
# Per-label (0 / 1) min, max and average of five graph-level metrics.
#
# The metrics are first collected in one row per graph and then aggregated
# with groupby, so min and max are computed independently of each other and
# no sentinel start values or manual running sums are needed.
metric_names = ['nodes', 'edges', 'sparsity', 'diameter', 'density']

per_graph_rows = []
# Iterate over (row label, target); ego_graphs is keyed by str(row label).
for graph_id, lbl in labels['target'].items():
    graph = ego_graphs[str(graph_id)]

    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()
    per_graph_rows.append({
        'Label': lbl,
        'nodes': num_nodes,
        'edges': num_edges,
        # Edges per ordered node pair; requires at least two nodes.
        'sparsity': num_edges / (num_nodes * (num_nodes - 1)),
        # nx.diameter raises on a disconnected graph.
        'diameter': nx.diameter(graph),
        'density': nx.density(graph),
    })

per_graph = pd.DataFrame(per_graph_rows, columns=['Label'] + metric_names)
summary = (
    per_graph.groupby('Label')[metric_names]
    .agg(['min', 'max', 'mean'])
    .reindex([0, 1])  # fixed row order: label 0 first, label 1 second
)

# Same layout as before: one row per label, columns
# 'min <metric>', 'max <metric>', 'average <metric>' for every metric.
stats = pd.DataFrame({'Label': [0, 1]})
for metric in metric_names:
    stats[f'min {metric}'] = summary[(metric, 'min')].to_numpy()
    stats[f'max {metric}'] = summary[(metric, 'max')].to_numpy()
    stats[f'average {metric}'] = summary[(metric, 'mean')].to_numpy()

In [None]:
# Display the per-label summary table built in the previous cell
stats

### Extract Features

In [None]:
from networkx.algorithms.centrality import betweenness
def node_feature_extraction(G):
    '''
    Build a flat feature vector for one graph.

    INPUT:
    G: the graph (must have at least two nodes, otherwise the sparsity
       computation divides by zero)

    OUTPUT:
    features: 1-D array of 9 values, in this order:
      number of nodes, number of edges, sparsity, diameter, density,
      then the degree, clustering coefficient, betweenness centrality and
      eigenvector centrality of the first node returned by each measure
    '''
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()

    # Graph-level part of the vector
    graph_level = np.array([
        n_nodes,
        n_edges,
        n_edges / (n_nodes * (n_nodes - 1)),
        nx.diameter(G),
        nx.density(G),
    ])

    degree_view = G.degree()
    clustering_by_node = nx.clustering(G)
    betweenness_by_node = nx.betweenness_centrality(G)
    eigenvector_by_node = nx.eigenvector_centrality(G)

    # NOTE(review): only the first entry of each per-node measure is kept;
    # the values of all other nodes are discarded. Confirm this is intended
    # (e.g. that the first node is meant to be the ego node).
    first_node_level = np.array([
        next(deg for _, deg in degree_view),
        next(iter(clustering_by_node.values())),
        next(iter(betweenness_by_node.values())),
        next(iter(eigenvector_by_node.values())),
    ])

    return np.concatenate((graph_level, first_node_level), axis=None)

In [None]:
# Assemble the (number of graphs, 9) feature matrix and the target vector.
n_graphs = len(labels)
features = np.zeros((n_graphs, 9))
targets = np.array(labels['target'])

# Row r of `features` comes from the graph stored under key str(r).
# `graph` stays bound to the last graph afterwards; the "Global Properties"
# cell further down reads it.
for row in range(n_graphs):
    graph = ego_graphs[str(row)]
    features[row] = node_feature_extraction(graph)

for arr in (targets, features):
    print(np.shape(arr))

### SVM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import seaborn as sns


def classifier(features, targets, feature_selection, num_features, test_size, seed=0, verbose=False):
    '''
    Train a class-balanced SVC on a stratified train/test split and score it.

    INPUT:
    features: feature matrix, one row per sample
    targets: class labels; also used to stratify the split
    feature_selection: if True, keep the `num_features` columns with the
      highest f_classif score and show a bar plot of all scores
    num_features: k passed to SelectKBest (unused without feature selection)
    test_size: forwarded to train_test_split
    seed: random_state of both the split and the SVC
    verbose: if True, print the weighted F1 score and plot the confusion matrix

    OUTPUT:
    f1: weighted F1 score on the test split
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=test_size, stratify=targets, random_state=seed)

    if not feature_selection:
        # Use every feature as-is
        X_train_selected, X_test_selected = X_train, X_test
    else:
        # Scores are fitted on the training split only, then the same
        # column choice is applied to the test split
        selector = SelectKBest(f_classif, k=num_features)
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_test_selected = selector.transform(X_test)

        # Bar plot of the score of every input feature
        feature_scores = selector.scores_
        plt.figure(figsize=(10, 6))
        plt.bar(range(len(feature_scores)), feature_scores)
        plt.xlabel('Feature Index')
        plt.ylabel('Score')
        plt.title('Feature Scores')
        plt.show()

    # Standardise using statistics of the training split only
    scaler = StandardScaler()
    X_train_norm = scaler.fit_transform(X_train_selected)
    X_test_norm = scaler.transform(X_test_selected)

    clf = svm.SVC(random_state=seed, class_weight='balanced')
    clf.fit(X_train_norm, y_train)
    y_pred = clf.predict(X_test_norm)

    # Evaluate on the held-out split
    cm = metrics.confusion_matrix(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred, average='weighted')

    if not verbose:
        return f1

    print('Weighted F1 score:', f1)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()
    return f1

In [None]:
# Run the SVM pipeline: keep the 4 best-scoring features, hold out 40% of the
# samples for testing, and print/plot the results (verbose=True).
classifier(features, targets, feature_selection=True, num_features=4, 
           test_size=0.4, seed=0, verbose=True)

## Global Properties

In [None]:
# Global properties of `graph`, i.e. whichever graph that name was last
# bound to by an earlier cell.

# Connected components: count them without materialising the list
num_connected_components = sum(1 for _ in nx.connected_components(graph))
print(f"Connected components: {num_connected_components}")

# Sparsity: edges per ordered node pair
num_nodes = graph.number_of_nodes()
num_edges = graph.number_of_edges()
ordered_pairs = num_nodes * (num_nodes - 1)
sparsity = num_edges / ordered_pairs
print(f"Sparsity: {sparsity}")

# Diameter
diameter = nx.diameter(graph)
print(f"Diameter: {diameter}")

# Clustering coefficient of every node
clusters = nx.clustering(graph)
print(f"Clusters: {clusters}")

# Degree distribution: node degrees, largest first
degree_sequence = sorted((deg for _, deg in graph.degree()), reverse=True)
print(f"Degree distribution: {degree_sequence}")

# Spectrum (eigenvalues of the adjacency matrix)
eigenvalues = nx.adjacency_spectrum(graph)
print(f"Spectrum: {eigenvalues}")


# MLP Classifier

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

# Inputs: the feature matrix `features` (one row per graph) and the target
# vector `targets` built in the feature-extraction cells above.

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

# Creating a neural network classifier.
# The estimator is named `mlp_classifier` so that it does not overwrite the
# `classifier()` function defined in the SVM section.
n_epochs = 100
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 100), random_state=42)
class_labels = np.unique(targets)

# Training the classifier on the TRAINING split only. The test split is
# only scored, never fitted, so the test metrics are not contaminated.
# partial_fit performs a single pass over the data per call, which lets us
# record one training loss and one test loss per epoch.
# NOTE(review): unlike fit(), this loop always runs all n_epochs (no
# tolerance-based early stop), and `loss_` is the optimiser's own training
# loss, which may include the regularisation term -- confirm the two curves
# are comparable enough for the intended plot.
training_loss = []
testing_loss = []
for _ in range(n_epochs):
    mlp_classifier.partial_fit(X_train, y_train, classes=class_labels)
    training_loss.append(mlp_classifier.loss_)
    testing_loss.append(
        log_loss(y_test, mlp_classifier.predict_proba(X_test), labels=class_labels))

# Predicting on the training and testing sets
y_train_pred = mlp_classifier.predict(X_train)
y_test_pred = mlp_classifier.predict(X_test)

# Calculating accuracy scores
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

# Plotting the training and testing curves
plt.figure(figsize=(10, 6))
plt.plot(training_loss, label="Training Curve")
plt.plot(testing_loss, label="Testing Curve")
plt.xlabel("Epochs")
plt.ylabel("Loss/Cross-Validation Score")
plt.legend()
plt.title("Training and Testing Curves")
plt.show()


# Neural Network

In [None]:
import tensorflow as tf

# Step 1: Splitting the Data
X = features
y = targets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Setting up the model
# A plain fully connected network on the per-graph feature vectors:
# one hidden ReLU layer followed by a softmax over the two classes.
num_features = X_train.shape[1]
num_classes = 2  # Binary classification

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(num_features,)),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 3: Training the Model
# fit() is called exactly once and its History is kept, so the curves below
# start at the first training epoch and the model is trained for
# `num_epochs` epochs in total.
# NOTE(review): the test split doubles as validation data here.
num_epochs = 10

history = model.fit(X_train, y_train, epochs=num_epochs, batch_size=32, validation_data=(X_test, y_test))

# Get the training and testing accuracy values from the history
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# Generate x-axis values for the accuracy curves
epochs = range(1, num_epochs + 1)

# Plot the training and testing accuracy curves
plt.plot(epochs, train_acc, 'b', label='Training Accuracy')
plt.plot(epochs, val_acc, 'r', label='Testing Accuracy')
plt.title('Training and Testing Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()