In [None]:
# !pip install datasets
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
import tensorflow as tf
from keras import layers, models
from keras.utils import to_categorical
import traceback

In [None]:
import os
import glob

def clear_temp_files():
    """
    Delete every file in the working directory matching 'temp_file_*.csv'.

    One line is printed per file: a confirmation when the removal succeeds,
    or an error message when it raises. Failures are reported and never
    propagated to the caller.
    """
    for stale_path in glob.glob('temp_file_*.csv'):
        try:
            os.remove(stale_path)
            print(f"Removed {stale_path}")
        except Exception as err:
            # Best effort: report the problem and carry on with the next file.
            print(f"Error occurred while deleting file {stale_path}: {err}")

# Remove temporary downloads left behind by earlier runs.
clear_temp_files()


Removed temp_file_cae70c31-93b7-469b-a997-e8a478de75f5.csv
Removed temp_file_9d310ceb-710f-43f8-a0b8-a5371670d37d.csv
Removed temp_file_c1bc8f24-4176-4c21-ba81-271e29e73191.csv
Removed temp_file_5a0e633f-3403-4dbb-b21d-a2501e54b8fe.csv


In [None]:
import requests
import uuid

def get_download_url(file_path_or_url):
    """
    Resolve a dataset location to something that can be opened locally.

    Inputs that do not contain 'drive.google.com' are returned unchanged.
    Google Drive share links are downloaded into a uniquely named
    'temp_file_<uuid>.csv' in the working directory (the '.csv' suffix is
    used regardless of the actual content) and that file name is returned.

    Parameters:
    - file_path_or_url: Local path, URL, or Google Drive share link

    Returns:
    - The unchanged input, the name of the downloaded temporary file, or
      None when the Drive response has no 'content-disposition' header
    """
    if 'drive.google.com' not in file_path_or_url:
        return file_path_or_url

    # The file ID is taken as the second-to-last path segment of the link.
    file_id = file_path_or_url.split('/')[-2]
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    print(f"Downloading from {download_url}")

    # The session and every streamed response are closed explicitly so that
    # pooled connections are released even when the download fails midway.
    with requests.Session() as session:
        response = session.get(download_url, stream=True)
        try:
            # A 'download_warning*' cookie carries the confirmation token.
            token = next(
                (value for key, value in response.cookies.items()
                 if key.startswith('download_warning')),
                None,
            )

            # If there's a confirmation token, repeat the request including it.
            if token:
                response.close()
                params = {'id': file_id, 'confirm': token}
                response = session.get(download_url, params=params, stream=True)

            # Without a content-disposition header no file was served.
            if 'content-disposition' not in response.headers:
                print("Failed to download the file from Google Drive.")
                return None

            unique_file_name = f'temp_file_{uuid.uuid4()}.csv'
            with open(unique_file_name, 'wb') as f:
                for chunk in response.iter_content(32768):
                    if chunk:  # Filter out keep-alive new chunks
                        f.write(chunk)
            return unique_file_name
        finally:
            response.close()


In [None]:
import numpy as np

def calculate_homophily(adjacency_matrix, Y):
    """
    Compute the weighted share of edges that join nodes with equal labels.

    Works for dense arrays and for SciPy sparse matrices (the type returned
    by nx.adjacency_matrix). Labels are compared by their string form and
    are matched to matrix rows by position.

    Parameters:
    - adjacency_matrix: Square adjacency matrix, dense or sparse
    - Y: Node labels, one entry per matrix row

    Returns:
    - Homophily as a percentage rounded to two decimals, or 0 when the
      matrix entries sum to zero
    """
    # Local import keeps this cell independent of imports made in other cells.
    from scipy import sparse

    # Print the shape of the label container (debugging information)
    print(np.shape(Y))

    total_edge_count = np.sum(adjacency_matrix)

    # Avoid division by zero for graphs without edges
    if total_edge_count == 0:
        return 0

    # COO form exposes (row, col, weight) triplets for dense and sparse input
    # alike, so no element-wise indexing of the matrix is needed.
    triplets = sparse.coo_matrix(adjacency_matrix)
    positive = triplets.data > 0
    rows = triplets.row[positive]
    cols = triplets.col[positive]
    weights = triplets.data[positive]

    # Compare labels through their string representation, position by position.
    labels = np.array([str(label) for label in np.asarray(Y)], dtype=object)
    same_label = labels[rows] == labels[cols]
    same_attribute_count = weights[same_label].sum()

    # Ratio of same-label edge weight to total edge weight, as a percentage
    homophily = same_attribute_count / total_edge_count
    return round(homophily * 100, 2)


In [None]:
import pandas as pd
import networkx as nx
import numpy as np

def handle_datasets1(file_path_or_url, dataset):
    """
    Load a tab-separated, header-less node table and its edge list.

    The first column is used as node identifier, the last column as label
    and everything in between as features. Rows are shuffled (unseeded)
    before anything is extracted.

    Parameters:
    - file_path_or_url: Location of the node table
    - dataset: Key into the global `datasets` dictionary; its 'edges' entry
      is handed to add_edges

    Returns:
    - (X, Y, identifiers, adjacency_matrix)
    """
    table = pd.read_csv(file_path_or_url, sep='\t', header=None, low_memory=False, encoding='unicode_escape')

    # Randomize the row order
    table = table.sample(frac=1).reset_index(drop=True)

    identifiers = table.iloc[:, 0]

    # One graph node per row, named by the stripped string form of its identifier
    node_graph = nx.Graph()
    node_graph.add_nodes_from(str(identifier).strip() for identifier in identifiers)

    # Feature block with NaN replaced by 0; labels come from the last column
    features = np.nan_to_num(table.iloc[:, 1:-1], nan=0)
    labels = table.iloc[:, -1]

    # add_edges fills in the edges and returns the adjacency matrix
    adjacency = add_edges(dataset, datasets[dataset]['edges'], node_graph)

    return features, labels, identifiers, adjacency


In [None]:
import pandas as pd
import networkx as nx
import numpy as np

def handle_datasets2(file_path_or_url, dataset):
    """
    Load a CSV node table with 'MedianIncome', 'DEM' and 'GOP' columns.

    Rows containing NaN are dropped, the remaining rows are shuffled
    (unseeded), 'MedianIncome' is converted to float, and each row is
    labelled through compare_and_assign.

    Parameters:
    - file_path_or_url: Location of the CSV file
    - dataset: Key into the global `datasets` dictionary; its 'edges' entry
      is handed to add_edges

    Returns:
    - (X, Y, identifiers, adjacency_matrix), where X holds the columns from
      the fifth one onwards with NaN replaced by 0
    """
    frame = pd.read_csv(file_path_or_url, low_memory=False).dropna()

    # Show which columns were read
    print(frame.columns)

    # Randomize the row order
    frame = frame.sample(frac=1).reset_index(drop=True)

    # 'MedianIncome' arrives as text with thousands separators
    frame['MedianIncome'] = frame.apply(string_to_float, axis=1)

    identifiers = frame.iloc[:, 0]

    # One graph node per row, named by the stripped string form of its identifier
    county_graph = nx.Graph()
    county_graph.add_nodes_from(str(identifier).strip() for identifier in identifiers)

    # Features are taken before the 'label' column is appended below
    features = np.nan_to_num(frame.iloc[:, 4:], nan=0)

    frame['label'] = frame.apply(compare_and_assign, axis=1)
    labels = frame.loc[:, 'label']

    # Rows that could not be labelled show up as NA
    print('NA values: ', frame['label'].isna().sum())

    # add_edges fills in the edges and returns the adjacency matrix
    adjacency = add_edges(dataset, datasets[dataset]['edges'], county_graph)

    return features, labels, identifiers, adjacency

def compare_and_assign(row):
    """
    Label a row by whichever of its 'DEM' and 'GOP' values is larger.

    Returns 'DEM' or 'GOP'; returns pd.NA when either value is missing or
    when the two values are equal.
    """
    if pd.isna(row['DEM']) or pd.isna(row['GOP']):
        return pd.NA
    if row['DEM'] > row['GOP']:
        return 'DEM'
    if row['GOP'] > row['DEM']:
        return 'GOP'
    # Exact tie: no label
    return pd.NA

def string_to_float(row):
    """Return the row's 'MedianIncome' string as a float, ignoring commas."""
    income_text = row['MedianIncome']
    return float(income_text.replace(',', ''))


In [None]:
import sys
import numpy as np
import scipy.sparse as sp
import networkx as nx
import pickle as pkl
import random

def parse_index_file(filename):
    """
    Parse an index file containing one integer per line.

    Blank lines are skipped. The file is closed before returning.

    Parameters:
    - filename: Path of the text file to read

    Returns:
    - List of the integers in file order

    Raises:
    - ValueError: if a non-blank line is not an integer
    """
    with open(filename) as index_file:
        stripped_lines = (line.strip() for line in index_file)
        return [int(text) for text in stripped_lines if text]

def handle_datasets3(dataset, dataset_name=None):
    """
    Load a dataset stored as pickled 'tx', 'ty', 'allx', 'ally' and 'graph'.

    Features and labels are stacked as (allx, tx) and (ally, ty), the graph
    is built from a dict of lists, and all three are shuffled with one
    common permutation so that row k of the features, labels, identifiers
    and adjacency matrix refer to the same node.

    Parameters:
    - dataset: Key into the global `datasets` dictionary
    - dataset_name: Optional key that takes precedence over `dataset`.
      load_dataset calls every handler as handler(file_path_or_url, dataset),
      so the key arrives here when this function is used as a handler.

    Returns:
    - (features, labels, identifiers, adjacency_matrix); labels are the
      per-row argmax column of the stacked label matrix

    Raises:
    - FileNotFoundError: if one of the component files cannot be downloaded
    """
    key = dataset if dataset_name is None else dataset_name

    # Names of the pickled components that make up the dataset
    names = ['tx', 'ty', 'allx', 'ally', 'graph']

    objects = []
    for name in names:
        location = datasets[key][name]

        # Resolve Google Drive links to a local file
        file_path = get_download_url(location)
        if file_path is None:
            raise FileNotFoundError(f"Failed to download the dataset: {location}")

        with open(file_path, 'rb') as f:
            # NOTE(review): unpickling runs arbitrary code from the file;
            # only use this loader with trusted sources.
            objects.append(pkl.load(f, encoding='latin1'))

    tx, ty, allx, ally, graph = objects

    # Stack feature matrices vertically and convert to List of Lists (lil) format
    features = sp.vstack((allx, tx)).tolil()

    # Stack label matrices vertically
    labels = np.vstack((ally, ty))

    # Create a NetworkX graph from the dict-of-lists representation
    graph = nx.from_dict_of_lists(graph)

    # Draw one random permutation of the node positions
    nodes = list(graph.nodes())
    indices = list(range(len(nodes)))
    random.shuffle(indices)

    # Apply the permutation to features and labels
    features = features[indices, :]
    labels = labels[indices, :]

    # Order the adjacency matrix with the same permutation. Relabelling the
    # nodes would leave the matrix rows in their original order and thereby
    # misalign them with the shuffled features.
    # assumes feature row i belongs to the i-th node of the graph - TODO confirm
    shuffled_nodes = [nodes[i] for i in indices]
    adj = nx.adjacency_matrix(graph, nodelist=shuffled_nodes)

    return features, pd.DataFrame(labels).idxmax(axis=1), shuffled_nodes, adj


In [None]:
def plot_edges(adj_matrix):
    """
    Plot a histogram of the edge weights of an adjacency matrix.

    Only the entries strictly above the diagonal are used. The y-axis is
    logarithmic.

    Parameters:
    - adj_matrix: Square adjacency matrix, dense (array-like) or SciPy sparse
    """
    # Sparse matrices must be densified first; np.array() alone would wrap
    # them in a 0-d object array without a usable shape.
    if hasattr(adj_matrix, 'toarray'):
        adj_matrix = adj_matrix.toarray()
    adj_matrix = np.array(adj_matrix)

    # Extract the edge weights (upper triangle, excluding the diagonal)
    edge_weights = adj_matrix[np.triu_indices(adj_matrix.shape[0], 1)]

    # Rule of thumb: square root of the number of data points, but never
    # fewer than one bin (a zero bin count makes the histogram call fail).
    num_bins = max(1, int(np.sqrt(len(edge_weights))))

    # Plot the distribution of edge weights with a logarithmic y-axis
    plt.figure(figsize=(12, 7))
    plt.hist(edge_weights, bins=num_bins, color='blue', edgecolor='black', log=True)
    plt.title('Distribution of Edge Weights in Adjacency Matrix (Log Scale)')
    plt.xlabel('Edge Weight')
    plt.ylabel('Frequency (Log Scale)')
    plt.grid(True)
    plt.show()



In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

def create_adjacency_matrix(X, threshold=0.5):
    """
    Build a weighted adjacency matrix from pairwise cosine similarity.

    Parameters:
    - X: Feature vectors, one row per node
    - threshold: Similarities not strictly above this value become 0

    Returns:
    - Square matrix holding the similarity where it exceeds the threshold
      and 0 elsewhere (diagonal entries are not treated specially)
    """
    similarities = cosine_similarity(np.array(X))
    above_threshold = similarities > threshold
    return np.where(above_threshold, similarities, 0)

def read_dataset(file_path_function, dataset_key):
    """
    Read a CSV whose location is produced by a resolver function.

    Parameters:
    - file_path_function: Callable mapping `dataset_key` to a readable path
    - dataset_key: Value handed to `file_path_function`

    Returns:
    - DataFrame read from the resolved path
    """
    return pd.read_csv(file_path_function(dataset_key), low_memory=False)

def visualise_better(adjacency_matrix):
    """
    Draw the graph described by an adjacency matrix.

    Nodes are placed with the Kamada-Kawai layout and sized in proportion to
    their degree (the highest-degree node gets size 1000).

    Parameters:
    - adjacency_matrix: Input accepted by the nx.Graph constructor
    """
    graph = nx.Graph(adjacency_matrix)
    layout = nx.kamada_kawai_layout(graph)

    # Scale node sizes relative to the largest degree in the graph
    degree_values = np.array([degree for _, degree in graph.degree()])
    relative_degree = degree_values / np.max(degree_values)
    sizes = relative_degree * 1000

    nx.draw(
        graph,
        layout,
        node_size=sizes,
        node_color='skyblue',
        font_size=8,
        font_color='black',
        font_weight='bold',
        with_labels=True,
        alpha=0.7,
    )

    plt.title("Improved Graph Visualization")
    plt.show()

def concatenate_datasets(dataset, keys):
    """
    Read several CSV files of one dataset entry and stack them row-wise.

    Parameters:
    - dataset: Mapping from key to file location
    - keys: Keys to read, in the order the frames are concatenated

    Returns:
    - Single DataFrame with a fresh 0..n-1 index
    """
    frames = []
    for key in keys:
        # get_download_url resolves each location before it is read
        frames.append(read_dataset(get_download_url, dataset[key]))
    return pd.concat(frames, ignore_index=True)

def handle_datasets4(file_path_or_url, datasetname):
    """
    Load a dataset made of 'train'/'dev' tables and matching embedding tables.

    The adjacency matrix is derived from the embeddings through
    create_adjacency_matrix, and the resulting graph is drawn as a side
    effect.

    Parameters:
    - file_path_or_url: Not used; present so the function can be called like
      the other handlers
    - datasetname: Key into the global `datasets` dictionary

    Returns:
    - (X, Y, identifiers, adjacency_matrix), where X stacks the parsed
      'embeddings' column, Y is the 'label' column and identifiers is the
      'id' column
    """
    entry = datasets[datasetname]
    data = concatenate_datasets(entry, ['train', 'dev'])
    embeddings = concatenate_datasets(entry, ['trainembedding', 'devembedding'])

    Y = data['label']

    # NOTE(review): eval() executes whatever text the downloaded file holds;
    # confirm the source is trusted or switch to a literal-only parser.
    vectors = [np.array(eval(text)) for text in embeddings['embeddings']]
    X = np.vstack(vectors)

    identifiers = data['id']
    adj = create_adjacency_matrix(X)

    # Draw the similarity graph
    visualise_better(adj)

    return X, Y, identifiers, adj


In [None]:
import pandas as pd
import networkx as nx
import traceback  # Import traceback module for handling exceptions

def load_dataset(dataset):
    """
    Load one dataset through the handler registered for it and print a summary.

    For 'pubmed' and any name containing 'sharedtask' the dataset name itself
    is handed to the handler; for every other dataset the 'features' location
    is resolved with get_download_url first.

    Parameters:
    - dataset: Key into the global `datasets` dictionary

    Returns:
    - (X, Y, identifiers, adjacency_matrix) on success, or None when loading
      failed. Failures are reported on the console instead of being raised,
      so callers must check for None before unpacking.
    """
    try:
        # Determine the file path or URL based on the dataset
        if dataset == 'pubmed' or 'sharedtask' in dataset:
            file_path_or_url = dataset
        else:
            features_location = datasets[dataset]['features']
            file_path_or_url = get_download_url(features_location)
            print("File path: ", features_location)

            # get_download_url signals a failed download with None; stop here
            # instead of handing None to the handler.
            if file_path_or_url is None:
                raise FileNotFoundError(f"Failed to download the dataset: {features_location}")

        # Get the handler function for processing the dataset
        handler = datasets[dataset]['handler']

        # Call the handler function to load and process the dataset
        X, Y, identifiers, adjacency_matrix = handler(file_path_or_url, dataset)

        # Display information about the loaded dataset
        print("Number of nodes (X rows): ", X.shape[0])
        print("Number of features (X columns): ", X.shape[1])
        print("Number of classes or outputs (Y columns): ", Y.nunique())
        print("Number of edges in graph: ", adjacency_matrix.sum())
        print(f"Number of nodes in graph {adjacency_matrix.shape[0]}")

        # Calculate and display the homophily of the graph
        homophily_percentage = calculate_homophily(adjacency_matrix, Y)
        print(f"Homophily of graph {homophily_percentage}%")

        return X, Y, identifiers, adjacency_matrix

    except FileNotFoundError:
        # Missing or undownloadable file
        print(f"The file {dataset} was not found.")
    except Exception:
        # Any other failure: show the traceback but keep the notebook running
        traceback.print_exc()

    return None


In [None]:
import pandas as pd
import numpy as np
import networkx as nx

def add_edges(dataset, edges_file_path, G):
    """
    Add the edges listed in an edge file to graph G and return its adjacency matrix.

    For 'cora' and 'citeseer' each line is split on whitespace; for
    'us-county' the first line is skipped as a header and the remaining
    lines are split on commas. Lines that do not yield exactly two tokens
    are ignored. An edge is added only when both endpoints already exist in
    G; unknown endpoints are counted and reported.

    Parameters:
    - dataset: Dataset name selecting the edge file format
    - edges_file_path: Location of the edge file (resolved via get_download_url)
    - G: NetworkX graph whose nodes are already present; modified in place

    Returns:
    - Adjacency matrix of G as produced by nx.adjacency_matrix

    Raises:
    - ValueError: if the edge file format for `dataset` is unknown
    - FileNotFoundError: if the edge file cannot be downloaded
    """
    # Settle the line format up front so an unsupported name fails clearly
    # instead of tripping over an unbound variable inside the loop.
    if dataset == 'cora' or dataset == 'citeseer':
        delimiter, skip_header = None, False
    elif dataset == 'us-county':
        delimiter, skip_header = ',', True
    else:
        raise ValueError(f"Unsupported dataset for edge loading: {dataset}")

    print("Loading adjacency_matrix")

    # Get the file path or URL for the edges file
    file_path_or_url = get_download_url(edges_file_path)

    # Raise an exception if the file download fails
    if file_path_or_url is None:
        raise FileNotFoundError(f"Failed to download the dataset: {edges_file_path}")

    # Nodes mentioned in the edges file but not present in the graph
    erroneous_nodes = set()

    with open(file_path_or_url, 'r') as file:
        for idx, line in enumerate(file):
            if skip_header and idx == 0:
                continue

            # Strip every token: the handlers in this file add graph nodes
            # under stripped names, so padded tokens would never match.
            nodes = [token.strip() for token in line.strip().split(delimiter)]
            if len(nodes) != 2:
                continue

            # Check both endpoints so that every unknown node is counted
            missing = [name for name in nodes if not G.has_node(name)]
            if missing:
                erroneous_nodes.update(missing)
            else:
                G.add_edge(nodes[0], nodes[1])

    # Print the number of erroneous nodes found
    print(len(erroneous_nodes), "erroneous nodes found..")

    return nx.adjacency_matrix(G)


In [None]:
from sklearn.manifold import SpectralEmbedding
from sklearn.preprocessing import StandardScaler

def spectral_embedding(X, A):
    """
    Compute a standardized 5-dimensional spectral embedding of the rows of X.

    The embedding is built from a nearest-neighbours affinity on X itself
    with a fixed random_state of 42.

    Parameters:
    - X: Feature matrix, one row per sample
    - A: Adjacency matrix. Accepted for interface compatibility only; the
      embedding does not use it.

    Returns:
    - Embedding with 5 columns, standardized column-wise by StandardScaler
    """
    # Number of embedding dimensions
    n_components = 5

    # The former degree/Laplacian matrices built from A never reached the
    # embedding; they only cost memory, so they are no longer computed.
    se = SpectralEmbedding(n_components=n_components, random_state=42, affinity='nearest_neighbors')
    embedding = se.fit_transform(X)

    # Standardize the spectral embedding features
    return StandardScaler().fit_transform(embedding)

def train_linear_se_model(X_train, y_train, A):
    """
    Fit logistic regression on features extended with a spectral embedding.

    Parameters:
    - X_train: Training feature matrix
    - y_train: Training labels
    - A: Adjacency matrix forwarded to spectral_embedding

    Returns:
    - Fitted LogisticRegression (max_iter=1000) trained on the normalized
      concatenation of X_train and its spectral embedding
    """
    augmented = np.concatenate((X_train, spectral_embedding(X_train, A)), axis=1)
    augmented = normalize_features(augmented)

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(augmented, y_train)
    return classifier

def train_mlp_se_model(X_train, y_train, A, validation_fraction):
    """
    Grid-search an MLP on features extended with a spectral embedding.

    Parameters:
    - X_train: Training feature matrix
    - y_train: Training labels
    - A: Adjacency matrix forwarded to spectral_embedding
    - validation_fraction: Share of the training data the MLP holds out for
      early stopping

    Returns:
    - Best estimator found by a 2-fold grid search scored by negative log loss
    """
    augmented = np.concatenate((X_train, spectral_embedding(X_train, A)), axis=1)
    augmented = normalize_features(augmented)

    # Hyperparameter grid explored by the search
    search_space = {'hidden_layer_sizes': [(50,), (100,)],
                    'alpha': [0.001, 0.01],
                    'learning_rate_init': [0.001, 0.01]}

    estimator = MLPClassifier(max_iter=500, early_stopping=True, validation_fraction=validation_fraction)
    search = GridSearchCV(estimator, search_space, cv=2, scoring='neg_log_loss', n_jobs=-1)
    search.fit(augmented, y_train)

    return search.best_estimator_


In [None]:

def train_linear_model(X_train, y_train):
    """Fit and return a LogisticRegression (max_iter=1000) on the training data."""
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    return classifier

def train_mlp_model(X_train, y_train, validation_fraction):
    """
    Grid-search an MLP classifier with early stopping.

    Parameters:
    - X_train: Training feature matrix
    - y_train: Training labels
    - validation_fraction: Share of the training data the MLP holds out for
      early stopping

    Returns:
    - Best estimator found by a 2-fold grid search scored by negative log loss
    """
    # Hyperparameter grid explored by the search
    search_space = {'hidden_layer_sizes': [(50,), (100,)],
                    'alpha': [0.001, 0.01],
                    'learning_rate_init': [0.001, 0.01]}

    estimator = MLPClassifier(max_iter=500, early_stopping=True, validation_fraction=validation_fraction)
    search = GridSearchCV(estimator, search_space, cv=2, scoring='neg_log_loss', n_jobs=-1)
    search.fit(X_train, y_train)

    return search.best_estimator_

def train_base_predictor(X_train, y_train, X, model_type='linear', adj=np.zeros((1, 1)), validation_fraction=0.1):
    """
    Train a base classifier and return its class probabilities for X.

    Parameters:
    - X_train: Training feature matrix
    - y_train: Training labels
    - X: Feature matrix to predict on
    - model_type: 'linear', 'mlp', 'linear_se' or 'mlp_se'
    - adj: Adjacency matrix forwarded to the spectral-embedding variants
    - validation_fraction: Early-stopping hold-out share for the MLP variants

    Returns:
    - Output of predict_proba on X (plain variants) or on X extended with its
      spectral embedding and normalized ('_se' variants)

    Raises:
    - ValueError: if model_type is not one of the four supported names
    """
    if model_type not in ('linear', 'mlp', 'linear_se', 'mlp_se'):
        raise ValueError("Invalid model_type. Choose 'linear', 'mlp', 'linear_se', or 'mlp_se'.")

    # Plain variants predict directly on the given features
    if model_type in ('linear', 'mlp'):
        if model_type == 'linear':
            base_model = train_linear_model(X_train, y_train)
        else:
            base_model = train_mlp_model(X_train, y_train, validation_fraction)
        return base_model.predict_proba(X)

    # Spectral-embedding variants
    if model_type == 'linear_se':
        base_model = train_linear_se_model(X_train, y_train, adj)
    else:
        base_model = train_mlp_se_model(X_train, y_train, adj, validation_fraction)

    # NOTE(review): the embedding and the normalization are recomputed on X
    # here, independently of the ones fitted on X_train during training.
    X_expand = np.concatenate((X, spectral_embedding(X, adj)), axis=1)
    X_expand = normalize_features(X_expand)
    return base_model.predict_proba(X_expand)


In [None]:
# ERROR PROPAGATION

def calculate_normalized_adjacency_matrix(A):
    """
    Compute the symmetrically normalized adjacency S = D^(-1/2) A D^(-1/2).

    D holds the row sums of A. Rows whose sum is not positive (for example
    isolated nodes) get a scaling factor of 0, so S stays finite and no
    division warning is emitted.

    Parameters:
    - A: Square adjacency matrix, dense (array-like) or SciPy sparse.
      Sparse input is converted to a dense array.

    Returns:
    - Dense float ndarray S with the same shape as A
    """
    if hasattr(A, 'toarray'):
        A = A.toarray()
    A = np.asarray(A, dtype=float)

    degrees = A.sum(axis=1)

    # Invert the square root only where the degree is positive
    inv_sqrt_degree = np.zeros_like(degrees)
    positive = degrees > 0
    inv_sqrt_degree[positive] = 1.0 / np.sqrt(degrees[positive])

    # Row and column scaling by broadcasting equals multiplying with the
    # diagonal matrices on both sides, without building them.
    return (inv_sqrt_degree[:, None] * A) * inv_sqrt_degree[None, :]

def calculate_residuals(Y, Z):
    """Return the element-wise difference Y - Z between targets and predictions."""
    return Y - Z

def label_spreading(E, S, alpha, max_iter=50):
    """
    Repeatedly blend a matrix with its propagated version S @ W.

    Each step computes (1 - alpha) * W + alpha * S W, starting from a copy
    of E. Iteration stops after `max_iter` steps, or as soon as the norm of
    the change drops below 0.0001; in the latter case the iterate from
    before that final step is returned.

    Parameters:
    - E: Initial matrix (left unmodified)
    - S: Propagation matrix
    - alpha: Weight of the propagated part
    - max_iter: Maximum number of steps

    Returns:
    - The spread matrix
    """
    current = np.copy(E)
    steps_left = max_iter
    while steps_left > 0:
        steps_left -= 1
        blended = (1 - alpha) * current + alpha * np.dot(S, current)
        if np.linalg.norm(current - blended) < 0.0001:
            break
        current = blended
    return current

def correct_predictions(Z, E_hat):
    """Return the predictions Z shifted by the error estimates E_hat (Z + E_hat)."""
    return Z + E_hat

def error_propagation(Z, Y, A, train_end, validation_end, alpha_values=[0.001, 0.01, 0.02, 0.05, 0.1], max_iter=100):
    """
    Spread residual errors over the graph and pick alpha on the validation rows.

    Rows [0, train_end) are treated as training rows, rows
    [train_end, validation_end) as validation rows and the rest as test
    rows. Each alpha is scored by validation accuracy after spreading the
    training residuals only; ties go to the later alpha. The returned result
    is then recomputed with the chosen alpha using training and validation
    residuals.

    Parameters:
    - Z: Predicted class scores, one row per node
    - Y: Target matrix with the same shape as Z
    - A: Adjacency matrix
    - train_end: End index (exclusive) of the training rows
    - validation_end: End index (exclusive) of the validation rows
    - alpha_values: Candidate alpha values
    - max_iter: Maximum spreading steps per run

    Returns:
    - (Z_corrected, F_hat, F): corrected predictions, spread residuals and
      the residual matrix that was spread
    """
    S = calculate_normalized_adjacency_matrix(A)

    residuals = calculate_residuals(Y, Z)

    # Two masked versions: one keeps training and validation residuals, the
    # other keeps training residuals only.
    residuals_train_val = np.copy(residuals)
    residuals_train_val[validation_end:, :] = 0
    residuals[train_end:, :] = 0

    best_alpha, max_acc = None, 0

    for alpha in alpha_values:
        spread = label_spreading(np.copy(residuals), S, alpha, max_iter)
        candidate = correct_predictions(np.copy(Z), spread)

        # Score this alpha on the validation rows
        validation_acc = accuracy_score(np.argmax(Y[train_end:validation_end], axis=1),
                                        np.argmax(candidate[train_end:validation_end], axis=1))
        print(f"alpha {alpha}, validation_acc {validation_acc}")

        # '>=' lets a later alpha replace an earlier one with equal accuracy
        if validation_acc >= max_acc:
            max_acc, best_alpha = validation_acc, alpha

    # Final run with the selected alpha, now including validation residuals
    F = np.copy(residuals_train_val)
    F_hat = label_spreading(F, S, best_alpha, max_iter)
    Z_corrected = correct_predictions(np.copy(Z), F_hat)

    print("Best alpha:", best_alpha)
    return (Z_corrected, F_hat, F)


In [None]:
# SCALING

def autoscale(E_hat, E, L):
    """
    Rescale the error estimates of rows L onwards to a common L1 norm.

    The target norm sigma is the mean L1 row norm of E[:L]. Every row of
    E_hat from index L on is multiplied by sigma divided by its own L1 norm.
    Both sigma and the row norms are floored at 1e-10 to avoid dividing by
    zero.

    Parameters:
    - E_hat: Error estimates; a 1-D input is treated as a single column
    - E: Residual matrix that supplies the reference scale
    - L: Index of the first row to rescale

    Returns:
    - Rescaled copy of E_hat (the input is left unmodified)
    """
    scaled = np.copy(E_hat)
    if E_hat.ndim == 1:
        scaled = scaled.reshape(-1, 1)

    # Reference scale from the first L residual rows
    reference_norms = np.linalg.norm(E[:L], ord=1, axis=1)
    sigma = max(np.mean(reference_norms), 1e-10)

    # Row norms of the part that gets rescaled, floored to avoid division by zero
    tail_norms = np.maximum(
        np.linalg.norm(scaled[L:], ord=1, axis=1, keepdims=True),
        1e-10,
    )

    scaled[L:] *= (sigma / tail_norms)
    return scaled


def scaled_fixed_diffusion(E, A, val_end, max_iter=100, error_margin=1e-6):
    """
    Diffuse error estimates to the rows after val_end, keeping earlier rows fixed.

    In every step each row from val_end onwards is replaced by the
    degree-normalized sum of its neighbours' current values; rows before
    val_end always keep the values from E. Degrees are the column sums of A.
    Nodes with degree 0 have no neighbours to average over and receive 0.

    Parameters:
    - E: Error estimates, one row per node (left unmodified)
    - A: Adjacency matrix, dense (array-like) or SciPy sparse
    - val_end: Index of the first row that is updated
    - max_iter: Maximum number of diffusion steps
    - error_margin: Stop early once the largest absolute change of an
      updated entry in one step falls below this value

    Returns:
    - Float array with the diffused estimates
    """
    if not hasattr(A, 'toarray'):
        A = np.asarray(A)

    # Reciprocal degrees replace the dense inverse of the diagonal degree
    # matrix; this also keeps zero-degree nodes from making it singular.
    degrees = np.asarray(A.sum(axis=0), dtype=float).ravel()
    inv_degrees = np.zeros_like(degrees)
    connected = degrees != 0
    inv_degrees[connected] = 1.0 / degrees[connected]

    E_fixed = np.array(E, dtype=float)

    # Nothing to diffuse into when no rows lie after val_end
    if E_fixed[val_end:].size == 0:
        return E_fixed

    # Shape the factors so they scale rows for 1-D and 2-D estimates alike
    row_scale = inv_degrees.reshape((-1,) + (1,) * (E_fixed.ndim - 1))

    for _ in range(max_iter):
        diffused = row_scale * np.asarray(A @ E_fixed)
        change = np.max(np.abs(diffused[val_end:] - E_fixed[val_end:]))
        # Rows before val_end are never written, so they stay equal to E
        E_fixed[val_end:] = diffused[val_end:]
        if change < error_margin:
            break

    return E_fixed

def apply_scaling_methods(Z_corrected, E_hat, E, scaling_type, A, val_end):
    """
    Add a rescaled error estimate to the corrected predictions.

    Parameters:
    - Z_corrected: Predictions to adjust
    - E_hat: Error estimates to rescale
    - E: Residual matrix (used by 'autoscale' only)
    - scaling_type: 'autoscale' or 'fixed_diffusion'
    - A: Adjacency matrix (used by 'fixed_diffusion' only)
    - val_end: Row index forwarded to the selected scaling function

    Returns:
    - Z_corrected plus the scaled estimates, or None for any other
      scaling_type
    """
    if scaling_type == 'autoscale':
        adjustment = autoscale(E_hat, E, val_end)
    elif scaling_type == 'fixed_diffusion':
        adjustment = scaled_fixed_diffusion(E_hat, A, val_end)
    else:
        # Unrecognized scaling types produce no result
        return None
    return Z_corrected + adjustment


In [None]:
# SMOOTHING FINAL PREDICTIONS

def final_label_propagation(Z_corrected, Y, A, train_end, validation_end, alpha_values=[0.001, 0.01, 0.02, 0.05, 0.1], max_iter=100):
    """
    Smooth the corrected predictions over the graph with a fixed alpha of 0.01.

    Each step computes (1 - alpha) * H + alpha * S H with the normalized
    adjacency S, stopping after `max_iter` steps or once the norm of the
    change drops below 0.0001. Afterwards the rows before `validation_end`
    are overwritten with the corresponding rows of Y.

    Parameters:
    - Z_corrected: Predictions to smooth (left unmodified)
    - Y: Target matrix supplying the rows that are restored
    - A: Adjacency matrix
    - train_end: Not used
    - validation_end: Rows before this index are replaced by Y
    - alpha_values: Not used
    - max_iter: Maximum number of smoothing steps

    Returns:
    - Smoothed prediction matrix
    """
    smoothing_alpha = 0.01

    S = calculate_normalized_adjacency_matrix(A)
    smoothed = np.copy(Z_corrected)

    for _ in range(max_iter):
        candidate = (1 - smoothing_alpha) * smoothed + smoothing_alpha * np.dot(S, smoothed)
        if np.linalg.norm(smoothed - candidate) < 0.0001:
            break
        smoothed = candidate

    # Rows with known targets keep their true values
    smoothed[:validation_end] = Y[:validation_end]

    return smoothed


In [None]:
# EVALUATE ACCURACIES

def evaluate_accuracies(Y, Z, name, val_end):
    """
    Report the accuracy on the rows from val_end onwards.

    Parameters:
    - Y: True class labels, one per row
    - Z: Class score matrix; the predicted class is the argmax of each row
    - name: Label of the evaluated method, used in the printed message
    - val_end: Index of the first row that is evaluated

    Returns:
    - acc: Accuracy as a percentage rounded to two decimals
    """
    true_labels = Y[val_end:]
    predicted_labels = np.argmax(Z[val_end:], axis=1)

    acc = round(accuracy_score(true_labels, predicted_labels) * 100, 2)

    print(f"Accuracy after {name}: {acc}%")
    return acc


In [None]:
# NORMALIZE FEATURES

from sklearn.preprocessing import MinMaxScaler, StandardScaler

def normalize_features(X):
    """
    Rescale every feature column with a freshly fitted MinMaxScaler.

    Parameters:
    - X: Input feature matrix

    Returns:
    - X_normalized: Feature matrix transformed by the scaler fitted on X
    """
    return MinMaxScaler().fit_transform(X)


In [None]:
# Define a dictionary containing information about different datasets
#
# Each entry maps a dataset name to its configuration:
#   - 'handler': function that load_dataset calls to build X, Y, identifiers
#     and the adjacency matrix
#   - 'features' / 'edges': file locations read by load_dataset and add_edges
#   - 'tx', 'ty', 'allx', 'ally', 'graph': pickled components read by
#     handle_datasets3
#   - 'train', 'dev', 'trainembedding', 'devembedding': CSV files read by
#     handle_datasets4
#   - 'train_split' / 'validation_split': fractions of the rows used to
#     compute the split indices in the training loop below
#   - 'test_split': stored next to the other splits; the code visible here
#     does not read it
# Only the uncommented entries are processed by the training loop.
datasets = {
    # 'citeseer': {
    #     'features':"https://drive.google.com/file/d/1Jxb4mR8sT92Rc2tjBZf2jTo9X9g19iNh/view?usp=drive_link",
    #     'edges':"https://drive.google.com/file/d/15VeHRUjnCuGCqkz3fieLAivNl35C9bG-/view?usp=drive_link",
    #     'train_split': 0.6,
    #     'test_split': 0.2,
    #     'validation_split': 0.2,
    #     'handler':handle_datasets1
    # },
    # 'us-county': {
    #     'features':"https://drive.google.com/file/d/1OKa4otThbZWZq2cscmN-4MC0zYaG2hXF/view?usp=drive_link",
    #     'edges':"https://drive.google.com/file/d/1MRS3XcqAYeV53EFAP-FPhghxkU8uaPSR/view?usp=drive_link",
    #     'train_split': 0.4,
    #     'test_split': 0.5,
    #     'validation_split': 0.1,
    #     'handler':handle_datasets2

    # },
    #     'cora': {
    #     'features':"https://drive.google.com/file/d/1vypw02B3Rq6Knen9JBu0kZ-PPbCEQsbJ/view?usp=drive_link",
    #     'edges':"https://drive.google.com/file/d/16S7tezMQK6ksqtCxhQwyBkP99i5IdYKE/view?usp=drive_link",
        # 'train_split': 0.6,
        # 'test_split': 0.2,
        # 'validation_split': 0.2,
    #     'handler':handle_datasets2
    # },
    # 'pubmed': {
    #     'tx':"https://drive.google.com/file/d/1eNbbzB5h0kmqNt1Lk6sTMeX1V3RO52DE/view?usp=drive_link",
    #     'ty':"https://drive.google.com/file/d/1Sfr4TzMrO4iFUGTmoXumNsccVHSsoAua/view?usp=drive_link",
    #     'graph':"https://drive.google.com/file/d/1T-ImSb-X9KgiIvcA-Aj9hYh8AWoh103S/view?usp=drive_link",
    #     'allx':"https://drive.google.com/file/d/1tg2qFpzJrNkkSC3OkaktHs4joshwi2Lq/view?usp=drive_link",
    #     'ally':"https://drive.google.com/file/d/1sX27bffBGa7TKh_Uoc6qHAyex7biHxoP/view?usp=drive_link",

    #     'train_split': 0.6,
    #     'test_split': 0.2,
    #     'validation_split': 0.2,
    #     'handler':handle_datasets3
    # },

    # 'sharedtasksgcn': {
    #     'dev':"https://drive.google.com/file/d/1j3shDaPEq8RUeDUz1X0pc6I0jGABpGyF/view?usp=drive_link",
    #     'train':"https://drive.google.com/file/d/15XtL2MReWk1s_Hw5FzUdhEUX8OByxKaQ/view?usp=drive_link",
    #     'devembedding':"https://drive.google.com/file/d/180Krz-rID0XeTCyIeKeleY4AHwexo8n2/view?usp=drive_link",
    #     'trainembedding':"https://drive.google.com/file/d/1SVth-3S3PewWJjFd_Qny_Mm2UJNtAzFT/view?usp=drive_link",
    #     'handler':handle_datasets4,
    #     'train_split':0.85,
    #     'test_split': 0.075,
    #     'validation_split': 0.075,
    # },

    'sharedtaskbert': {
        'dev':"https://drive.google.com/file/d/1j3shDaPEq8RUeDUz1X0pc6I0jGABpGyF/view?usp=drive_link",
        'train':"https://drive.google.com/file/d/15XtL2MReWk1s_Hw5FzUdhEUX8OByxKaQ/view?usp=drive_link",
        'devembedding':"https://drive.google.com/file/d/1xElM3C6JiRso0yn94XLGROA9qtl6Ayqp/view?usp=drive_link",
        'trainembedding':"https://drive.google.com/file/d/1RzdRA2zYjeM9bZNCRUNRNBCIwR5zd7wv/view?usp=drive_link",
        'handler':handle_datasets4,
        'train_split':0.85,
        'test_split': 0.075,
        'validation_split': 0.075,

    },
}


import pandas as pd
import time
import matplotlib.pyplot as plt


# Driver for the whole experiment: for every configured dataset, load and
# normalize the features, plot the label distributions of the contiguous
# train/validation/test splits, then run each base model through
# base prediction -> error propagation -> scaling -> final label propagation,
# recording the result of evaluate_accuracies() after every stage.
#
# Results are stored as accuracies[(dataset_name, model_type)][stage_name].
accuracies = {}

# Iterate through each dataset in the dictionary
for dataset_name, dataset_info in datasets.items():
    print(f"\n\n--- Training on {dataset_name} dataset ---")

    # Load the dataset (features, labels, identifiers and adjacency matrix),
    # then normalize the features. `identifiers` is not used again in this block.
    print("Loading dataset...")
    X, Y, identifiers,adjacency_matrix = load_dataset(dataset_name)
    X=normalize_features(X)

    # Preprocessing labels: integer-encode the raw labels, then one-hot encode
    # the integer codes. Both encodings are kept because later stages take
    # either Y_numerical or Y_onehot.
    print("Preprocessing labels...")
    label_encoder = LabelEncoder()
    Y_numerical = label_encoder.fit_transform(Y)
    Y_onehot = to_categorical(Y_numerical)

    # Calculate the split indices. Splits are contiguous by row order:
    #   train      = rows [0, train_end)
    #   validation = rows [train_end, validation_end)
    #   test       = rows [validation_end, end)
    # 'test_split' from the config is never read here; the test set is simply
    # whatever remains after train + validation.
    train_end=  int(dataset_info['train_split'] * X.shape[0])
    validation_end = int((dataset_info['train_split'] +dataset_info['validation_split']) * X.shape[0])

    # Plot label distributions for each split and for the full dataset on a 2x2 grid.
    # Y is presumably a pandas Series (it is sliced and .value_counts() is called) -- TODO confirm.
    fig, axs = plt.subplots(2, 2, figsize=(12, 8))

    Y[:train_end].value_counts().plot(kind='bar', title=dataset_name + ' Training label distribution', ax=axs[0, 0])
    Y[train_end:validation_end].value_counts().plot(kind='bar', title=dataset_name + ' Validation label distribution', ax=axs[0, 1])
    Y[validation_end:].value_counts().plot(kind='bar', title=dataset_name + ' Testing label distribution', ax=axs[1, 0])
    Y.value_counts().plot(kind='bar', title=dataset_name + ' label distribution', ax=axs[1, 1])

    # Captions are placed with negative y coordinates, i.e. below the bottom edge of the figure.
    fig.text(0.5, -0.1, f'Number of Nodes: {X.shape[0]}', ha='center', va='top')
    fig.text(0.5, -0.15, f'Homophily: {calculate_homophily(adjacency_matrix, Y)}%', ha='center', va='top')

    plt.tight_layout()
    plt.show()

    # Iterate through each model type. Names containing 'se' are additionally
    # given the adjacency matrix; names containing 'mlp' are trained on
    # train + validation rows with an explicit validation_fraction.
    for model_type in ['linear', 'mlp', 'linear_se', 'mlp_se']:
        accuracies[(dataset_name, model_type)] = {}

        # Split the datasets. These slices do not depend on model_type, so they
        # are recomputed identically on every iteration of this loop.
        X_train = X[:train_end]
        Y_train = Y_numerical[:train_end]

        X_train_val = X[:validation_end]
        Y_train_val = Y_numerical[:validation_end]

        # NOTE(review): X_test and Y_test_onehot are assigned but not used
        # anywhere in this block; evaluation below works on the full arrays
        # together with validation_end.
        X_test = X[validation_end:]
        Y_test_onehot = Y_onehot[validation_end:]


        # Train base predictor and evaluate. Z holds predictions for all rows of X
        # (X is passed in full as the third argument).
        # NOTE(review): for MLP variants the rows passed in are train + validation,
        # but validation_fraction is computed as validation_split / train_split.
        # If train_base_predictor holds out that fraction of the rows it receives,
        # the matching ratio would be validation_split / (train_split + validation_split)
        # -- confirm against train_base_predictor before changing.
        print(f"Training {model_type.upper()} base predictor...")
        start_time = time.time()
        Z = train_base_predictor(X_train_val if ('mlp' in model_type) else X_train, Y_train_val if ('mlp' in model_type) else Y_train, X, model_type, adj=adjacency_matrix if ('se' in model_type) else None, validation_fraction=dataset_info['validation_split'] / dataset_info['train_split'] if (model_type == 'mlp' or model_type == 'mlp_se') else None)
        end_time = time.time()
        print(f"Time taken for {model_type.upper()} base predictor training: {end_time - start_time} seconds")
        accuracies[(dataset_name, model_type)]['base'] = evaluate_accuracies(Y_numerical, Z, f"{model_type.upper()} base", validation_end)

        # Error propagation: corrects the base predictions Z using the one-hot
        # labels and the graph; also returns E_hat and E, which the scaling
        # step below consumes.
        print(f"Performing error propagation for {model_type}...")
        start_time = time.time()
        Z_corrected, E_hat, E = error_propagation(Z, Y_onehot, adjacency_matrix, train_end,validation_end)
        end_time = time.time()
        print(f"Time taken for error propagation for {model_type}: {end_time - start_time} seconds")
        accuracies[(dataset_name, model_type)]['error_propagation'] = evaluate_accuracies(Y_numerical, Z_corrected, f"{model_type} error propagation", validation_end)

        # Scaling: each method is applied independently to the same Z_corrected,
        # and each scaled result gets its own final smoothing pass.
        scaling_methods=['autoscale','fixed_diffusion']
        for scaling_method in scaling_methods:
          # Human-readable tag used in log lines and passed to evaluate_accuracies.
          name=model_type+" "+ scaling_method
          print(f"Applying scaling method : {name}...")
          start_time = time.time()
          Z_scaled = apply_scaling_methods(Z_corrected, E_hat, E,scaling_method, adjacency_matrix, validation_end)
          end_time = time.time()
          print(f"Time taken for scaling method for {name} : {end_time - start_time} seconds")
          accuracies[(dataset_name, model_type)][scaling_method] = evaluate_accuracies(Y_numerical, Z_scaled, name, validation_end)

          # Smoothing of final predictions; the result is stored under the key
          # "Label Propagation <scaling_method>".
          print(f"Performing final label propagation smoothing for {name}...")
          start_time = time.time()
          Ypred = final_label_propagation(Z_scaled, Y_onehot, adjacency_matrix, train_end,validation_end,max_iter= 100)
          end_time = time.time()
          print(f"Time taken for final label propagation smoothing for {name} {end_time - start_time} seconds")
          accuracies[(dataset_name, model_type)]["Label Propagation "+scaling_method] = evaluate_accuracies(Y_numerical, Ypred, "Label Propagation "+name, validation_end)




--- Training on sharedtaskbert dataset ---
Loading dataset...
Downloading from https://drive.google.com/uc?export=download&id=15XtL2MReWk1s_Hw5FzUdhEUX8OByxKaQ
Downloading from https://drive.google.com/uc?export=download&id=1j3shDaPEq8RUeDUz1X0pc6I0jGABpGyF
Downloading from https://drive.google.com/uc?export=download&id=1RzdRA2zYjeM9bZNCRUNRNBCIwR5zd7wv
Downloading from https://drive.google.com/uc?export=download&id=1xElM3C6JiRso0yn94XLGROA9qtl6Ayqp


In [None]:
import pandas as pd
import numpy as np

# Function to highlight maximum values in specified rows of a DataFrame
def highlight_max_in_rows(df, color='lightgreen'):
    """Build a CSS style table that highlights the best value per column group.

    Columns are processed in consecutive groups of four. Within each group,
    only a fixed set of rows is considered; every cell in those rows that
    equals the group's maximum over those rows receives the highlight style.

    Intended for use with ``Styler.apply(..., axis=None)``, which expects a
    DataFrame of CSS strings with the same index and columns as ``df``.

    Args:
        df: DataFrame of comparable (numeric) values.
        color: CSS background colour for highlighted cells.

    Returns:
        DataFrame of strings shaped like ``df``: the highlight style for
        winning cells, ``''`` everywhere else.
    """
    attr = f'background-color: {color}; color: black'

    # Start with no styling anywhere.
    result = pd.DataFrame('', index=df.index, columns=df.columns)

    # Rows to compare, expressed as POSITIONS (not labels): rows 3 and 5 when
    # the table has at least six rows, otherwise the last (up to) two rows.
    # The fallback previously used index labels as positions, which fails
    # for any non-integer index.
    n_rows = len(df)
    n_cols = len(df.columns)
    if n_rows >= 6:
        row_positions = [3, 5]
    else:
        row_positions = list(range(max(n_rows - 2, 0), n_rows))

    # Nothing to compare in an empty table.
    if not row_positions:
        return result

    # Iterate through columns in groups of 4 (the last group may be shorter).
    for start in range(0, n_cols, 4):
        col_positions = list(range(start, min(start + 4, n_cols)))

        # Maximum over the selected rows and the columns of this group.
        max_value = df.iloc[row_positions, col_positions].max().max()

        # Highlight every cell that attains the maximum. A NaN maximum
        # never compares equal, so nothing is highlighted in that case.
        for row_pos in row_positions:
            for col_pos in col_positions:
                if df.iat[row_pos, col_pos] == max_value:
                    result.iat[row_pos, col_pos] = attr
    return result


# Function to display accuracies DataFrame with styling
def display_accuracies(accuracies):
    """Turn the accuracies dictionary into a styled table for notebook display.

    Args:
        accuracies: dictionary accepted by ``pd.DataFrame``; each key becomes
            a column of the resulting table.

    Returns:
        The pandas Styler of that table, with black-on-white cells, the cells
        selected by ``highlight_max_in_rows`` highlighted, bold data cells,
        white-on-black headers, and a right border on every 4th
        ``nth-child`` position starting at 5.
    """
    table = pd.DataFrame(accuracies)

    def _black_text(row):
        # One 'color' rule per cell of the row.
        return ['color: black' for _ in range(len(row))]

    def _white_background(row):
        # One 'background-color' rule per cell of the row.
        return ['background-color: white' for _ in range(len(row))]

    # Base look first, then the table-wide highlight pass on top of it.
    styler = table.style
    styler = styler.apply(_black_text, axis=1)
    styler = styler.apply(_white_background, axis=1)
    styler = styler.apply(highlight_max_in_rows, axis=None)

    # Vertical separators: nth-child positions 5, 9, 13, ... up to the column count.
    group_borders = [
        {
            'selector': f'th:nth-child({child}), td:nth-child({child})',
            'props': [('border-right', '2px solid')],
        }
        for child in range(5, len(table.columns) + 1, 4)
    ]

    # Header and data-cell rules shared by the whole table.
    header_rule = {'selector': 'th', 'props': [('background-color', 'black'), ('color', 'white')]}
    cell_rule = {'selector': 'td', 'props': [('font-weight', 'bold')]}

    styler.set_table_styles([header_rule, cell_rule] + group_borders, overwrite=False)

    return styler

In [None]:
# Build the styled accuracy table from the `accuracies` dict; as the cell's
# last expression, the returned Styler is what the notebook displays.
display_accuracies(accuracies)