<a href="https://colab.research.google.com/github/sahilsait/credit-risk-assessment-using-GNNs/blob/main/graph_construction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install torch-geometric



In [17]:
# Mount Google Drive at /content/drive; later cells read the input CSV from,
# and write the graph file to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.preprocessing import StandardScaler

### Create similarity matrix between financial indicators using cosine similarity

In [19]:
# Load the preprocessed table that every later cell works from.
df = pd.read_csv('/content/drive/MyDrive/datasets/preprocessed_data.csv')

# Indicator columns used downstream. Order matters: position k in this list
# is row/column k of the similarity matrix and node k of every graph.
features = [
    'at',
    'lt',
    'ceq',
    'che',
    'dltt',
    'dlc',
    'act',
    'lct',
    'invt',
    'sale',
    'oibdp',
    'ni',
    'xint',
    'oancf',
    'debt_total',
    'short_term_debt_ratio',
    'interest_coverage',
    'roa',
    'current_ratio',
    'quick_ratio',
]

In [20]:
def create_similarity_matrix(df, features):
    """
    Build the pairwise cosine-similarity matrix between feature columns.

    Each column of ``df[features]`` is treated as one vector (one entry per
    row of ``df``). Entry ``[i, j]`` of the result is the cosine of the angle
    between column ``i`` and column ``j``.

    Args:
        df: DataFrame containing every column named in ``features``.
        features: Sequence of column names; its order fixes the row/column
            order of the returned matrix.

    Returns:
        ``np.ndarray`` of shape ``(len(features), len(features))``. Any pair
        whose norm product is not strictly positive (for example when one
        column is all zeros, or a norm is NaN) gets similarity 0.0, so an
        all-zero column also has 0.0 on the diagonal.
    """
    data = df[features].values
    n_features = len(features)
    similarity_matrix = np.zeros((n_features, n_features))

    # A column's norm does not depend on which column it is paired with, so
    # compute each one once instead of twice per (i, j) pair in the loop.
    norms = [np.linalg.norm(data[:, k]) for k in range(n_features)]

    for i in range(n_features):
        vec_i = data[:, i]
        for j in range(n_features):
            denominator = norms[i] * norms[j]

            # Safe division: when the denominator is zero (or NaN) the entry
            # keeps the 0.0 it was initialised with by np.zeros.
            if denominator > 0:
                similarity_matrix[i, j] = np.dot(vec_i, data[:, j]) / denominator

    return similarity_matrix

# Dataset-level similarity: each feature column is compared over all rows of df.
similarity_matrix = create_similarity_matrix(df, features)

In [21]:
# Sanity report for the dataset-level similarity matrix. Expected values:
#   shape    -> (len(features), len(features))
#   diagonal -> 1 for every column with a non-zero norm
#   range    -> within [-1, 1], since the entries are cosines
print(
    "Similarity Matrix Properties:",
    f"Shape: {similarity_matrix.shape}",
    f"Diagonal values: {np.diagonal(similarity_matrix)}",
    f"Value range: [{similarity_matrix.min():.3f}, {similarity_matrix.max():.3f}]",
    sep="\n",
)

Similarity Matrix Properties:
Shape: (20, 20)
Diagonal values: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Value range: [-0.263, 1.000]


### Create maximum spanning tree from similarity matrix

In [22]:
def create_maximum_spanning_tree(similarity_matrix):
    """
    Turn a similarity matrix into a bidirectional spanning-tree edge list.

    The imported SciPy routine computes a *minimum* spanning tree, so the
    similarities are negated first: the tree that minimises the negated
    weights is the one that maximises the original similarities.

    Args:
        similarity_matrix: N x N array of pairwise similarities; N is
            reported as the number of nodes.

    Returns:
        ``torch.long`` tensor of shape ``[2, 2 * E]``, where ``E`` is the
        number of tree edges found. Columns ``0..E-1`` hold each edge as
        (row, col); columns ``E..2E-1`` hold the same edges reversed.

    Side effects:
        Prints the node count and the (bidirectional) edge count.
    """
    # Negate, solve the minimum problem, and densify in one go.
    tree = minimum_spanning_tree(-similarity_matrix).toarray()

    # Every nonzero cell (r, c) of the dense tree is one selected edge.
    rows, cols = np.nonzero(tree)

    # List each edge in both directions so message passing is symmetric.
    sources = np.concatenate([rows, cols])
    targets = np.concatenate([cols, rows])
    edge_index = torch.tensor(np.stack([sources, targets]), dtype=torch.long)

    print("\nMST Statistics:")
    print(f"Number of nodes: {similarity_matrix.shape[0]}")
    print(f"Number of edges: {edge_index.shape[1]}")

    return edge_index

# Build the dataset-level edge list from the dataset-level similarity matrix.
# NOTE(review): in the cells visible here nothing reads this variable afterwards;
# create_graph_data computes its own edge_index per group — confirm whether it is still needed.
edge_index = create_maximum_spanning_tree(similarity_matrix)


MST Statistics:
Number of nodes: 20
Number of edges: 38


### Create PyG graph data object for a single company-year record

In [23]:
def create_graph_data(company_data, features):
    """
    Create a PyG ``Data`` object for one company record.

    Every feature becomes one node carrying a single scalar value. The nodes
    are connected by the maximum spanning tree of the cosine similarities
    computed from ``company_data`` itself.

    Args:
        company_data: Record for one company; ``company_data[features]`` must
            hold exactly ``len(features)`` values (i.e. a single row).
        features: Sequence of column names; one graph node per name, in order.

    Returns:
        ``Data`` with ``x`` of shape ``[len(features), 1]`` and ``edge_index``
        of shape ``[2, num_edges]``.

    Raises:
        ValueError: If ``company_data[features]`` does not contain exactly
            ``len(features)`` values (for example several rows in one group).

    Side effects:
        Prints a short structure report, in addition to whatever
        ``create_maximum_spanning_tree`` prints.
    """
    # One node per feature; derived from the argument instead of a literal 20
    # so the function keeps working if the feature list changes.
    num_nodes = len(features)

    # 1. Create feature matrix (x)
    values = company_data[features].values
    if values.size != num_nodes:
        raise ValueError(
            f"Expected exactly one row of {num_nodes} feature values, "
            f"got an array of shape {values.shape}"
        )
    x = torch.tensor(values, dtype=torch.float).reshape(num_nodes, 1)

    # 2. Create similarity matrix
    # NOTE(review): with a single row every column vector has length 1, so the
    # cosine similarity of any pair is -1, 0 or +1 (up to rounding) and the
    # tree below is decided mostly by tie-breaking. A dataset-level edge list
    # is already computed at notebook level; consider reusing it — confirm intent.
    similarity_matrix = create_similarity_matrix(company_data, features)

    # 3. Create edge_index using MST
    edge_index = create_maximum_spanning_tree(similarity_matrix)

    # 4. Create PyG Data object
    data = Data(
        x=x,                    # Node features: [num_nodes, 1]
        edge_index=edge_index   # Edge connections: [2, num_edges]
    )

    # 5. Verify the graph structure
    print("\nGraph Structure Verification:")
    print(f"Number of nodes: {data.num_nodes}")        # Should be len(features)
    print(f"Number of edges: {data.num_edges}")        # 2 * (len(features) - 1) for a connected tree
    print(f"Feature shape: {data.x.shape}")           # Should be [len(features), 1]
    print(f"Edge index shape: {data.edge_index.shape}")# Should be [2, num_edges]

    return data

In [24]:
def create_all_graphs(df, features):
    """
    Build one graph per (gvkey, fyear) group of ``df``.

    Groups are processed best-effort: if building a graph raises, the error
    is printed together with the group key and that group is skipped.

    Args:
        df: DataFrame with ``gvkey`` and ``fyear`` columns plus every column
            named in ``features``.
        features: Column names passed through to ``create_graph_data``.

    Returns:
        List of the graphs that were built successfully, in groupby order.
    """
    built = []

    for key, group in df.groupby(['gvkey', 'fyear']):
        company, year = key
        try:
            built.append(create_graph_data(group, features))
        except Exception as e:
            # Report and move on so one bad group does not abort the whole run.
            print(f"Error processing company {company} for year {year}: {e}")

    print(f"\nCreated {len(built)} graphs")
    return built

In [25]:
graphs = create_all_graphs(df, features)

# create_all_graphs skips every group that raises, so the list can come back
# empty. Fail with an explicit message instead of an IndexError on graphs[0].
if not graphs:
    raise RuntimeError(
        "No graphs were created; see the per-group errors printed above."
    )

# Verify first graph
first_graph = graphs[0]
print(first_graph)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Number of edges: 38

Graph Structure Verification:
Number of nodes: 20
Number of edges: 38
Feature shape: torch.Size([20, 1])
Edge index shape: torch.Size([2, 38])

MST Statistics:
Number of nodes: 20
Number of edges: 38

Graph Structure Verification:
Number of nodes: 20
Number of edges: 38
Feature shape: torch.Size([20, 1])
Edge index shape: torch.Size([2, 38])

MST Statistics:
Number of nodes: 20
Number of edges: 38

Graph Structure Verification:
Number of nodes: 20
Number of edges: 38
Feature shape: torch.Size([20, 1])
Edge index shape: torch.Size([2, 38])

MST Statistics:
Number of nodes: 20
Number of edges: 38

Graph Structure Verification:
Number of nodes: 20
Number of edges: 38
Feature shape: torch.Size([20, 1])
Edge index shape: torch.Size([2, 38])

MST Statistics:
Number of nodes: 20
Number of edges: 38

Graph Structure Verification:
Number of nodes: 20
Number of edges: 38
Feature shape: torch.Size([20, 1])
Edge 

In [26]:
# Persist the full list of graph objects to Drive as a single file.
# NOTE(review): the list holds torch_geometric Data objects, so loading it back
# presumably requires torch_geometric to be importable — confirm in the consuming notebook.
torch.save(graphs, '/content/drive/MyDrive/datasets/graphs.pt')