# Setup for GNN Training

In [1]:
# mount to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# move to the working folder
FOLDERNAME = 'network_processing'
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/network_processing


In [3]:
# # install torch_geometric if necessary
# !pip install torch_geometric

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tqdm import tqdm
import re
import random

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

import networkx as nx
import matplotlib.pyplot as plt

In [5]:
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print('Device', device)

Device cpu


# Build GNN Model

In [6]:
# build the GAT model
class GAT(nn.Module):
    def __init__(self, input_dim, gat_hidden_dim, output_dim, num_gat_layers, num_heads):
        super(GAT, self).__init__()
        self.gat_layers = nn.ModuleList([
            GATConv(input_dim if i == 0 else gat_hidden_dim * num_heads, gat_hidden_dim, heads=num_heads, dropout=0.2)
            for i in range(num_gat_layers)
        ])
        self.predict = nn.Linear(gat_hidden_dim * num_heads, output_dim)
        self.edge_weight = None
        self.edge = None

    def forward(self, x, edge_index):
        self.edge = edge_index
        for gat in self.gat_layers:
            x, weight = gat(x, edge_index, return_attention_weights=True)
            self.edge_weight = weight[1]
            self.edge = weight[0]
            x = F.relu(x)
        x = self.predict(x)
        return x

    def get_edge_weight(self):
        return self.edge_weight

    def get_edge(self):
        return self.edge

# Data Preprocessing

In [7]:
# load the company relationship data
file_path = './Company_Relationship.xlsx'
data = pd.read_excel(file_path, sheet_name='Total_Network', header=None)

In [8]:
# process the company relationship data
edges = []
for i in range(data.shape[0]):
    for row_index, j in enumerate(data.iloc[i, 3:].dropna().to_list()):
        numbers = re.findall(r'-?\d+', j)
        if len(numbers) < 2:
            continue
        start, end = int(numbers[0]), int(numbers[1])
        edges.append((start, end))

edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

In [9]:
# load the output data from imaging processing and extract features
output_path = './ResNet_output_vectors_training.csv'
df_image = pd.read_csv(output_path)

stocks = df_image['stock']
dates = df_image['date']
vectors = df_image['vector'].apply(lambda x: np.fromstring(x.strip('[]'), sep=',')).tolist()

In [10]:
# Load the macroeconomic data
macro_path = './normalized_macro.csv'
df_macro = pd.read_csv(macro_path)

# Ensure the date formats are consistent
df_macro['Date'] = pd.to_datetime(df_macro['Date'])
df_image['date'] = pd.to_datetime(df_image['date'])

# Merge the ResNet data and macroeconomic data on date
merged_df = pd.merge(df_image, df_macro, left_on='date', right_on='Date', how='inner')

# Drop the 'Date' column from macro data as it's redundant after merging
merged_df = merged_df.drop(columns=['Date'])

# Extract combined vectors without titles
vectors_combined = []
for idx, row in merged_df.iterrows():
    resnet_vector = np.fromstring(row['vector'].strip('[]'), sep=',')
    macro_vector = row[['USD/TWD', 'VIX', ' Crude Oil', 'Gold', 'CPI', 'Unemployment Rate', 'Interest Rate', 'M2']].values.astype(float)
    combined_vector = np.concatenate([resnet_vector, macro_vector])
    vectors_combined.append(combined_vector)

In [11]:
# Create a DataFrame for the combined data
combined_df = pd.DataFrame({
    'stock': merged_df['stock'],
    'date': merged_df['date'],
    'vector': vectors_combined
})

# Convert vectors to the required string format
combined_df['vector'] = combined_df['vector'].apply(lambda x: ','.join(map(str, x)))

# Prepare the graph data
node_features = torch.tensor(vectors_combined, dtype=torch.float)
data = Data(x=node_features, edge_index=edge_index)

  node_features = torch.tensor(vectors_combined, dtype=torch.float)


In [12]:
print(combined_df)

      stock       date                                             vector
0      2912 2017-02-10  -0.4871345,1.0913699,-1.3290397,0.2241055,0.25...
1      3008 2017-02-10  -0.29821175,0.1569825,-0.2565347,0.25301692,0....
2      3045 2017-02-10  -0.40963504,2.0191028,-2.5389774,1.0132645,0.6...
3      3481 2017-02-10  -0.35229602,0.007964615,-0.03224269,-0.0162423...
4      3711 2017-02-10  -0.3067895,0.4327159,-0.3387887,0.09546754,0.1...
...     ...        ...                                                ...
7504   1101 2017-01-17  -0.42047712,0.9486824,-1.0586202,0.5972066,0.5...
7505   1102 2017-01-17  -0.6562908,2.2380335,-2.0883741,1.2907951,0.72...
7506   1216 2017-01-17  -0.44098958,0.27151826,-0.04504534,-0.24088496...
7507   1301 2017-01-17  -0.4780133,1.2899575,-1.0223458,0.851885,0.473...
7508   1303 2017-01-17  -0.13169901,0.48342395,-0.64879555,0.4947278,0...

[7509 rows x 3 columns]


# Training

In [13]:
# initialize the GAT model
input_dim = node_features.size(1)
gat_hidden_dim = 10
output_dim = 1
num_gat_layers = 2
num_heads = 2
model = GAT(input_dim, gat_hidden_dim, output_dim, num_gat_layers, num_heads)

In [14]:
# moce data and model to device
data = data.to(device)
model = model.to(device)

In [15]:
# constants for model training
NUM_EPOCHS = 100
PRINT_EVERY = 10

In [16]:
# define loss and optimizer
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [17]:
# label for testing accuracy
label = torch.randint(0, 2, (node_features.size(0),), dtype=torch.float).unsqueeze(1)  # dummy labels for test

In [18]:
# training loop
model.train()
for epoch in range(NUM_EPOCHS):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_function(out, label)
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        predictions = torch.sigmoid(out)
        predicted_labels = (predictions > 0.5).float()
        correct = (predicted_labels == label).sum().item()
        total = label.size(0)
        accuracy = correct / total

    if (epoch + 1) % PRINT_EVERY == 0:
        print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {loss.item()}, Training Accuracy: {accuracy * 100:.2f}%')

print('Training completed.')

Epoch 10/100, Loss: 0.6932531595230103, Training Accuracy: 50.01%
Epoch 20/100, Loss: 0.6924663186073303, Training Accuracy: 51.21%
Epoch 30/100, Loss: 0.691675066947937, Training Accuracy: 52.18%
Epoch 40/100, Loss: 0.6917874217033386, Training Accuracy: 51.54%
Epoch 50/100, Loss: 0.6904712319374084, Training Accuracy: 52.90%
Epoch 60/100, Loss: 0.6900879144668579, Training Accuracy: 53.23%
Epoch 70/100, Loss: 0.689365804195404, Training Accuracy: 52.72%
Epoch 80/100, Loss: 0.6897796988487244, Training Accuracy: 53.55%
Epoch 90/100, Loss: 0.6877066493034363, Training Accuracy: 53.56%
Epoch 100/100, Loss: 0.6856527924537659, Training Accuracy: 53.91%
Training completed.


In [19]:
# Inference to obtain final vectors
model.eval()
with torch.no_grad():
    final_vectors = model(data.x, data.edge_index)

# Convert predictions to a single vector per stock
final_vectors = final_vectors.squeeze().cpu().numpy()

# Adjust the length of stocks and dates to match final_vectors
original_length = len(final_vectors)

# Filter stocks and dates to match the number of final_vectors
stocks_filtered = stocks[:original_length]
dates_filtered = dates[:original_length]

# prepare the output data for CSV
output_data = {
    "stock": stocks_filtered,
    "date": dates_filtered,
    "vector": final_vectors
}

output_df = pd.DataFrame(output_data)
output_df.to_csv('training_results.csv', index=True)
print("Results saved to 'training_results.csv'")

Results saved to 'training_results.csv'
