In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from Scripts.Configs.ConfigClass import Config
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
import networkx as nx
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.dataset import random_split, T_co
from torch_geometric.data.lightning import LightningDataset
import pdb
import lightning as L
import time
from Scripts.DataManager.GraphConstructor.CoOccurrenceGraphConstructor import CoOccurrenceGraphConstructor
from Scripts.DataManager.GraphLoader.GLabeledGraphLoader import GLabeledGraphLoader

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
config = Config(r'C:\Users\fardin\Projects\ColorIntelligence\Scripts\Configs\Config.json')



In [2]:
train_df = pd.read_csv(r'C:\Users\fardin\Projects\ColorIntelligence\data\Amazon-Review\train_sm.csv')
test_df = pd.read_csv(r'C:\Users\fardin\Projects\ColorIntelligence\data\Amazon-Review\test_sm.csv')
train_df.columns = ['Polarity', 'Title', 'Review']
test_df.columns = ['Polarity', 'Title', 'Review']
train_df = train_df[['Polarity', 'Review']]
test_df = test_df[['Polarity', 'Review']]

In [3]:
config = Config(r'C:\Users\fardin\Projects\ColorIntelligence\Scripts\Configs\Config.json')

In [4]:
start_time = time.time()
graph_const = CoOccurrenceGraphConstructor(train_df['Review'], 'AmazonReview', config, lazy_construction=True,  load_preprocessed_data=True, naming_prepend='graph')
print(f'execution time in second: {time.time() - start_time}')
# graph_const = CoOccurrenceGraphConstructor(train_df['Review'][:10], 'AmazonReview', config, lazy_construction=False, naming_prepend='graph', load_preprocessed_data=False)
# graph = graph_const.to_graph(train_df['Review'][0])
# graph_const.draw_graph(0)

execution time in second: 0.008000612258911133


In [37]:
labels = torch.tensor(test_df['Polarity'].apply(lambda p: 0 if p==1 else 1), dtype=torch.float32).view(-1,1)
graph_loader = GLabeledGraphLoader(graph_const, labels[:10],2, 'cpu', val_size=0.2)

In [38]:
from typing import Any

import torch
import torch.nn.functional as F
from pytorch_lightning.utilities.types import OptimizerLRScheduler, STEP_OUTPUT

from torch_geometric.nn import summary
from tqdm import tqdm

from Scripts.Models.ModelsManager.ModelManager import ModelManager
from Scripts.Models.ClassifierModels.GATGCNClassifierSimple import GNNClassifier
from Scripts.DataManager.GraphLoader.NLabeledGraphLoader import NLabeledGraphLoader
from Scripts.Utils.enums import Optimizer, LossType

import lightning as L


from torch_geometric.nn import GATv2Conv, GCNConv, GCN2Conv, DenseGCNConv
from torch_geometric.nn.dense.diff_pool import dense_diff_pool
from torch_geometric.nn import Sequential as GSequential
from torch import nn
from torch_geometric.nn.dense.diff_pool import dense_diff_pool
from torch_geometric.data import batch
from torch_geometric.utils import to_dense_adj

In [39]:

from torch.nn import Linear


class GraphAutoEncoderModel(nn.Module):

    def __init__(self, input_feature: int, out_features: int, dropout=0.1, *args, **kwargs):
        super(GraphAutoEncoderModel, self).__init__(*args, **kwargs)
        self.input_features = input_feature
        self.num_out_features = out_features
        self.encoder = GSequential('x, edge_index, edge_weights', [
            (GCNConv(input_feature, 256), 'x, edge_index, edge_weights ->x1'),
            (nn.ReLU(), 'x1->x1'),
            (GCNConv(256, 128), 'x1, edge_index, edge_weights -> x2'),
            (nn.ReLU(), 'x2->x2'),
            (GCNConv(128, 64), 'x2, edge_index, edge_weights -> x3'),
            (nn.ReLU(), 'x3->x3'),
            (GCNConv(64, 32), 'x3, edge_index, edge_weights -> x3'),
            (nn.ReLU(), 'x3->x3'),
            (GATv2Conv(32, 32, 4, dropout=dropout), 'x3, edge_index ->x3'),
            # (GATv2Conv(128, 64, 2, dropout=dropout), 'x2, edge_index->x2'),
            (nn.ReLU(), 'x3->x3'),
            (GCN2Conv(128, 0.5, 0.1, 2), 'x3, x2, edge_index, edge_weights->x3'),
            (nn.ReLU(), 'x3->x3'),
            (GCNConv(128, 256), 'x3, edge_index->x3'),
            (nn.ReLU(), 'x3->x3'),
            (GCN2Conv(256, 0.5, 0.1, 2), 'x3, x1, edge_index, edge_weights->x3'),
            (nn.ReLU(), 'x3->x3')
        ])

        self.pooling_layer1 = GCNConv(256, 5)
        self.pooling_layer2 = DenseGCNConv(256, 1)

        # self.output_layer = GCNConv(256, self.num_out_features)
        self.output_layer = Linear(256, self.num_out_features)

    def forward(self, X):
        ci = torch.tensor([X[i].x.shape[0] for i in range(len(X))], dtype=torch.int).cumsum(0, dtype=torch.int)
        x1 = self.encoder(X.x, X.edge_index, X.edge_attr)
        x2 = [x1[0 if i==0 else ci[i - 1]:ci[i]] for i in range(len(ci))]
        x3 = torch.zeros((len(x2), 256), dtype = x1.dtype, device = x1.device)
        for i in range(len(ci)):
            s = self.pooling_layer1(x2[i], X[i].edge_index, X[i].edge_attr)
            # adj = to_dense_adj(edge_index=data_batch[i].edge_index, edge_attr=data_batch[i].edge_attr)
            adj = torch.sparse_coo_tensor(X[i].edge_index, X[i].edge_attr).to_dense()
            nodes, adj, _, _ = dense_diff_pool(x2[i], adj, s=s)
            s = self.pooling_layer2(nodes, adj)
            nodes, _, _, _ = dense_diff_pool(nodes, adj, s=s)
            x3[i] = torch.squeeze(nodes)

        return self.output_layer(x3)

In [40]:
train_data_loader = graph_loader.get_train_data()
X, y = next(iter(train_data_loader))

In [41]:
autoencoder_model = GraphAutoEncoderModel(300, 1)
autoencoder_model(X)

Exception ignored in: <function _ConnectionBase.__del__ at 0x000001E2199755A0>
Traceback (most recent call last):
  File "C:\Users\fardin\AppData\Local\Programs\Python\Python310\lib\multiprocessing\connection.py", line 132, in __del__
    self._close()
  File "C:\Users\fardin\AppData\Local\Programs\Python\Python310\lib\multiprocessing\connection.py", line 277, in _close
    _CloseHandle(self._handle)
OSError: [WinError 6] The handle is invalid


tensor([[ 5.6575],
        [13.2368]], grad_fn=<AddmmBackward0>)

In [44]:
from typing import Literal
import torchmetrics


class LightningModel(L.LightningModule):

    def __init__(self, model, optimizer, loss_func):
        super(LightningModel, self).__init__()
        self.optimizer = optimizer
        self.model = model
        self.loss_func = loss_func
        if model.num_out_features > 2:
            self.train_acc = torchmetrics.Accuracy(task="multiclass")
            self.val_acc = torchmetrics.Accuracy(task="multiclass")
            self.test_acc = torchmetrics.Accuracy(task="multiclass")
        else:
            self.train_acc = torchmetrics.Accuracy(task="binary")
            self.val_acc = torchmetrics.Accuracy(task="binary")
            self.test_acc = torchmetrics.Accuracy(task="binary")


    def forward(self, data_batch, *args, **kwargs):
        return self.model(data_batch)

    def training_step(self, data_batch, *args, **kwargs) :
        data, labels = data_batch
        logits = self(data)
        loss = self.loss_func(logits, labels.view(logits.shape))
        self.log('training_loss', loss)

        predicted_labels = logits if logits.shape[1] < 2 else torch.argmax(logits, dim=1)
        self.train_acc(predicted_labels, labels.view(predicted_labels.shape))
        self.log('training_acc', self.train_acc, prog_bar=True, on_epoch=True, on_step=False)

        return loss

    def validation_step(self, data_batch, *args, **kwargs):
        data, labels = data_batch
        logits = self(data)
        loss = self.loss_func(logits, labels.view(logits.shape))
        self.log('val_loss', loss)

        predicted_labels = logits if logits.shape[1] < 2 else torch.argmax(logits, dim=1)
        self.val_acc(predicted_labels, labels.view(predicted_labels.shape))
        self.log('val_acc', self.val_acc, prog_bar=True, on_epoch=True, on_step=False)


    # def test_step(self, data_batch, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
    #     data, labels = data_batch
    #     pred_labels = self(data)
    #     loss = self.loss_func(pred_labels, labels)
    #     self.log('test_loss', loss)

    def predict_step(self, data_batch, *args: Any, **kwargs: Any) -> Any:
        data, labels = data_batch
        return self(data)

    def configure_optimizers(self) -> OptimizerLRScheduler:
        return self.optimizer

In [45]:
autoencoder_model = GraphAutoEncoderModel(300, 1)
lightning_model = LightningModel(autoencoder_model,
                                 torch.optim.Adam(autoencoder_model.parameters(), lr=0.001, weight_decay=0.005), nn.BCEWithLogitsLoss())

In [47]:
trainer = L.Trainer(max_epochs=10, accelerator='gpu', devices=1, num_sanity_val_steps=0)
trainer.fit(lightning_model, graph_loader.get_train_data(), graph_loader.get_val_data())

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                  | Params
----------------------------------------------------
0 | model     | GraphAutoEncoderModel | 245 K 
1 | loss_func | BCEWithLogitsLoss     | 0     
2 | train_acc | BinaryAccuracy        | 0     
3 | val_acc   | BinaryAccuracy        | 0     
4 | test_acc  | BinaryAccuracy        | 0     
----------------------------------------------------
245 K     Trainable params
0         Non-trainable pa

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
trainer.

In [29]:
train_data = graph_loader.get_train_data()

In [54]:
X, y = next(iter(train_data))
module_0 = GCNConv(300, 256)
output = module_0(X.x, X.edge_index, X.edge_attr)

In [57]:
output2 = [output[:X[0].x.shape[0]], output[X[0].x.shape[0]:X[1].x.shape[0]], output[X[1].x.shape[0]:X[2].x.shape[0]]]

In [140]:
import torch_geometric

In [142]:
print(X is torch_geometric.data.batch.Batch)

False


In [70]:
gcn_conv = GCNConv(256, 64)

In [95]:
# s = gcn_conv(output2[0], X[0].edge_index)
# adj = torch.sparse_coo_tensor(X[0].edge_index, X[0].edge_attr).to_dense()
# output3 = dense_diff_pool(output2[0], adj, s=s)