#### Omid Davar @2023

In [1]:
import pandas as pd
import spacy
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import lightning as pl
from torch.utils.data import Dataset, DataLoader
from spacy.lang.en import English
from tqdm import tqdm
import torchmetrics

nlp = spacy.load("en_core_web_lg")

class MulticlassTextDataset(Dataset):
        
    def __init__(self , load_embeddings = True):
        self.load_embeddings = load_embeddings
        self.end_data_load = -1
        self.start_data_load = 0
        self.train_df = pd.read_csv(r'E:\Darsi\Payan Name Arshad\Second Work\ColorIntelligence2\ColorIntelligence\data\AG\train.csv')
        self.test_df = pd.read_csv(r'E:\Darsi\Payan Name Arshad\Second Work\ColorIntelligence2\ColorIntelligence\data\AG\test.csv')
        self.train_df.columns = ['Class', 'Title', 'Description']
        self.test_df.columns = ['Class', 'Title', 'Description']
        self.train_df['Description'] = self.train_df['Title'].astype(str) + ' ' +  self.train_df['Description'].astype(str)
        self.test_df['Description'] = self.test_df['Title'].astype(str) + ' ' +  self.test_df['Description'].astype(str)
        self.train_df = self.train_df[['Class', 'Description']]
        self.test_df = self.test_df[['Class', 'Description']]
        self.data = pd.concat([self.train_df, self.test_df])
        self.end_data_load = self.end_data_load if self.end_data_load>0 else self.data.shape[0]
        self.end_data_load = self.end_data_load if self.end_data_load < self.data.shape[0] else self.data.shape[0] 
        self.data = self.data.iloc[:self.end_data_load]
        self.data.index = np.arange(0, self.end_data_load)
        # activate one line below
        self.data['Class'] -= 1
        labels = self.data['Class'][self.start_data_load:self.end_data_load]
        labels = labels.to_numpy()
        labels = torch.from_numpy(labels)
        
        self.labels = torch.squeeze(labels.to(torch.float32).view(-1, 1).to('cpu') , 1)
        if self.load_embeddings:
            self.embeddings_tensor =  torch.tensor(torch.load(r'E:\Darsi\Payan Name Arshad\Second Work\ColorIntelligence2\ColorIntelligence\data\AG\embeddings.pt'))
        else:
            embeddings = []
            for text in tqdm(self.data['Description'] , 'Creating Embeddings ...'):
                doc = nlp(text)
                embedding = doc.vector
                embeddings.append(embedding)
            torch.save(embeddings , r'E:\Darsi\Payan Name Arshad\Second Work\ColorIntelligence2\ColorIntelligence\data\AG\embeddings.pt')
            self.data['embedding'] = embeddings
            self.embeddings_tensor = embeddings
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return [self.embeddings_tensor[idx], self.labels[idx]]


class MulticlassSpacyClassifierModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MulticlassSpacyClassifierModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x


class MulticlassSpacyTextClassifier(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim , batch_size=256):
        super(MulticlassSpacyTextClassifier, self).__init__()
        self.model = MulticlassSpacyClassifierModel(input_dim, hidden_dim, output_dim)
        self.loss_fn = nn.CrossEntropyLoss()
        self.batch_size = batch_size
        self.train_acc = torchmetrics.Accuracy(task="multiclass" , num_classes=output_dim)
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=output_dim)
        self.lr = 1e-3

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x = batch[0]
        y = batch[1]
        y_pred = self.model(x)
        
        y_pred = y_pred.float()
        y = y.long()

        loss = self.loss_fn(y_pred, y)
        self.train_acc(torch.squeeze(torch.argmax(y_pred, dim=1)), y)
        self.log('train_acc', self.train_acc, prog_bar=True, on_epoch=True, on_step=True, batch_size=self.batch_size)
        return loss

    def validation_step(self, batch, *args, **kwargs):
        x = batch[0]
        y = batch[1]
        y_pred = self.model(x)
        # Cast the model outputs and target labels to the appropriate data types
        y_pred = y_pred.float()  # Ensure y_pred is of type torch.float
        y = y.long()  # Ensure y is of type torch.long
        self.val_acc(torch.squeeze(torch.argmax(y_pred, dim=1)), y)
        self.log('val_acc', self.val_acc, prog_bar=True, on_epoch=True, on_step=True, batch_size=self.batch_size)

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.lr)


input_dim = 300  # SpaCy embeddings dimension
hidden_dim = 100
output_dim = 4 
trainer = pl.Trainer(max_epochs=70)


multiclass_dataset = MulticlassTextDataset()



train_size = int(0.8 * len(multiclass_dataset))
val_size = int(0.15 * len(multiclass_dataset))
test_size = len(multiclass_dataset) - (train_size + val_size)
train_dataset, val_dataset , test_dataset = torch.utils.data.random_split(multiclass_dataset, [train_size, val_size,test_size])


train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=256 , shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=256 , shuffle=False)


multiclass_model = MulticlassSpacyTextClassifier(input_dim, hidden_dim, output_dim , batch_size=256)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\Omid\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
  self.embeddings_tensor =  torch.tensor(torch.load(r'E:\Darsi\Payan Name Arshad\Second Work\ColorIntelligence2\ColorIntelligence\data\AG\embeddings.pt'))


In [2]:
import torch
from Scripts.Models.LightningModels.LightningModels import BaseLightningModel
from Scripts.Models.ModelsManager.ModelManager import ModelManager
import pandas as pd
import matplotlib.pyplot as plt
from typing import List
from torch_geometric.nn import summary
from lightning.pytorch.callbacks import Callback, ModelCheckpoint, EarlyStopping
from os import path

from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, hinge_loss

class ClassifierModelManager(ModelManager):

    def __init__(self,
                 torch_model: torch.nn.Module,
                 lightning_model: BaseLightningModel,
                 model_save_dir: str = '~/Desktop',
                 log_dir: str = 'logs/',
                 log_name: str = 'model_logs',
                 device='cpu',
                 num_train_epoch = 100):
        super(ClassifierModelManager, self).__init__(torch_model, lightning_model, model_save_dir, log_dir, log_name, device, num_train_epoch)

    def _create_callbacks(self) -> List[Callback]:
        return [
            ModelCheckpoint(save_top_k=2, mode='max', monitor='val_acc', save_last=True),
            # EarlyStopping(patience=50, mode='max', monitor='val_acc')
        ]

    def draw_summary(self, dataloader):
        X, y = next(iter(dataloader))
        print(summary(self.torch_model, X.to(self.device)))

    def plot_csv_logger(self, loss_names=['train_loss', 'val_loss'], eval_names=['train_acc', 'val_acc']):
        csv_path = path.join(self.log_dir, self.log_name, f'version_{self.logger.version}', 'metrics.csv')
        metrics = pd.read_csv(csv_path)

        aggregation_metrics = []
        agg_col = 'epoch'
        for i, dfg in metrics.groupby(agg_col):
            agg = dict(dfg.mean())
            agg[agg_col] = i
            aggregation_metrics.append(agg)

        df_metrics = pd.DataFrame(aggregation_metrics)
        df_metrics[loss_names].plot(grid=True, legend=True, xlabel='Epoch', ylabel='loss')
        df_metrics[eval_names].plot(grid=True, legend=True, xlabel='Epoch', ylabel='accuracy')
        plt.show()

    def save_plot_csv_logger(self, loss_names=['train_loss', 'val_loss'], eval_names=['train_acc', 'val_acc'], name_prepend: str=""):
        csv_path = path.join(self.log_dir, self.log_name, f'version_{self.logger.version}', 'metrics.csv')
        metrics = pd.read_csv(csv_path)

        aggregation_metrics = []
        agg_col = 'epoch'
        for i, dfg in metrics.groupby(agg_col):
            agg = dict(dfg.mean())
            agg[agg_col] = i
            aggregation_metrics.append(agg)

        df_metrics = pd.DataFrame(aggregation_metrics)
        # df_metrics[loss_names].plot(grid=True, legend=True, xlabel='Epoch', ylabel='loss')
        
        # loss_png = path.join(self.log_dir, self.log_name, f'version_{self.logger.version}', f'{name_prepend}_loss_metric.png')
        # plt.savefig(loss_png)
        
        df_metrics[eval_names].plot(grid=True, legend=True, xlabel='Epoch', ylabel='accuracy')
        
        acc_png = path.join(self.log_dir, self.log_name, f'version_{self.logger.version}', f'{name_prepend}_acc_metric.png')
        plt.savefig(acc_png)
        
        plt.close()
    
    def evaluate(self, eval_dataloader,
                 give_confusion_matrix: bool=True, 
                 give_report: bool=True, 
                 give_f1_score: bool=False, 
                 give_accuracy_score: bool=False, 
                 give_precision_score: bool=False, 
                 give_recall_score: bool=False, 
                 give_hinge_loss: bool=False):
        y_true = []
        y_pred = []
        self.lightning_model.eval()
        for X, y in eval_dataloader:
            y_p = self.torch_model(X.to(self.device))
            if type(y_p) is tuple:
                y_p = y_p[0]
            y_pred.append((y_p>0).to(torch.int32).detach().to(y.device))
            y_true.append(y.to(torch.int32))
        y_true = torch.concat(y_true)
        y_pred = torch.concat(y_pred)
        if(give_confusion_matrix):
            print(f'confusion_matrix: \n{confusion_matrix(y_true, y_pred)}')
        if(give_report):
            print(classification_report(y_true, y_pred))
        if(give_f1_score):
            print(f'f1_score: {f1_score(y_true, y_pred)}')
        if(give_accuracy_score):
            print(f'accuracy_score: {accuracy_score(y_true, y_pred)}')
        if(give_precision_score):
            print(f'precision_score: {precision_score(y_true, y_pred)}')
        if(give_recall_score):
            print(f'recall_score: {recall_score(y_true, y_pred)}')
        if(give_hinge_loss):
            print(f'hinge_loss: {hinge_loss(y_true, y_pred)}')
                
    
    def save_evaluation(self, eval_dataloader, name_prepend: str='',
                    give_confusion_matrix: bool=True, 
                    give_report: bool=True, 
                    give_f1_score: bool=False, 
                    give_accuracy_score: bool=False, 
                    give_precision_score: bool=False, 
                    give_recall_score: bool=False, 
                    give_hinge_loss: bool=False,
                    multi_class: bool=True
                    ):
            
            test_metrics_path = path.join(self.log_dir, self.log_name, f'version_{self.logger.version}', f'{name_prepend}_test_metrics.txt')
            
            y_true = []
            y_pred = []
            self.lightning_model.eval()
            self.lightning_model.model.eval()
            self.torch_model.eval()
            for X, y in eval_dataloader:
                self.trainer.model.eval()
                with torch.no_grad():
                    y_p = self.trainer.model(X.to(self.device))
                if type(y_p) is tuple:
                    y_p = y_p[0]
                
                if multi_class:
                    y_pred.append(y_p.detach().to(y.device))
                    y_true.append(y)
                else:
                    y_pred.append((y_p>0).to(torch.int32).detach().to(y.device))
                    y_true.append(y.to(torch.int32))
            y_true = torch.concat(y_true)
            y_pred = torch.concat(y_pred)
            if multi_class:
                y_true_num = y_true
                y_pred_num = torch.argmax(y_pred, dim=1)
            else:
                y_true_num = y_true
                y_pred_num = y_pred
            with open(test_metrics_path, 'at+') as f:
                if(give_confusion_matrix):
                    print(f'confusion_matrix: \n{confusion_matrix(y_true_num, y_pred_num)}', file=f)
                if(give_report):
                    print(classification_report(y_true_num, y_pred_num), file=f)
                if(give_f1_score):
                    if multi_class:
                        print(f'f1_score: {f1_score(y_true_num, y_pred_num, average=None)}', file=f)
                    else:
                        print(f'f1_score: {f1_score(y_true_num, y_pred_num)}', file=f)
                if(give_accuracy_score):
                    print(f'accuracy_score: {accuracy_score(y_true_num, y_pred_num)}', file=f)
                if(give_precision_score):
                    if multi_class:
                        print(f'f1_score: {precision_score(y_true_num, y_pred_num, average=None)}', file=f)
                    else:
                        print(f'f1_score: {precision_score(y_true_num, y_pred_num)}', file=f)
                if(give_recall_score):
                    if multi_class:
                        print(f'f1_score: {recall_score(y_true_num, y_pred_num, average=None)}', file=f)
                    else:
                        print(f'f1_score: {recall_score(y_true_num, y_pred_num)}', file=f)
                if(give_hinge_loss):
                    print(f'hinge_loss: {hinge_loss(y_true_num, y_pred)}', file=f)

In [6]:
model_manager = ClassifierModelManager(multiclass_model.model, multiclass_model, log_name='hetero_model_11', model_save_dir=r'E:\Darsi\Payan Name Arshad\Second Work\ColorIntelligence2\ColorIntelligence\Practices\Tasks\HeterogeneousGraphs\hetero_model_7',device='cpu', num_train_epoch=20)
model_manager.fit(train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, ckpt_path="E:\\Darsi\\Payan Name Arshad\\Second Work\\ColorIntelligence2\\ColorIntelligence\\logs\\hetero_model_11\\version_0\\checkpoints\\epoch=20-step=8379.ckpt")
# model_manager.fit(train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\Omid\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Restoring states from the checkpoint path at E:\Darsi\Payan Name Arshad\Second Work\ColorIntelligence2\ColorIntelligence\logs\hetero_model_11\version_0\checkpoints\epoch=20-step=8379.ckpt
c:\Users\Omid\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:345: The dirpath has changed from 'logs/hetero_model_11\\version_0\\checkpoints' to 'logs/hetero_model_11\\version_1\\checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.

  | Name      | Type                           | Params
-------------------------------------------------------------
0 | model     | MulticlassSpacyClassifierModel | 30.5 K
1 | loss_fn   | CrossEntrop

In [7]:
model_manager.save_plot_csv_logger(loss_names=['train_loss', 'val_loss'], eval_names=['train_acc_epoch', 'val_acc_epoch'], name_prepend='tests_baseline')
model_manager.torch_model = model_manager.torch_model.to('cpu')
model_manager.save_evaluation(test_dataloader, 'baseline',True, True, True, True, True, True, True)

In [5]:
test_size

6381