In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 7.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 26.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [2]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 6.8 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [3]:
# Colab only: mount Google Drive at /content/drive so files stored there
# can be accessed through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
# Absolute path under the Drive mount point (/content/drive): unlike the
# relative form, it still resolves when this cell is re-run after the
# working directory has already been changed.
os.chdir('/content/drive/MyDrive/William_2022/DATASETS/trans_encoder_2')

In [None]:
# Run the stand-alone DCCA.py script from the current (project) directory in a subprocess.
# NOTE(review): this cell has no execution count in the saved notebook; presumably the
# script produces the checkpoint that is loaded further down — confirm.
!python3 DCCA.py

# MultiModal

In [5]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from model_new import Transformer, Transformer2
from config import *
from imblearn.over_sampling import RandomOverSampler
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from tqdm import tqdm
from optim_new import ScheduledOptim
from torch.optim import Adam
from sklearn.metrics import confusion_matrix
from CCA import DeepCCA, cca_loss

In [6]:
# Shared tokenizer used by every TextDataset below.
# PRE_TRAINED_MODEL_NAME is not defined in this notebook; presumably it comes
# from the `from config import *` star import — confirm.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

class EEGDataset(Dataset):
    """Map-style dataset that pairs each EEG feature row with its class label."""

    def __init__(self, signal, label):
        # Signals are held as a float tensor, labels as a long tensor.
        self._signal = torch.FloatTensor(signal)
        self._label = torch.LongTensor(label)

    def __len__(self):
        return self.n_insts

    def __getitem__(self, idx):
        sample = self._signal[idx]
        target = self._label[idx]
        return sample, target

    @property
    def n_insts(self):
        """Number of samples, measured on the label tensor."""
        return len(self._label)

    @property
    def sig_len(self):
        """Size of the signal tensor along its second axis (values per sample)."""
        return self._signal.size(1)

class TextDataset(Dataset):
  """Dataset that tokenizes one text per item and pairs it with its label.

  Args:
    texts: indexable collection of texts; every item is passed through ``str()``.
    labels: indexable collection of integer class labels.
    tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
    max_len: length every encoded sequence is padded / truncated to.
  """

  def __init__(self, texts, labels, tokenizer, max_len):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  @property
  def n_insts(self):
    """Number of samples (length of ``labels``)."""
    return len(self.labels)

  @property
  def text_len(self):
    """Feature length reported to the model.

    NOTE(review): hard-coded to 32 instead of being derived from ``max_len``;
    presumably MAX_LEN equals 32 — confirm before changing either value.
    """
    return 32

  def __len__(self):
    return self.n_insts

  def __getitem__(self, item):
    """Return ``(input_ids as a 1-D float tensor, label as a long tensor)``."""
    # NOTE(review): this notebook passes (n, 1)-shaped arrays (column slice
    # ``[:, 1:]``) as ``texts``, so ``texts[item]`` is a length-1 array and
    # ``str()`` yields its bracketed repr (e.g. "['some words']"). Left as-is
    # because the saved checkpoint was presumably trained on that input — confirm.
    text = str(self.texts[item])
    label = self.labels[item]

    # ``pad_to_max_length`` is deprecated; ``padding='max_length'`` is its
    # replacement. ``truncation=True`` makes explicit the truncation the
    # tokenizer was already applying (with a warning) when only ``max_length``
    # was given.
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
    )
    # Token ids are deliberately returned as floats: the batch is consumed as a
    # float feature vector, not as embedding indices.
    return torch.FloatTensor(encoding['input_ids']).flatten(), torch.tensor(label, dtype=torch.long)



Downloading:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [7]:
# --- Preprocess
# Read the full table and split it 70 / 15 / 15 into train / validation / test
# (30 % held out first, then that hold-out is halved), always with seed 2.
df = pd.read_csv('df.csv')

y = df[[emotion]]
X = df.drop(columns=[emotion])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=2)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=2)

# Re-attach the label column to each split's features.
df_train = pd.concat([X_train, y_train], axis=1)
df_val = pd.concat([X_val, y_val], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# Per-modality views: the text view keeps the label plus 'new_words',
# the EEG view keeps the columns selected by `eeg`.
df_train_text, df_train_eeg = df_train[[emotion, 'new_words']], df_train[eeg]
df_val_text, df_val_eeg = df_val[[emotion, 'new_words']], df_val[eeg]
df_test_text, df_test_eeg = df_test[[emotion, 'new_words']], df_test[eeg]

# --- Save CSV
# Each split is written without header or index and then read back, which
# turns the six frames into bare NumPy arrays under the same names.
_split_files = {
    'df_train_text.csv': df_train_text,
    'df_train_eeg.csv': df_train_eeg,
    'df_val_text.csv': df_val_text,
    'df_val_eeg.csv': df_val_eeg,
    'df_test_text.csv': df_test_text,
    'df_test_eeg.csv': df_test_eeg,
}
for _path, _frame in _split_files.items():
    _frame.to_csv(_path, header=None, index=False, index_label=False)

# --- Load CSV
# Unpacked in the dict's insertion order, i.e. the same order used for writing.
(df_train_text, df_train_eeg,
 df_val_text, df_val_eeg,
 df_test_text, df_test_eeg) = [
    pd.read_csv(_path, header=None).values for _path in _split_files
]

# --- RandomOverSampling
# Balance the training classes by duplicating minority-class rows.
# In both arrays column 0 is used as the label and the remaining columns as features.
# NOTE(review): the two modalities are oversampled independently. The loaders built
# from them are later iterated in lock-step, so the rows stay paired only if both
# label columns are identical (same seed and same labels should give the same draws)
# — confirm that the first column selected by `eeg` is the emotion label.
# Text

X_text_train = df_train_text[:, 1:]
y_text_train = df_train_text[:, 0]
# The text array mixes labels and strings, so the label column is cast to int explicitly.
y_text_train = y_text_train.astype('int')

ros = RandomOverSampler(random_state=2)
X_resampled_text, y_resampled_text = ros.fit_resample(X_text_train, y_text_train)

# EEG

X_eeg_train = df_train_eeg[:, 1:]
# Unlike the text labels, these are not cast to int here.
y_eeg_train = df_train_eeg[:, 0]

ros = RandomOverSampler(random_state=2)
X_resampled_eeg, y_resampled_eeg = ros.fit_resample(X_eeg_train, y_eeg_train)



# --- Text
def _build_text_dataset(texts, labels):
    """Wrap one split in a TextDataset using the shared tokenizer and MAX_LEN."""
    return TextDataset(texts=texts, labels=labels, tokenizer=tokenizer, max_len=MAX_LEN)


def _build_text_loader(dataset):
    """Loader with the notebook-wide batch size, two workers and default (sequential) order."""
    return DataLoader(dataset=dataset, batch_size=batch_size, num_workers=2)


# Training uses the oversampled arrays; validation / test use column 0 as the
# label and the remaining column(s) as the text.
train_text = _build_text_dataset(X_resampled_text, y_resampled_text)
val_text = _build_text_dataset(df_val_text[:, 1:], df_val_text[:, 0])
test_text = _build_text_dataset(df_test_text[:, 1:], df_test_text[:, 0])

train_loader_text = _build_text_loader(train_text)
valid_loader_text = _build_text_loader(val_text)
test_loader_text = _build_text_loader(test_text)
# --- EEG
def _build_eeg_loader(dataset):
    """Loader with the notebook-wide batch size, two workers and default (sequential) order."""
    return DataLoader(dataset=dataset, batch_size=batch_size, num_workers=2)


# Training uses the oversampled arrays; validation / test use column 0 as the
# label and the remaining columns as the signal.
train_eeg = EEGDataset(signal=X_resampled_eeg, label=y_resampled_eeg)
val_eeg = EEGDataset(signal=df_val_eeg[:, 1:], label=df_val_eeg[:, 0])
test_eeg = EEGDataset(signal=df_test_eeg[:, 1:], label=df_test_eeg[:, 0])

# --- Dataloader EEG
train_loader_eeg = _build_eeg_loader(train_eeg)
valid_loader_eeg = _build_eeg_loader(val_eeg)
test_loader_eeg = _build_eeg_loader(test_eeg)

In [8]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# Text branch. Its input feature size is the text length reported by the dataset;
# the key / value sizes are fixed at 64 here, while the other hyper-parameters are
# not defined in this notebook (presumably they come from `config` — confirm).
model1 = Transformer(device = device, d_feature=train_text.text_len, d_model=d_model, d_inner=d_inner,
                            n_layers=num_layers, n_head=num_heads, d_k=64, d_v=64, dropout=dropout,
                            class_num=class_num)

In [10]:
# EEG branch. Its input feature size is the per-sample signal length of the EEG
# dataset; the key / value sizes are fixed at 64 here, while the other
# hyper-parameters are not defined in this notebook (presumably from `config` — confirm).
model2 = Transformer2(device=device, d_feature=train_eeg.sig_len, d_model=d_model, d_inner=d_inner,
                            n_layers=num_layers, n_head=num_heads, d_k=64, d_v=64, dropout=dropout, class_num=class_num)

In [11]:
# Wrap both branches in nn.DataParallel.
# NOTE(review): the names are rebound to the wrappers, so re-running this cell on
# its own wraps the already-wrapped modules a second time.
model1 = nn.DataParallel(model1)
model2 = nn.DataParallel(model2)

In [12]:
# DeepCCA settings: the output dimension is tied to the number of classes.
# NOTE(review): the exact effect of use_all_singular_values is defined in the
# project's CCA module; presumably False restricts the correlation to the top
# outdim_size values — confirm.
use_all_singular_values = False
outdim_size = class_num

In [13]:
# Combine the text and EEG branches into one DeepCCA module
# (DeepCCA is imported from the project's CCA module).
model = DeepCCA(model1, model2, outdim_size, use_all_singular_values)

In [14]:
# Restore the saved weights; map_location='cpu' makes the stored tensors load on the CPU.
# The state dict is read from the checkpoint's 'model' entry.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
chkpt3 = torch.load('0angry2_trans_baseline_DCCA_transform.chkpt', map_location = 'cpu')
model.load_state_dict(chkpt3['model'])


<All keys matched successfully>

In [17]:
def cal_loss(pred, label, device):
    """Return the CCA loss callable together with simple prediction counters.

    Args:
        pred: 2-D tensor of per-class scores; the argmax over dim 1 is taken
            as the predicted class.
        label: tensor of target class indices, compared element-wise with the
            predicted classes.
        device: device handed to ``cca_loss``.

    Returns:
        Tuple ``(loss, n_correct, cnt_per_class)``. ``loss`` is the ``loss``
        attribute of a freshly built ``cca_loss`` object (a callable, not a
        computed value), ``n_correct`` is the number of predictions equal to
        ``label`` and ``cnt_per_class`` is a list with, for every class index
        below ``class_num``, how many predictions fell into that class.
    """
    # Sized from class_num: the counters used to be fixed at two entries, which
    # raised IndexError in the comprehension below as soon as class_num exceeded 2.
    cnt_per_class = np.zeros(class_num)

    loss = cca_loss(outdim_size, use_all_singular_values, device).loss
    pred = pred.max(1)[1]  # indices of the row-wise maximum = predicted classes
    n_correct = pred.eq(label).sum().item()
    cnt_per_class = [cnt_per_class[j] + pred.eq(j).sum().item() for j in range(class_num)]
    return loss, n_correct, cnt_per_class

In [18]:
# One Adam optimizer per branch, each restricted to parameters that require
# gradients and wrapped in the project's ScheduledOptim (presumably a warm-up
# learning-rate schedule driven by d_model and warm_steps — confirm in optim_new).
# `optimizer` updates the text branch (model1).
optimizer = ScheduledOptim(
            Adam(filter(lambda x: x.requires_grad, model1.parameters()),
                 betas=(0.9, 0.98), eps=1e-4), d_model, warm_steps)

# `optimizer2` updates the EEG branch (model2).
optimizer2 = ScheduledOptim(
            Adam(filter(lambda x: x.requires_grad, model2.parameters()),
                 betas=(0.9, 0.98), eps=1e-4), d_model, warm_steps)

In [20]:
# True reproduces the saved output of this cell: one forward pass on the first
# paired batch, print both projections, stop. Set to False to run the full
# optimisation pass over the EEG loader.
SINGLE_BATCH_DEBUG = True

all_labels = []
all_res = []
model.train()
total_loss = 0
total_correct = 0
total_num = len(train_text)
total_num2 = len(train_eeg)

dataloader_iterator = iter(train_loader_text)

for i, data1 in enumerate(train_loader_eeg):

  # Draw the text batch that goes with this EEG batch; restart the text loader
  # if it is exhausted before the EEG loader.
  try:
      data2 = next(dataloader_iterator)
  except StopIteration:
      dataloader_iterator = iter(train_loader_text)
      data2 = next(dataloader_iterator)

  sig1, label1 = map(lambda x: x.to(device), data2)  # text batch and its labels
  sig2, _ = map(lambda x: x.to(device), data1)       # EEG batch (its labels are not used)

  optimizer.zero_grad()
  pred1, pred2 = model(sig1, sig2)

  if SINGLE_BATCH_DEBUG:
    # Inspection only: sig1 / sig2 / pred1 / pred2 stay available to later cells.
    print(pred1)
    print(pred2)
    break

  loss = model.loss(pred1, pred2)
  print(loss)
  loss.backward()
  # NOTE(review): only `optimizer` (built over model1's parameters) is stepped;
  # `optimizer2` is never used, so model2 is not updated here — confirm this is intended.
  optimizer.step_and_update_lr()
  total_loss += loss.item()

  # The argmax of the text-branch output is taken as the predicted class.
  all_labels.extend(label1.cpu().numpy())
  all_res.extend(pred1.max(1)[1].cpu().numpy())

# Built once after the loop, and only when predictions were actually collected.
if all_labels:
  cm = confusion_matrix(all_labels, all_res)

# Summed batch losses divided by the combined size of both training sets
# (0 when SINGLE_BATCH_DEBUG is on, since no loss is accumulated).
train_loss = total_loss / (total_num + total_num2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tensor([[-3.2822,  5.1362],
        [-1.9806, -8.0111],
        [-3.4370, -3.8603],
        [ 4.9608, -7.7350],
        [-4.0438, -3.5782],
        [ 1.4583,  6.6048],
        [ 0.5142, -0.9722],
        [-2.3586,  6.1767]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-0.1828,  9.6427],
        [ 2.7336, 12.9284],
        [ 0.9778, 10.8378],
        [ 0.9158,  9.4692],
        [ 0.4634, 11.2640],
        [ 2.9747, 10.0802],
        [ 1.6537, 11.2689],
        [ 0.5201, 12.2148]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [None]:
# Manual, step-by-step computation of a CCA correlation on the tensors left in
# sig1 (text) and sig2 (EEG) by the previous cell.
#   H1, H2       : sig1, sig2 transposed (assuming the batches are
#                  (samples, features), rows are features and columns samples)
#   o1, o2       : number of rows of H1 and H2
#   m            : number of columns of H1
#   H1bar, H2bar : H1, H2 with each row's mean subtracted
r1 = 1e-3   # ridge added to the diagonal of SigmaHat11 (and, below, of T^T T)
r2 = 1e-3   # ridge added to the diagonal of SigmaHat22
eps = 1e-9  # eigenvalues not greater than this are dropped / clamped

H1, H2 = sig1.t(), sig2.t()

o1 = H1.size(0)
o2 = H2.size(0)
m = H1.size(1)

H1bar = H1 - H1.mean(dim=1).unsqueeze(dim=1)
H2bar = H2 - H2.mean(dim=1).unsqueeze(dim=1)

# Cross- and auto-covariance estimates; the auto-covariances are regularised.
SigmaHat12 = (1.0 / (m - 1)) * torch.matmul(H1bar, H2bar.t())
SigmaHat11 = (1.0 / (m - 1)) * torch.matmul(H1bar, H1bar.t()) + r1 * torch.eye(o1, device=device)
SigmaHat22 = (1.0 / (m - 1)) * torch.matmul(H2bar, H2bar.t()) + r2 * torch.eye(o2, device=device)

# torch.symeig is deprecated (and removed in later PyTorch releases);
# torch.linalg.eigh is its replacement. UPLO='U' keeps symeig's default of
# reading the upper triangle.
D1, V1 = torch.linalg.eigh(SigmaHat11, UPLO='U')
D2, V2 = torch.linalg.eigh(SigmaHat22, UPLO='U')

# Keep only the eigenpairs whose eigenvalue exceeds eps.
posInd1 = torch.gt(D1, eps).nonzero()[:, 0]
D1 = D1[posInd1]
V1 = V1[:, posInd1]
posInd2 = torch.gt(D2, eps).nonzero()[:, 0]
D2 = D2[posInd2]
V2 = V2[:, posInd2]

# Inverse square roots of the two auto-covariances, built from the eigenpairs.
SigmaHat11RootInv = torch.matmul(
    torch.matmul(V1, torch.diag(D1 ** -0.5)), V1.t())
SigmaHat22RootInv = torch.matmul(
    torch.matmul(V2, torch.diag(D2 ** -0.5)), V2.t())

Tval = torch.matmul(torch.matmul(SigmaHat11RootInv,
                                  SigmaHat12), SigmaHat22RootInv)

trace_TT = torch.matmul(Tval.t(), Tval)
trace_TT = torch.add(trace_TT, (torch.eye(trace_TT.shape[0]) * r1).to(device))  # regularization for more stability
U, V = torch.linalg.eigh(trace_TT, UPLO='U')
# Clamp eigenvalues that are not above eps up to eps before taking square roots.
U = torch.where(U > eps, U, torch.full_like(U, eps))
# Sum of the square roots of the outdim_size largest eigenvalues.
U = U.topk(outdim_size)[0]
corr = torch.sum(torch.sqrt(U))

In [None]:
# Display the correlation value computed in the previous cell.
corr

tensor(1.9993)