<a href="https://colab.research.google.com/github/qiuhuasheng1107/project_pytorch_exercise/blob/main/bench_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import random
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
from matplotlib.pyplot import figure
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split
from sklearn.preprocessing import MinMaxScaler
import math
from imblearn.over_sampling import RandomOverSampler 
from collections import Counter
torch.manual_seed(0)

<torch._C.Generator at 0x7f37331019b0>

In [3]:
df = pd.read_csv('deep_with_duration.csv')
print(df.shape)
print(df.columns.values)
df = df[df['cb_list'].notna()]
print('*******')
print("size of data:", len(df))

(124853, 23)
['cb_list' 'date_list' 'fst_hipday' 'age' 'sex' 'ALENDRONATE' 'CALCITONIN'
 'CALCITONIN_I' 'DENOSUMAB' 'DENOSUMAB_C' 'ETIDRONATE' 'IBANDRONIC'
 'PAMIDRONATE' 'RALOXIFENE' 'TERIPARATIDE' 'ZOLEDRONIC' 'ZOLEDRONIC_O'
 'cindate' 'before_cb' 'before_date' 'psd_date' 'dur_list' 'class']
*******
size of data: 124853


In [4]:
# label to index & y
label_to_ix = {"first_snd": 0, "first_die": 1, "good": 2, "snd_die":3}

y = []
for i in range(0, len(df)):
    lab = df.iloc[i, 22]
    y.append(label_to_ix[lab])

In [5]:
# cb to index
cb_column = df[['before_cb']]
cb_to_ix = {}

for i in range(0, len(df)):
    cbs = cb_column.iloc[i, 0].split(', ')
    for cb in cbs:
        if cb not in cb_to_ix:  # illness has not been assigned an index yet
            cb_to_ix[cb] = len(cb_to_ix)
print(cb_to_ix)
print(len(cb_to_ix))

{'Glaucomadate': 0, 'Cataractdate': 1, 'Chronicplumonarydiseasedate': 2, 'Congestiveheartfailuredate': 3, 'DMwithoutcomplicationsdate': 4, 'Dementiadate': 5, 'Mildliverdiseasedate': 6, 'Malignantneoplasmsdate': 7, 'Pepticulcerdiseasedate': 8, 'Metastaticsolidtumordate': 9, 'Cerebrovasculardiseasedate': 10, 'Renaldiseasedate': 11, 'DMwithcomplicationsdate': 12, 'Rheumatologicdiseasedate': 13, 'Peripheralvasculardiseasedate': 14, 'Keratitisdate': 15, 'Myocardialinfarctiondate': 16, 'Mosliverdiseasedate': 17, 'Hemiparaplegiadate': 18, 'Cushingdate': 19, 'Thyrotoxicosisdate': 20, 'AgerelatedMDdate': 21, 'Hyperparathyroidismdate': 22, 'Pagetdate': 23, 'Aidsdate': 24}
25


In [6]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return idxs

# padding the input and construct feature set
X = []
for i in range (0, len(cb_column)):
    seq = cb_column.iloc[i, 0].split(', ')
    res = prepare_sequence(seq, cb_to_ix)
    X.append(res)
print(X[0:5])

X_lengths = []
for i in range(0, len(cb_column)):
    seq = cb_column.iloc[i, 0].split(', ')
    seq_len = len(seq)
    X_lengths.append(seq_len)

longest_cbs = max(X_lengths)
print(longest_cbs)
rows = len(cb_column)
padded_X = np.zeros((rows, longest_cbs))

for i, x_len in enumerate(X_lengths):
  sequence = X[i]
  padded_X[i, longest_cbs-x_len:] = sequence[:x_len]
print(padded_X[0:5])

[[0, 1, 2, 3, 4, 5], [2, 4], [6], [7, 8, 9, 6], [1]]
16
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 2. 3. 4. 5.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 4.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 6.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 7. 8. 9. 6.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [7]:
# construct the rest feature set
diag = df[['age', 'sex']]
diag['sex'] = diag['sex'].map({'F': 1, 'M': 0})
drug = df.loc[:, 'ALENDRONATE':'ZOLEDRONIC_O']
features = pd.concat([diag, drug], axis=1)
features = features.values.astype(float)
print('feature:', features.shape)

feature: (124853, 14)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# construct training, validation, and testing dataset
x_train01, x_test01, y_train01, y_test01 = train_test_split(padded_X, y,test_size=0.1, random_state=42)
x_train02, x_test02, y_train02, y_test02 = train_test_split(features, y,test_size=0.1, random_state=42)
print(y_test01 == y_test02)

True


In [9]:
under_sampler = RandomOverSampler(random_state=0)
x_train01_sp, y_train01_sp = under_sampler.fit_resample(x_train01, y_train01)
x_train02_sp, y_train02_sp = under_sampler.fit_resample(x_train02, y_train02)
print(y_train01_sp == y_train02_sp)
Counter(y_train01_sp)

True


Counter({0: 60849, 1: 60849, 2: 60849, 3: 60849})

In [10]:
# input 1
inputs_1 = torch.from_numpy(x_train01_sp).type(torch.int)
targets_1 = torch.from_numpy(np.array(y_train01_sp)).type(torch.LongTensor)

# input 2
inputs_2 = torch.from_numpy(x_train02_sp).type(torch.float)
targets_2 = torch.from_numpy(np.array(y_train02_sp)).type(torch.LongTensor)

In [11]:
# construct training and validation dataset
train_val = TensorDataset(inputs_1, inputs_2, targets_1)
val_size = 6000
train_size = len(train_val) - val_size

In [12]:
train_ds, val_ds = random_split(train_val, [train_size, val_size])
batch_size = 128
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)

In [13]:
# testing dataset
inputs_t1 = torch.from_numpy(x_test01).type(torch.int)
targets_t1 = torch.from_numpy(np.array(y_test01)).type(torch.LongTensor)

inputs_t2 = torch.from_numpy(x_test02).type(torch.float)
targets_t2 = torch.from_numpy(np.array(y_test02)).type(torch.LongTensor)

test = TensorDataset(inputs_t1, inputs_t2, targets_t1)

In [25]:
from torch.nn.modules import linear
# model construction
class mymodel(nn.Module):
    def __init__(self, embedding_dim, cbs_size, output): 
      super(mymodel, self).__init__()
      self.cb_embeddings = nn.Embedding(cbs_size, embedding_dim)
      self.att_ly = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=2, batch_first=True)
      self.linear = nn.Linear(embedding_dim*16+14, output)

    def forward(self, seq, fts, embedding_dim):
      embeds = self.cb_embeddings(seq)
      out,_ = self.att_ly(embeds, embeds, embeds)
      out = torch.reshape(out, (out.shape[0], -1))
      out = torch.cat((out, fts), dim=1)
      out = self.linear(out)
      return torch.log_softmax(out, dim=1)

In [26]:
embedding_dim = 6
model = mymodel(embedding_dim, len(cb_to_ix), len(label_to_ix))

In [27]:
loss_fn = torch.nn.NLLLoss()
opt = torch.optim.Adam(model.parameters())

In [28]:
def accuracy(outputs, labels):
  _, preds = torch.max(outputs, dim=1)
  return torch.sum(preds == labels).item()/len(preds)

In [29]:
def loss_batch(model, loss_fn, x1, x2, yb, opt=None, metric=None):
    preds = model(x1, x2, embedding_dim)
    loss = loss_fn(preds, yb)
    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
    metric_result = None
    if metric is not None:
        metric_result = metric(preds, yb)
    return loss.item(), len(x1), metric_result

In [30]:
def evaluate(model, loss_fn, val_dl, metric=None):
    with torch.no_grad():
        results = [loss_batch(model, loss_fn, x1, x2, yb, metric=metric) for x1, x2, yb in val_dl]
        losses, nums, metrics = zip(*results)
        total = np.sum(nums)
        avg_loss = np.sum(np.multiply(losses, nums))/total
        avg_metric = None
        if metric is not None:
            avg_metric=np.sum(np.multiply(metrics, nums))/total
    return avg_loss, total, avg_metric

In [31]:
val_loss, total, acc = evaluate(model, loss_fn, val_dl, metric=accuracy)
print(val_loss, total, acc)

4.258766297658284 6000 0.23783333333333334


In [32]:
def fit(epochs, model, loss_fn, opt, train_dl, val_dl, metric=None):
    train_losses, val_losses, accs = [], [], []
    for epoch in range(epochs):
        model.train()
        for x1, x2, yb in train_dl:
            train_loss, _,_ = loss_batch(model, loss_fn, x1, x2, yb, opt)
        model.eval()
        result = evaluate(model, loss_fn, val_dl, metric)
        val_loss, total, val_metric = result

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        accs.append(val_metric)
        if (epoch + 1) % 10 == 0:
            print(epoch + 1, epochs, val_loss, val_metric)
    return train_losses, val_losses, accs

In [33]:
train_losses, val_losses, accy = fit(10, model, loss_fn, opt, train_dl, val_dl, accuracy)

10 10 1.3218831599553427 0.3565


In [34]:
def predict_x(x1, x2, model):
  yb = model(x1, x2, 6)
  _,preds = torch.max(yb, dim=1)
  return preds.numpy()

In [35]:
x1, x2, label = test[:]
print(label.numpy(), 'predicted', predict_x(x1, x2, model))
y_pred = predict_x(x1, x2, model)
print(classification_report(label, y_pred))

[1 2 1 ... 1 1 2] predicted [2 0 2 ... 1 1 1]
              precision    recall  f1-score   support

           0       0.08      0.18      0.11       637
           1       0.70      0.53      0.60      6809
           2       0.51      0.48      0.49      4398
           3       0.09      0.24      0.13       642

    accuracy                           0.48     12486
   macro avg       0.34      0.36      0.33     12486
weighted avg       0.57      0.48      0.51     12486

