## Loading Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the denoised EKG frame (the preview output shows ICD9_CODE and TEXT columns).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
ekg_denoised = pd.read_pickle('ekg_denoised.pkl')
# Shuffle all rows. A fixed random_state keeps the row order, and everything
# derived from it, identical across Restart Kernel -> Run All.
ekg_denoised = ekg_denoised.sample(frac=1, random_state=42)

In [3]:
# Preview the first rows of the shuffled frame; the index keeps the original row labels.
ekg_denoised.head()

Unnamed: 0,ICD9_CODE,TEXT
1602,4019,sinus rhythm. miniature r waves with r' patter...
1551,4019,sinus rhythm with one premaeture atrial beat. ...
3183,4280,sinus rhythm. the p-r interval is prolonged. t...
5425,42731,atrial fibrillation with rapid ventricular res...
62,4019,sinus tachycardia with diffuse non-diagnostic ...


---

## BERT

### Loading BERT

In [4]:
# Choose the model class, tokenizer class and checkpoint name in one place;
# the loading cell only refers to these three names.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'


In [5]:
# Instantiate the tokenizer and the model from the same pretrained checkpoint name.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
# The "weights ... were not used" message in this cell's output lists only `cls.*` entries.
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Preparing the dataset

#### Tokenizing

In [6]:
def _encode_text(text):
    """Return the tokenizer's token ids for one text, with special tokens added."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per row of the TEXT column (result stays a pandas Series).
tokenized = ekg_denoised['TEXT'].apply(_encode_text)


#### Padding

In [7]:
# Length of the longest token sequence (0 when there are no sequences at all).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every sequence with 0 up to max_len so all rows have equal length;
# the masking cell treats id 0 as padding.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])

In [8]:
# `padded` is already a NumPy array, so read its shape directly instead of
# copying the whole array through np.array() first.
padded.shape

(6536, 176)

#### Masking

In [9]:
# Positions holding a non-zero token id are real tokens; id 0 is the padding value.
is_real_token = padded != 0
# 1 for real tokens, 0 for padding, same shape as `padded`.
attention_mask = np.where(is_real_token, 1, 0)
attention_mask.shape

(6536, 176)

### Generating Features

In [10]:
# Convert the padded token-id array into a torch tensor for the model.
input_ids = torch.tensor(padded)  
# Rebinds attention_mask from the NumPy array built in the masking cell to a torch tensor.
attention_mask = torch.tensor(attention_mask)

In [None]:
# Run BERT over the dataset in mini-batches instead of one giant forward pass.
# Feeding all rows at once materialises every intermediate activation for the
# full (rows, max_len) input simultaneously; batching bounds peak memory to one
# batch of activations plus the collected outputs.
batch_size = 64  # rows per forward pass; lower this if memory is still tight

# Make sure dropout is disabled so the extracted features are deterministic
# (a no-op when the model is already in eval mode).
model.eval()

hidden_chunks = []  # per-batch token-level hidden states (output element 0)
pooled_chunks = []  # per-batch pooled outputs (output element 1)
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for start in range(0, input_ids.shape[0], batch_size):
        stop = start + batch_size
        batch_output = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        hidden_chunks.append(batch_output[0])
        pooled_chunks.append(batch_output[1])

# Rebuild a tuple that is index-compatible with a single model call:
# [0] -> hidden states for every row, [1] -> pooled output for every row.
last_hidden_states = (
    torch.cat(hidden_chunks, dim=0),
    torch.cat(pooled_chunks, dim=0),
)

In [11]:
# Element 0 of the model output is indexed as (row, token position, hidden dim).
sequence_output = last_hidden_states[0]
# Keep only token position 0 of every row (with special tokens added, this is
# the leading [CLS] token) as that row's feature vector.
cls_vectors = sequence_output[:, 0, :]
features = cls_vectors.numpy()

NameError: name 'last_hidden_states' is not defined