In [1]:
import os

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
# from transformers import DistilBertModel 
import pandas as pd
import pickle

In [2]:
import pickle
data_dir = 'C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/data'
with open(os.path.join(data_dir, 'query_data_train_top_700.pkl'), 'rb') as pkl:
    query_data = pickle.load(pkl)
with open(os.path.join(data_dir, 'labels_top_700.pkl'), 'rb') as pkl:
    labels = pickle.load(pkl)
# with open('/content/acc_ids_top_700.pkl', 'rb') as pkl:
#     acc_ids = pickle.load(pkl)

In [3]:
label2idx = dict(zip(list(set(labels)), [i for i in range(len(list(set(labels))))]))
idxs = [label2idx[i] for i in labels]
idxs[7000:7010], max(idxs)

([69, 69, 69, 69, 69, 69, 69, 69, 69, 69], 102)

In [4]:
len(idxs)

112401

In [5]:
freq = {}
for i in label2idx.keys():
    freq[i] = labels.count(i)
freq

{'bacterial ribosomal protein bL12 family': 735,
 'universal ribosomal protein uL30 family': 714,
 'bacterial ribosomal protein bL27 family': 754,
 'phosphohexose mutase family': 748,
 'ATPase B chain family': 867,
 'GHMP kinase famil': 1045,
 'universal ribosomal protein uS12 family': 913,
 'purine/pyrimidine phosphoribosyltransferase famil': 925,
 'bacterial ribosomal protein bL35 family': 716,
 'SecA family': 768,
 'radical SAM superfamil': 2248,
 'class-III pyridoxal-phosphate-dependent aminotransferase famil': 864,
 'AB hydrolase superfamil': 721,
 'chaperonin (HSP60) family': 973,
 'universal ribosomal protein uL1 family': 826,
 'IPP transferase family': 759,
 'class-I aminoacyl-tRNA synthetase family': 2498,
 'TRAFAC class OBG-HflX-like GTPase superfamil': 814,
 'globin family': 834,
 'enolase family': 835,
 'thioester dehydratase famil': 718,
 'G-protein coupled receptor 1 family': 972,
 'adenylosuccinate synthetase family': 767,
 'TRAFAC class translation factor GTPase superfa

In [6]:
with open(os.path.join(data_dir, 'split_data/query_data_train.pkl'), 'rb') as pkl:
    query_data_train = pickle.load(pkl)
with open(os.path.join(data_dir, 'split_data/labels_train.pkl'), 'rb') as pkl:
    labels_train = pickle.load(pkl)
with open(os.path.join(data_dir, 'split_data/query_data_test.pkl'), 'rb') as pkl:
    query_data_test = pickle.load(pkl)
with open(os.path.join(data_dir, 'split_data/labels_test.pkl'), 'rb') as pkl:
    labels_test = pickle.load(pkl)

In [7]:
df_test = pd.DataFrame(zip(query_data_test, labels_test), columns=['Query', 'Family'])
df= pd.DataFrame(zip(query_data_train, labels_train), columns=['Query', 'Family'])
df.head()

Unnamed: 0,Query,Family
0,MAVP DR RV SKTR AAKR RTHY SV KLAK PI KAK DGT W...,bacterial ribosomal protein bL32 family
1,MNI VILAA GTGK RM RSAL PKVL HPLA GRPLL SH VIDT...,transferase hexapeptide repeat family
2,MAHKKAGG SSRNGRDS QS KRLGVK KFGG EAV IAGN IIVR...,bacterial ribosomal protein bL27 family
3,MSA FI VLGA QWGDEGKGK MT DYL AEEAE VV VRFQG GN...,adenylosuccinate synthetase family
4,MV HW ADY IA DKII RERG EKEK YVVE SG ITPS GYVH ...,class-I aminoacyl-tRNA synthetase family


In [8]:
possible_labels = df.Family.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict
df['Label'] = df.Family.replace(label_dict)
df_test['Label'] = df_test.Family.replace(label_dict)
df.head()

Unnamed: 0,Query,Family,Label
0,MAVP DR RV SKTR AAKR RTHY SV KLAK PI KAK DGT W...,bacterial ribosomal protein bL32 family,0
1,MNI VILAA GTGK RM RSAL PKVL HPLA GRPLL SH VIDT...,transferase hexapeptide repeat family,1
2,MAHKKAGG SSRNGRDS QS KRLGVK KFGG EAV IAGN IIVR...,bacterial ribosomal protein bL27 family,2
3,MSA FI VLGA QWGDEGKGK MT DYL AEEAE VV VRFQG GN...,adenylosuccinate synthetase family,3
4,MV HW ADY IA DKII RERG EKEK YVVE SG ITPS GYVH ...,class-I aminoacyl-tRNA synthetase family,4


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.Label.values, 
                                                  test_size=0.25, 
                                                  random_state=42, 
                                                  stratify=df.Label.values)
df['data_type'] = ['not_set']*df.shape[0]
df_test['data_type'] = ['test']*df_test.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df = pd.concat([df, df_test])
df.groupby(['Family', 'Label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Query
Family,Label,data_type,Unnamed: 3_level_1
AB hydrolase superfamil,89,test,146
AB hydrolase superfamil,89,train,431
AB hydrolase superfamil,89,val,144
ABC transporter superfamil,48,test,546
ABC transporter superfamil,48,train,1643
...,...,...,...
universal ribosomal protein uS7 family,79,train,615
universal ribosomal protein uS7 family,79,val,205
universal ribosomal protein uS8 family,77,test,221
universal ribosomal protein uS8 family,77,train,533


In [18]:
total = len(list(df['data_type']))
test_count = list(df['data_type']).count('test')
train_count = list(df['data_type']).count('train')
val_count = list(df['data_type']).count('val')
val_count/total, test_count/total, train_count/total

(0.19999822065639986, 0.20000711737440058, 0.5999946619691996)

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    truncation=True,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    truncation=True,
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    df[df.data_type=='test'].Query.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    truncation=True,
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].Label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].Label.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df[df.data_type=='test'].Label.values)


dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

TypeError: batch_text_or_text_pairs has to be a list (got <class 'numpy.ndarray'>)

In [None]:
tensor_dir = 'C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/data/Tensor Data'
torch.save(dataset_train, os.path.join(tensor_dir, 'dataset_train.pt'))
torch.save(dataset_test, os.path.join(tensor_dir, 'dataset_test.pt'))
torch.save(dataset_val, os.path.join(tensor_dir, 'dataset_val.pt'))

In [6]:
tensor_dir = 'C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/data/Tensor Data'
dataset_train = torch.load(os.path.join(tensor_dir, 'dataset_train.pt'))
dataset_test = torch.load(os.path.join(tensor_dir, 'dataset_test.pt'))
dataset_val = torch.load(os.path.join(tensor_dir, 'dataset_val.pt'))

In [7]:
with open(os.path.join(tensor_dir, 'label_dict.pkl'), 'rb') as pkl:
    label_dict = pickle.load(pkl)

In [8]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 1

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

In [10]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 4

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)


In [11]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [12]:
def get_default_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

device = get_default_device()
device

device(type='cuda')

In [13]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [21]:
a = np.array([1,2,3])
b = np.array([1,5,3])
c = a==b
cw

array([ True, False,  True])

In [22]:
import random
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return f1_score(labels_flat, preds_flat, average='weighted')

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

for epoch in tqdm(range(5, 10)):
    correct = 0
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       
        outputs = model(**inputs)
        if torch.argmax(outputs[1][0]) == batch[2]:
            correct += 1
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/models/new_finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)  
    train_acc = correct/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Training accuracy: {train_acc}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    pred_vals = [np.argmax(pred) for pred in predictions]
    val_acc = accuracy_score(pred_vals, true_vals)
    
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation accuracy: {val_acc}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=67440.0, style=ProgressStyle(description_wi…




KeyboardInterrupt: 

In [24]:
len(outputs[1][0])

103

In [23]:
import random
import numpy as np
from sklearn.metrics import accuracy_score
import random
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
model.load_state_dict(torch.load('C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/models/finetuned_BERT_epoch_4.model'))

<All keys matched successfully>

In [24]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [33]:
overall_pred_vals = []
overall_true_vals = []

In [30]:
from sklearn.metrics import normalized_mutual_info_score
import pickle

test_loss, predictions, true_vals = evaluate(dataloader_test)
# test_f1 = f1_score_func(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]
test_clf_report = classification_report(pred_vals, true_vals)
nmi = normalized_mutual_info_score(pred_vals, true_vals,average_method='arithmetic')
test_acc = accuracy_score(pred_vals, true_vals)
# test_clf_report['accuracy'] = test_acc
# test_clf_report['nmi'] = nmi

TypeError: 'str' object does not support item assignment

In [31]:
with open('C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/models/test_classificarion_report.pkl', 'wb') as pkl:
    pickle.dump(test_clf_report, pkl)
    
print(f'Test loss: {test_loss}')
# print(f'F1 Score (Weighted): {test_f1}')
print(f'Test accuracy : {test_acc}')
print(f'Normalized Mutual Information Score : {nmi}')

Test loss: 0.17466832409920596
Test accuracy : 0.9770472843734709
Normalized Mutual Information Score : 0.9699116720978076


In [35]:
overall_pred_vals = overall_pred_vals + list(pred_vals)
overall_true_vals = overall_true_vals + list(true_vals)

NameError: name 'pred_values' is not defined

In [None]:
val_loss, predictions, true_vals = evaluate(dataloader_validation)
# val_f1 = f1_score_func(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]
val_clf_report = classification_report(pred_vals, true_vals)
nmi = normalized_mutual_info_score(pred_vals, true_vals,average_method='arithmetic')
val_acc = accuracy_score(pred_vals, true_vals)
val_clf_report['accuracy'] = val_acc
val_clf_report['nmi'] = nmi

with open('C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/models/val_classification_report.pkl', 'wb') as pkl:
    pickle.dump(val_clf_report, pkl)

print(f'Validation loss: {val_loss}')
# print(f'F1 Score (Weighted): {val_f1}')
print(f'Validation accuracy : {val_acc}')
print(f'Normalized Mutual Information Score : {nmi}')

In [None]:
overall_pred_values = overall_pred_values + list(pred_values)
overall_true_vals = overall_true_vals + list(true_vals)

In [None]:
train_loss, predictions, true_vals = evaluate(dataloader_train)
# train_f1 = f1_score_func(predictions, true_vals)
pred_vals = [np.argmax(pred) for pred in predictions]
train_clf_report = classification_report(pred_vals, true_vals)
nmi = normalized_mutual_info_score(pred_vals, true_vals,average_method='arithmetic')
train_acc = accuracy_score(pred_vals, true_vals)
train_clf_report['accuracy'] = train_acc
train_clf_report['nmi'] = nmi

with open('C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/models/train_classification_report.pkl', 'wb') as pkl:
    pickle.dump(train_clf_report, pkl)
print(f'Train loss: {train_loss}')
# print(f'F1 Score (Weighted): {train_f1}')
print(f'Train accuracy : {train_acc}')
print(f'Normalized Mutual Information Score : {nmi}')

In [None]:
overall_pred_values = overall_pred_values + list(pred_values)
overall_true_vals = overall_true_vals + list(true_vals)

In [None]:
overall_clf_report = classification_report(overall_pred_vals, overall_true_vals)
acc = accuracy_score(overall_pred_values, overall_true_vals)
nmi = normalized_mutual_info_score(overall_pred_values, overall_true_vals, average_method='arithmetic')
overall_clf_report['accuracy'] = acc
overall_clf_report['nmi'] = nmi

with open('C:/Users/chahabiscuit/Miscellaneous_Notebooks/Protein Family Prediction/models/overall_classification_report.pkl', 'wb') as pkl:
    pickle.dump(overall_clf_report, pkl)
    
print(f'Overall Accuracy : {acc}')
print(f'Overall Normalized Mutual Information Score : {nmi}')

In [None]:
pred_vals = pred_vals + [np.argmax(pred) for pred in predictions]

overall_nmi = normalized_mutual_info_score(pred_vals, true_vals,average_method='arithmetic')
overall_acc = accuracy_score(pred_vals, true_vals)


In [None]:
torch.cuda.empty_cache()

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)