# Protein Secondary Structure Prediction

## Environment Setup

### Install required packages

In [21]:
!pip install datasets



In [22]:
# Mount Google Drive: the fine-tuned model and the training outputs below are
# read from and written to paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dataset Preparation

In [23]:
!wget -O /content/dataset.rar https://github.com/NLP-Final-Projects/SSPP/blob/main/Phase%202%20-%20Preprocessing%20Data/dataset.rar?raw=true

--2024-08-16 15:11:24--  https://github.com/NLP-Final-Projects/SSPP/blob/main/Phase%202%20-%20Preprocessing%20Data/dataset.rar?raw=true
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/NLP-Final-Projects/SSPP/raw/main/Phase%202%20-%20Preprocessing%20Data/dataset.rar [following]
--2024-08-16 15:11:24--  https://github.com/NLP-Final-Projects/SSPP/raw/main/Phase%202%20-%20Preprocessing%20Data/dataset.rar
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/NLP-Final-Projects/SSPP/main/Phase%202%20-%20Preprocessing%20Data/dataset.rar [following]
--2024-08-16 15:11:24--  https://raw.githubusercontent.com/NLP-Final-Projects/SSPP/main/Phase%202%20-%20Preprocessing%20Data/dataset.rar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.10

In [24]:
!apt-get install unrar

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unrar is already the newest version (1:6.1.5-1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [25]:
!unrar x /content/dataset.rar /content/


UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from /content/dataset.rar


Would you like to replace the existing file /content/content/dataset.csv
1166179089 bytes, modified on 2024-07-30 06:07
with a new one
1166179089 bytes, modified on 2024-07-30 06:07

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit n

No files to extract


### Load dataset

In [26]:
# Location of the extracted CSV (the archive unpacks into /content/content/).
dataset_path = "/content/content/dataset.csv"

In [27]:
from datasets import load_dataset, DatasetDict

# Load the CSV, keep only the two columns used below, and take the first 120 rows
# as a small working subset.
raw_subset = (
    load_dataset("csv", data_files=dataset_path)
    .select_columns(['Functionality', 'Secondary Structures Q8'])['train']
    .select(range(120))
)

# Hold out 20% of the subset; the fixed seed makes the split identical on every run.
ds = raw_subset.train_test_split(test_size=0.2, seed=42)

In [28]:
# Show the split sizes (80/20 of the 120-row subset).
ds

DatasetDict({
    train: Dataset({
        features: ['Functionality', 'Secondary Structures Q8'],
        num_rows: 96
    })
    test: Dataset({
        features: ['Functionality', 'Secondary Structures Q8'],
        num_rows: 24
    })
})

In [29]:
# Split the 20% hold-out in half (validation / test); the fixed seed keeps it reproducible.
ds_testval = ds['test'].train_test_split(test_size=0.5, seed=42)

In [30]:
# Assemble the final three-way split. `ds_testval` comes from splitting the
# hold-out set, so its 'train' half is used as validation and its 'test' half as test.
dataset = DatasetDict({
    'train': ds['train'],
    'test': ds_testval['test'],
    'validation': ds_testval['train']
})

### Preprocess the dataset

In [31]:
import ast

def get_sec_struct(example):
    """Extract the Q8 labels from one dataset row.

    The 'Secondary Structures Q8' field is expected to hold the string form of a
    Python literal sequence; element ``[1]`` of every entry is collected
    (presumably entries are (position, label) pairs - confirm against the CSV).

    Args:
        example: Mapping for a single row.

    Returns:
        ``{'Q8_labels': [...]}``. The list is empty when the field is missing,
        is ``None``, or contains the substring ``'None'`` anywhere.
    """
    raw_annotation = example.get('Secondary Structures Q8')
    has_annotation = raw_annotation is not None and 'None' not in raw_annotation
    if not has_annotation:
        return {'Q8_labels': []}

    q8_labels = []
    for entry in ast.literal_eval(raw_annotation):
        q8_labels.append(entry[1])
    return {'Q8_labels': q8_labels}

In [32]:
# Add a 'Q8_labels' column (list of per-position labels) to every split.
extended_dataset = dataset.map(get_sec_struct)

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [33]:
# Confirm that each split now carries the extra 'Q8_labels' column.
extended_dataset

DatasetDict({
    train: Dataset({
        features: ['Functionality', 'Secondary Structures Q8', 'Q8_labels'],
        num_rows: 96
    })
    test: Dataset({
        features: ['Functionality', 'Secondary Structures Q8', 'Q8_labels'],
        num_rows: 12
    })
    validation: Dataset({
        features: ['Functionality', 'Secondary Structures Q8', 'Q8_labels'],
        num_rows: 12
    })
})

## Training

### Loading model

In [34]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Candidate base checkpoints; only index 0 is used below (for the tokenizer).
checkpoints = ['google-t5/t5-small', 'google/flan-t5-small', 'microsoft/kosmos-2.5', 'google-t5/t5-base']

tokenizer = AutoTokenizer.from_pretrained(checkpoints[0])
# To start from the base checkpoint instead of a previously saved fine-tune, use:
# model = AutoModelForSeq2SeqLM.from_pretrained(checkpoints[0])
# NOTE(review): this resumes from a model saved on Drive, so a fresh runtime needs
# this directory to exist. It points at ".../NLP2/t5tunned" while the save cell
# further down writes to ".../NLP/t5tunned" - confirm which directory is intended.
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/University/NLP2/t5tunned")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

### Prepare data

In [35]:
import torch
def preprocess_function(examples):
    """Tokenize one row's input text and encode its Q8 labels as target ids.

    Args:
        examples: A single dataset row (the function is mapped without
            ``batched=True``) providing a ``'Functionality'`` text and a
            ``'Q8_labels'`` list of label tokens.

    Returns:
        The tokenizer output for ``'Functionality'`` with an added ``'labels'``
        list of token ids. The list is at most ``tokenizer.model_max_length``
        long and always ends with the EOS id.
    """
    inputs = tokenizer(examples['Functionality'], truncation=True)

    # Each label symbol is looked up directly as a vocabulary token rather than
    # being run through the tokenizer's text segmentation.
    label_ids = tokenizer.convert_tokens_to_ids(examples['Q8_labels'])

    # Reserve one slot for EOS. Previously an over-long target was cut with
    # labels[:model_max_length], which sliced the EOS off, so the longest
    # sequences were trained without a terminating token.
    max_label_tokens = tokenizer.model_max_length - 1
    inputs['labels'] = label_ids[:max_label_tokens] + [tokenizer.eos_token_id]
    return inputs

In [36]:
# Tokenize every split, then drop the raw text/label columns so only the
# tokenizer outputs and 'labels' remain.
tokenized_dataset = extended_dataset.map(preprocess_function).remove_columns(['Functionality', 'Secondary Structures Q8', 'Q8_labels'])

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [37]:
# Inspect the resulting schema (input_ids, attention_mask, labels).
tokenized_dataset['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [38]:
from transformers import DataCollatorForSeq2Seq

# Collator that turns the variable-length tokenized rows into batches for the
# seq2seq model (presumably by padding inputs and labels per batch - confirm
# against the DataCollatorForSeq2Seq documentation).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Train

In [98]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run; outputs are written under Google Drive.
# NOTE(review): with 96 training rows and a per-device batch of 8 there are only
# 12 batches per epoch on a single device, fewer than gradient_accumulation_steps=32 -
# confirm the resulting number of optimizer steps per epoch is what is intended.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/University/NLP/results/",
    evaluation_strategy="epoch",  # evaluate on the validation split once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=500,
    # Left disabled. NOTE(review): presumably this option belongs to
    # Seq2SeqTrainingArguments rather than TrainingArguments - confirm before re-enabling.
    # predict_with_generate=True,
    logging_steps=50,  # training loss is reported every 50 steps
    gradient_accumulation_steps=32
)



In [99]:
from transformers import Trainer

# Wire the model, hyperparameters, data splits and collator together.
# The validation split is used for the per-epoch evaluation; the test split is not passed here.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [100]:
# Run fine-tuning; the cell output is the per-epoch validation-loss table and the final TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.420039
2,No log,0.434423
3,No log,0.422537
4,No log,0.419251
5,No log,0.421616
6,No log,0.428215
7,No log,0.423749
8,No log,0.41559
9,No log,0.412754
10,No log,0.414766


Epoch,Training Loss,Validation Loss
1,No log,0.420039
2,No log,0.434423
3,No log,0.422537
4,No log,0.419251
5,No log,0.421616
6,No log,0.428215
7,No log,0.423749
8,No log,0.41559
9,No log,0.412754
10,No log,0.414766


TrainOutput(global_step=500, training_loss=0.0777729468345642, metrics={'train_runtime': 1718.0473, 'train_samples_per_second': 27.939, 'train_steps_per_second': 0.291, 'total_flos': 877395522355200.0, 'train_loss': 0.0777729468345642, 'epoch': 400.0})

## Samples

In [101]:
# Pick one training example and show its ground-truth Q8 string, to compare
# against the model output generated in the next cells.
sample_index=6
''.join(extended_dataset['train']['Q8_labels'][sample_index])

'--HHHHHHHHT--EEEEEE-TTS-EEEETTEEEESSS-HHHHHHHHHHHHTS--SSB--HHHHHHHHHHHHHHHHHHHHT-TTTHHHHHHS-HHHHHHHHHHHHHHHHHHHHT-HHHHHHHHTT-HHHHHHHHHTSHHHHHSHHHHHHHHHHHHHSSSHHHH----HHHHHHHHT--EEEEEE-TT--EEEETTEEEESS--HHHHHHHHHHHHSS--SSB--HHHHHHHHHHHHHHHHHHHHH-TTTHHHHHT--HHHHHHHHHHHHHHHHHHHHT-HHHHHHHHTT-HHHHHHHHHTSHHHHHSHHHHHHHHHHHHHSSGGGT-'

In [102]:
# Generate a prediction for the chosen sample: its input ids are wrapped into a
# batch of one on the model's device, and generation is capped at max_length=512.
generated_out = model.generate(input_ids=torch.tensor(tokenized_dataset['train']['input_ids'][sample_index], device=model.device).unsqueeze(0), max_length=512)
generated_out[0]

tensor([  0,  18,  18, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,  18,
         18, 427, 427, 427, 427, 427, 427,  18, 382, 382, 134,  18, 427, 427,
        427, 427, 382, 382, 427, 427, 427, 427, 134, 134, 134,  18, 566, 566,
        566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 382, 134,  18,
         18, 382, 382, 279,  18,  18, 566, 566, 566, 566, 566, 566, 566, 566,
        566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
        566, 566, 566, 566,  18, 382, 382, 382, 566, 566, 566, 566, 566, 566,
        566, 134,  18, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
        566, 566, 566, 382,  18, 566, 566, 566, 566, 566, 566, 566, 566, 566,
        382, 382,  18, 566, 566, 566, 566, 566, 566, 566, 566, 134, 134, 566,
        566, 566, 566, 134, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566,
        566, 566, 566, 566, 134, 134, 134, 517, 517, 517,  18,   1],
       device='cuda:0')

In [103]:
# Decode the generated ids back into a Q8 string, dropping special tokens such as <pad> and </s>.
tokenizer.decode(generated_out[0], skip_special_tokens=True)

'--HHHHHHHHHH--EEEEEE-TTS-EEEETTEEEESSS-HHHHHHHHHHHHHTS--TTB--HHHHHHHHHHHHHHHHHHHHHHHHHH-TTTHHHHHHHS-HHHHHHHHHHHHHHT-HHHHHHHHHTT-HHHHHHHHSSHHHHSHHHHHHHHHHHHHHSSSGGG-'

In [104]:
# Token-by-token view of the same generation, special tokens included.
tokenizer.convert_ids_to_tokens(generated_out[0])

['<pad>',
 '-',
 '-',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 '-',
 '-',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 '-',
 'T',
 'T',
 'S',
 '-',
 'E',
 'E',
 'E',
 'E',
 'T',
 'T',
 'E',
 'E',
 'E',
 'E',
 'S',
 'S',
 'S',
 '-',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'T',
 'S',
 '-',
 '-',
 'T',
 'T',
 'B',
 '-',
 '-',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 '-',
 'T',
 'T',
 'T',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'S',
 '-',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'T',
 '-',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'T',
 'T',
 '-',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'S',
 'S',
 'H',
 'H',
 'H',
 'H',
 'S',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'H',
 'S',
 'S',
 'S',
 'G',
 'G',
 'G',
 '-',
 '</s>

In [105]:
# Persist the fine-tuned weights to Drive.
# NOTE(review): this writes to ".../NLP/t5tunned", whereas the loading cell reads
# from ".../NLP2/t5tunned" - confirm the two paths are meant to differ.
model.save_pretrained("/content/drive/MyDrive/University/NLP/t5tunned")

### Edit Distance Calculation

In [112]:
def match(a,b):
    """Return the substitution cost for two symbols: 0 if they are equal, otherwise 1."""
    return 0 if a == b else 1

def edit_distant(s, t, verbose=True):
    """Build the edit-distance (Levenshtein) dynamic-programming table for ``s`` and ``t``.

    Insertions, deletions and substitutions each cost 1.

    Args:
        s: First sequence (a string or list of comparable symbols).
        t: Second sequence.
        verbose: When true (the default, matching the previous behavior) the final
            distance is printed. Pass ``False`` when calling in a loop to avoid
            flooding the cell output.

    Returns:
        A ``(len(s) + 1) x (len(t) + 1)`` list of lists ``M`` where ``M[i][j]`` is the
        distance between ``s[:i]`` and ``t[:j]``; the overall distance is ``M[-1][-1]``.
    """
    rows, cols = len(s) + 1, len(t) + 1
    M = [[0] * cols for _ in range(rows)]

    # Turning a prefix into the empty sequence costs one deletion/insertion per symbol.
    for i in range(1, rows):
        M[i][0] = i
    for j in range(1, cols):
        M[0][j] = j

    for i in range(1, rows):
        for j in range(1, cols):
            cost = 0 if s[i-1] == t[j-1] else 1
            M[i][j] = min(
                M[i-1][j] + 1,         # delete s[i-1]
                M[i][j-1] + 1,         # insert t[j-1]
                M[i-1][j-1] + cost,    # keep or substitute
            )

    if verbose:
        print(M[-1][-1])
    return M

def show(s, t, M):
    """Print an alignment of ``s`` and ``t`` recovered from an edit-distance table.

    Walks back from ``M[len(s)][len(t)]`` to ``M[0][0]`` and prints two lines of equal
    length: ``s`` and ``t`` with ``'*'`` marking a gap (a symbol present in the other
    sequence only).

    Args:
        s: First sequence of single-character strings (or a string).
        t: Second sequence.
        M: Table as returned by ``edit_distant(s, t)``.
    """
    s_prim = ''
    t_prim = ''
    i, j = len(s), len(t)

    # Continue until BOTH sequences are consumed. The earlier condition
    # (`i*j != 0`) stopped as soon as either index hit zero, which silently
    # dropped the unaligned prefix of the other sequence from the output.
    while i > 0 or j > 0:
        if i > 0 and j > 0 and M[i][j] == M[i-1][j-1] + (0 if s[i-1] == t[j-1] else 1):
            # Diagonal move: match or substitution.
            s_prim = s[i-1] + s_prim
            t_prim = t[j-1] + t_prim
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or M[i][j] == M[i-1][j] + 1):
            # Symbol only in s: gap in t.
            s_prim = s[i-1] + s_prim
            t_prim = '*' + t_prim
            i -= 1
        else:
            # Symbol only in t: gap in s.
            t_prim = t[j-1] + t_prim
            s_prim = '*' + s_prim
            j -= 1
    print(s_prim)
    print(t_prim)

def process_text(text):
    """Collapse runs of identical consecutive symbols into a single symbol.

    For example ``'--HHHH--EE'`` becomes ``['-', 'H', '-', 'E']``.

    Args:
        text: A string or other iterable of symbols. May be empty.

    Returns:
        A list with one entry per run; an empty list for empty input (the earlier
        version raised ``IndexError`` on ``text[0]`` in that case).
    """
    collapsed = []
    for symbol in text:
        if not collapsed or collapsed[-1] != symbol:
            collapsed.append(symbol)
    return collapsed



In [114]:
# NOTE(review): this scores the model on the *training* split it was fine-tuned on;
# use the 'test' split here for a held-out estimate.
train_inputs = tokenized_dataset['train']
train_labels = extended_dataset['train']

edds = []  # raw edit distance per example
nors = []  # normalized similarity per example, 1 - distance / (len(label) + len(output))
for i in range(len(train_labels)):
    # Index the row first: `dataset['column'][i]` re-reads the whole column on every iteration.
    input_ids = torch.tensor(train_inputs[i]['input_ids'], device=model.device).unsqueeze(0)
    generated_out = model.generate(input_ids=input_ids, max_length=512)

    # Compare run-length-collapsed sequences (segment order), not per-position labels.
    lab = process_text(''.join(train_labels[i]['Q8_labels']))
    out = process_text(tokenizer.decode(generated_out[0], skip_special_tokens=True))

    edd = edit_distant(lab, out)[-1][-1]
    total_len = len(lab) + len(out)
    # Two empty sequences are identical; avoid dividing by zero.
    nor = 1 - (edd / total_len) if total_len else 1.0
    edds.append(edd)
    nors.append(nor)

85
1
2
49
82
6
44
8
9
0
3
1
4
1
1
180
0
0
12
125
38
0
68
90
0
110
2
1
0
3
1
2
224
2
0
2
12
0
178
0
44
1
1
1
215
2
2
64
1
5
49
1
87
3
0
3
2
4
85
1
111
5
1
87
1
1
44
1
90
0
10
0
0
11
5
112
1
0
4
3
82
4
0
13
3
0
0
87
0
5
1
0
1
60
0


In [115]:
# Mean edit distance over the evaluated examples (computed on run-length-collapsed sequences).
print(sum(edds)/len(edds))
# Mean normalized similarity, 1 - distance / (len(label) + len(output)).
print(sum(nors)/len(nors))

27.729166666666668
0.7928615604197525
