### Imports

In [None]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import pipeline

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
import math
import torch
import os

In [None]:
# Mount Google Drive at /content/drive; the model and results paths used in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')

### Generation

In [None]:
"""
Load the fine-tuned ProtGPT2 model and tokenizer. Generate with a higher repetition penalty of 1.5 to discourage the model from generating repetitive sequences.
A broader range of sequences increases the likelihood of finding stable anti-CRISPRs.
"""

In [None]:
# Drive folder that holds the fine-tuned ProtGPT2 checkpoint
model_path = "/content/drive/MyDrive/Duke/Freshman Year/Sem 2/BME 590/Shrey Goel/Individual Project 2A/finetuned_protgpt2_model"

# tokenizer and causal-LM weights are both read from that same folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

In [None]:
# generate sequences
# NOTE(review): the pipeline is given `model_path`, so it loads its own copy of the
# checkpoint in addition to the `model`/`tokenizer` objects created with from_pretrained.
protgpt2 = pipeline('text-generation', model=model_path)

# Sampling settings for this run, gathered in one place:
#   - repetition_penalty=1.5 is the "higher repetition penalty" this experiment is about
#   - num_return_sequences=1000 asks for 1000 samples from the single prompt
#   - eos_token_id=0: presumably the id of <|endoftext|>; confirm against the tokenizer
generation_kwargs = dict(
    max_length=100,
    do_sample=True,
    top_k=950,
    repetition_penalty=1.5,
    num_return_sequences=1000,
    eos_token_id=0,
)
sequences10 = protgpt2("<|endoftext|>", **generation_kwargs)

In [None]:
# collect every value of each generation result dict into one flat list of sequences
gen_seqs10 = [text for result in sequences10 for text in result.values()]

In [None]:
"""
Perplexity is calculated to assess how confident the model is in each sequence it generated: the lower the perplexity, the less confused the model is by that output.
"""

In [None]:
# function to calculate perplexity of each generated sequence
def calculatePerplexity(sequence, model, tokenizer):
    """Return the perplexity the model assigns to a single sequence.

    The sequence is tokenized, passed to the model as both input and labels,
    and the loss the model reports is exponentiated.

    Args:
        sequence: Text to score.
        model: Model called as ``model(input_ids, labels=input_ids)`` whose
            first output is the loss.
        tokenizer: Object whose ``encode(sequence)`` returns the token ids.

    Returns:
        float: ``exp(loss)`` for the sequence.
    """
    # Batch of one: shape (1, number_of_tokens).
    input_ids = torch.tensor(tokenizer.encode(sequence), dtype=torch.long).unsqueeze(0)

    # Put the ids on whatever device the model's weights are on, instead of
    # relying on a notebook-level `device` variable that could disagree with it.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    # The loss is the first output; .item() turns the one-element tensor into a Python float.
    loss = outputs[0]
    return math.exp(loss.item())

# calculate pPL for sequences
# Move the model to the target device once, up front, rather than on every loop iteration.
model = model.to(device)
gen_ppls10 = [calculatePerplexity(sequence, model, tokenizer) for sequence in gen_seqs10]

In [None]:
# save sequences and associated pPL to df
import pandas as pd

# One row per generated sequence, paired with its perplexity, built in a single constructor call.
df10 = pd.DataFrame({'Sequence': gen_seqs10, 'Perplexity': gen_ppls10})

In [None]:
# Show the table of sequences and perplexities (pandas abbreviates the middle rows of a long frame).
df10

Unnamed: 0,Sequence,Perplexity
0,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAITVTPSGEVSH...,15.110159
1,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVTPSGEVSH...,1.011107
2,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAITVTPSGEVSH...,1.388807
3,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVTPSGEVSH...,1.011107
4,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVTPSGEVSH...,1.557050
...,...,...
995,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIFATPEGEVSH...,8.729079
996,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVTPSGEVSH...,1.011107
997,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVTPSGEVSH...,5.464747
998,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVTPSGEVSH...,1.011107


In [None]:
import pickle

# Drive folder that collects the generation results.
# NOTE(review): `dir` shadows the built-in of the same name; the name is kept so that any
# later cell reading it keeps working.
dir = "/content/drive/MyDrive/Duke/Freshman Year/Sem 2/BME 590/Shrey Goel/Individual Project 2A/finetuned_protgpt2_gen_results/"

# Create the folder on first use so that open() below cannot fail on a missing directory.
os.makedirs(dir, exist_ok=True)

# Serialize the DataFrame of sequences and perplexities to df10.pkl inside that folder.
with open(os.path.join(dir, 'df10.pkl'), 'wb') as f:
    pickle.dump(df10, f)