### Imports

In [None]:
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Base folder that the input CSV is read from and the per-class CSVs are written to.
# NOTE(review): this is a hard-coded absolute path, so the notebook only runs where
# this exact folder exists; the name also shadows Python's built-in dir() for the
# rest of the session.
dir = "/content/drive/MyDrive/Duke/Freshman Year/Sem 2/BME 590/Shrey Goel/Individual Project 2A"

In [None]:
# Read the cleaned dataset and discard the three bookkeeping columns.
df = (
    pd.read_csv(dir + "/anti_crispr_cleaned.csv")
    .drop(["Unnamed: 0", "index", "ID"], axis="columns")
)

In [None]:
# Display the loaded frame (columns: Sequence, Anti-CRISPR?, length)
df

Unnamed: 0,Sequence,Anti-CRISPR?,length
0,MNSYLLLLMVSLLTCIGQLCQKQAAQCWEQPQARRLNLTLRWLAIA...,CRISPR,114
1,MPDSSTALRILVYSDNVQTRERVMRALGKRLHPDLPDLTYVEVATG...,CRISPR,133
2,MPSFDIVSEVDLQEARNAVDNASREVESRFDFRNVEASFELNDASK...,CRISPR,163
3,MIIVYISLAVLAVSIIFLGVTVIQNKKKIDPALKELSSVTQAMQKQ...,CRISPR,101
4,MSVIQDDYVKQAEVIRGLPKKKNGFELTTTQLRVLLSLTAQLFDEA...,CRISPR,124
...,...,...,...
3989,MAHNHWCNLFSVALVCVVALVMVQYSVAQNSPQDYVDAHNAARSAV...,CRISPR,167
3990,MVAVYPGSFDPITLGHVDIIKRALSIFDELVVLITENPRKRCLFSL...,CRISPR,161
3991,MSMVKEFKEFALKGNVMDLAVGVIIGGAFSTIVNSIVKDLIMPVVG...,CRISPR,148
3992,MLDQQTINIIKATVPVLKEHGVTITTTFYKNLFAKHPEVRPLFDMG...,CRISPR,146


In [None]:
# Function to get only certain type of sequence from data
def model_prep(df, filter_value, new_df_name, label_column='Anti-CRISPR?'):
    """Return the rows of ``df`` whose label equals ``filter_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``label_column``.
    filter_value : object
        Label value to keep (compared with ``==``).
    new_df_name : str
        Stored on the returned frame as its ``name`` attribute.
    label_column : str, optional
        Column to filter on. Defaults to ``'Anti-CRISPR?'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of ``df``.
    """
    if label_column not in df.columns:
        raise KeyError(f"column {label_column!r} not found in DataFrame")

    filtered_df = df.loc[df[label_column] == filter_value].reset_index(drop=True)
    # Plain instance attribute: pandas does not carry it over to frames
    # derived from this one (slices, copies, merges).
    filtered_df.name = new_df_name

    return filtered_df

In [None]:
# Build one frame per class label: CRISPR sequences and anti-CRISPR sequences
crisp_df = model_prep(df=df, filter_value="CRISPR", new_df_name="crisp_df")
anti_df = model_prep(df=df, filter_value="Anti", new_df_name="anti_df")

In [None]:
# Save each per-class frame as a CSV under the finetuning_protgpt2_data subfolder of `dir`.
# NOTE(review): to_csv writes the row index as an extra unnamed column by default;
# pass index=False if downstream readers do not need it — confirm before changing.
# NOTE(review): to_csv does not create missing folders, so the subfolder presumably
# has to exist already — verify.
crisp_df.to_csv(dir + "/finetuning_protgpt2_data/crisp_df.csv")
anti_df.to_csv(dir + "/finetuning_protgpt2_data/anti_df.csv")

### FASTA formatting of sequences per ProtGPT2's Hugging Face model card

In [None]:
# format sequence inputs based on ProtGPT fine-tuning requirements
def modify_sequences(sequence, line_width=60, header="<|endoftext|>"):
  """Format one sequence as a header line followed by fixed-width lines.

  The result is ``header``, a newline, then ``sequence`` broken into lines of
  at most ``line_width`` characters (no trailing newline).

  The function is idempotent: input that already starts with the header line
  is returned unchanged, so re-running the formatting cell on an already
  formatted column does not add a second header or re-wrap the text.

  Args:
    sequence: Raw sequence string.
    line_width: Maximum number of characters per line. Defaults to 60.
    header: Token written on its own line before the sequence.
      Defaults to ``"<|endoftext|>"``.

  Returns:
    The formatted string.

  Raises:
    ValueError: If ``line_width`` is smaller than 1.
  """
  if line_width < 1:
    raise ValueError("line_width must be a positive integer")

  # Already formatted (e.g. the cell was executed twice): leave as is.
  if sequence.startswith(header + "\n"):
    return sequence

  wrapped = '\n'.join(
      sequence[start:start + line_width]
      for start in range(0, len(sequence), line_width)
  )
  return header + "\n" + wrapped

# Apply the formatting to every sequence in the full frame. This overwrites
# df['Sequence'] in place; crisp_df and anti_df were built earlier as separate
# frames and are not changed by this assignment.
df['Sequence'] = df['Sequence'].apply(modify_sequences)

In [None]:
# Apply the same formatting to the anti-CRISPR sequences; anti_df['Sequence']
# is overwritten in place.
anti_df['Sequence'] = anti_df['Sequence'].apply(modify_sequences)

In [None]:
# Random 80/20 train/test split of the anti-CRISPR frame; random_state is fixed so
# the same split is produced on every run.
# NOTE(review): the previews below show rows with the same sequence prefix and length
# on both sides of the split — check for (near-)duplicate sequences leaking across it.
train_df, test_df = train_test_split(anti_df, test_size=0.2, random_state=42)

In [None]:
# Display the training split (the index keeps the row positions from anti_df)
train_df

Unnamed: 0,Sequence,Anti-CRISPR?,length
117,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAITVTPSGEVSH...,Anti,130
65,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVSPSGEVSH...,Anti,130
67,<|endoftext|>\nMDEIDDLSDLPMPRFIWGFAIFTPKGGEVMH...,Anti,128
31,<|endoftext|>\nMSSTISDRIISRSVIEAARFIQSWEDADPDN...,Anti,140
12,<|endoftext|>\nMNVFPIPDSLAEAYHGAGWALGAVLGGQLVA...,Anti,87
...,...,...,...
71,<|endoftext|>\nMDEIEDLSDLPMPRFIWGFAVIAGKSGEVMH...,Anti,126
106,<|endoftext|>\nMTTARKKFYQAISEFEAMTGKDVERTPQIAD...,Anti,130
14,<|endoftext|>\nMASVTTFPLPDNLARERIETGWALAAISGNQ...,Anti,95
92,<|endoftext|>\nMTLTDWIVDRVRHNTMNVFKVPQSLADKYHG...,Anti,103


In [None]:
# Display the test split (the index keeps the row positions from anti_df)
test_df

Unnamed: 0,Sequence,Anti-CRISPR?,length
19,<|endoftext|>\nMSKNNIFKKYPPIIHGEARGENDEFVVHTRY...,Anti,123
42,<|endoftext|>\nMDEIDDLSDLPMPRFIWGFAVIANKGGDVMH...,Anti,127
153,<|endoftext|>\nTRYQLSKLTGISQNTLNDYNKKELNKYSVSF...,Anti,134
78,<|endoftext|>\nMFKRAIIFTSFNGFEKVSQTEKRQLAKIINS...,Anti,116
145,<|endoftext|>\nMTIKLLDEFLKKHDLTRYQLSKLTGISQNTL...,Anti,149
15,<|endoftext|>\nMYNKAEIMKQAWNWFTDSNVWLSDIEWASYT...,Anti,125
24,<|endoftext|>\nMDEIDELSDLPTPRFIWGFAIAVTQSGEVSH...,Anti,130
68,<|endoftext|>\nMDEIDDLSDLPMPRFIWGFAIFTPKGGEVMH...,Anti,128
113,<|endoftext|>\nLNDYNKKELNKYSVSFLRALSMCAGISTFDV...,Anti,119
118,<|endoftext|>\nMDRLSIYELLSFVVPGVIMIELINFSAEYVF...,Anti,124


In [None]:
# Save train/test data as CSV (the row index is written as the first, unnamed column).
# NOTE(review): these relative paths write to the current working directory, unlike
# the earlier per-class saves that go under `dir` — confirm that is intended.
train_df.to_csv('train.csv')
test_df.to_csv('test.csv')

In [None]:
# function to write the sequences to a txt file
def to_txt_file(df, filename):
  """Write every entry of ``df['Sequence']`` to ``filename``.

  Each entry is followed by a newline. The file is opened in write mode, so
  any existing content is replaced.
  """
  with open(filename, 'w') as handle:
    handle.writelines(entry + '\n' for entry in df['Sequence'])

In [None]:
# Write the formatted anti-CRISPR train and test sequences to plain-text files
to_txt_file(df=train_df, filename='train.txt')
to_txt_file(df=test_df, filename='test.txt')