In [2]:
from Bio import SeqIO #parse fasta files
import os #change directories
import time #track processing time
import pandas as pd #create dataframes
from datasets import load_dataset #load dataframe into huggingface dataset
from sklearn import preprocessing #encode labels

In [2]:
# original directory: report the working directory, move into the hmm2bert folder, report it again
project_dir = "/mnt/storage/grid/home/eric/hmm2bert"
print(os.getcwd())
os.chdir(project_dir)
print(os.getcwd())

/mnt/storage/grid/home/eric/hmm2bert
/mnt/storage/grid/home/eric/hmm2bert


In [70]:
# melanin directory: report the working directory, move into the melanin data folder, report it again
melanin_dir = "/mnt/storage/grid/home/eric/puffin/dat/melanin"
print(os.getcwd())
os.chdir(melanin_dir)
print(os.getcwd())

/mnt/storage/grid/home/eric/puffin/dat/melanin
/mnt/storage/grid/home/eric/puffin/dat/melanin


In [87]:
# ladderane directory: report the working directory, move into the ladderane data folder, report it again
ladderane_dir = "/mnt/storage/grid/home/eric/puffin/dat/ladderane"
print(os.getcwd())
os.chdir(ladderane_dir)
print(os.getcwd())

/mnt/storage/grid/home/eric/puffin/dat/melanin
/mnt/storage/grid/home/eric/puffin/dat/ladderane


# ==============================================================

In [3]:
# Recursively scan the chosen domain folder, print the name of every file
# ending in .fasta, then print how many such files were found.

folder = "melanin"
fasta_root = f"/mnt/storage/grid/home/eric/PuffinCaller/PuffinCaller/dat/fasta/{folder}"
counter = 0
for _dirpath, _subdirs, names in os.walk(fasta_root):
    fasta_names = [name for name in names if name.endswith(".fasta")]
    for name in fasta_names:
        print(name)
    counter += len(fasta_names)
print(counter)

MelC1.fasta
MelC2.fasta
2


In [None]:
# Choose the FASTA file to parse and the class/label that every sequence in it receives.
# `filename` is a relative path, so it is resolved against the current working directory.

filename, Class = "LadMT.hmm.fasta", "ladderane"

In [96]:
# Start with three empty, independent lists that the FASTA parsing step fills in parallel:
# record IDs, space-separated sequences, and class labels.

seq_ID, seq_Record, label_ID = [], [], []

In [97]:
# Parse every record of the FASTA file named by `filename` and collect three parallel lists:
#   seq_ID     - the record identifiers
#   seq_Record - each sequence with its single-letter residues separated by one space
#   label_ID   - the class label (`Class`), repeated once per record
#
# The lists are rebuilt from scratch in this cell, so running the cell again
# cannot append the same records a second time.

records = list(SeqIO.parse(filename, "fasta"))

seq_ID = [record.id for record in records]
# str() turns the Seq object into plain text; " ".join() then puts a space between every letter
seq_Record = [" ".join(str(record.seq)) for record in records]
label_ID = [Class] * len(records)

In [98]:
# Sanity check: show the first and the last parsed record.

def _show_record(position):
    """Print the ID, spaced sequence, sequence length and label stored at `position`."""
    print(seq_ID[position])
    print(seq_Record[position])
    # length of the space-separated string, so the separating spaces are counted too
    print(len(seq_Record[position]))
    print(label_ID[position])


_show_record(0)
print("")
print("")
_show_record(-1)

Lad_MT-sample1
L Y K T S S S V V E T F L L L L I D A Q E Q L P F Y Q K P Q K V R V S D E E G L E N L V Q S T T M H P L M S A L D E R E Q V A E F E I R D E V S K R V V S V D I D C G S D C S Y V L N D Y T T T K W T N D K F D I V C L S A K E I K V R N N A I L K K Y S F L D T E S E E R A E N F M A G I R Q L G L F S S E T V H C T D G T G Q V E F Q F I D S A L I K V V T A N A A N P T S R I I D F R Y G E G F D V N S K T K R L Q P I R L M R C S F E C H V F K M R A K S A L K N G N Q F P Y Q V S W V K T Y S W D V F E I L V C L P D Q W T K L V I I A Q C A N P T A P V K I K V Y K G S E L Y S V T I C G E R G I N N E S H S D Q G R I L S V D M L V R K G V L E Y K V A H A V C R A P Q G F G N Q Q L S R Q N M L H E I G Y L L E S F I Q N M P K Q K L K R K
731
ladderane


Lad_MT-sample100
G R A R P D V D F A S Q R I S K A I S S D Q F I A F K N V L R F I H N K C I L S I L E S T E E V G L I N A I G M L T D K G T T K N E D M V L T H L V K T V E D Y M R H C K I T E H Q N R M T V F F D H I A L V I K V P D S F 

In [99]:
# Build an empty DataFrame that only carries the three required column names, and show them
column_names = ['sequence_ID', 'sequences', 'labels']
df_records = pd.DataFrame(columns=column_names)
print(df_records.columns)

Index(['Sequence_ID', 'Sequences', 'Labels'], dtype='object')


In [100]:
# Bundle the three parallel lists under their DataFrame column names,
# then print how many sequences were collected.

preDF_dict = dict(sequence_ID=seq_ID, sequences=seq_Record, labels=label_ID)
print(len(preDF_dict["sequences"]))

100


In [101]:
# Turn the dict of lists into a DataFrame and preview its first and last rows,
# separated by one blank line.

df_records = pd.DataFrame(data=preDF_dict)
print(df_records.head(), "", df_records.tail(), sep="\n")

      Sequence_ID                                          Sequences  \
0  Lad_MT-sample1  L Y K T S S S V V E T F L L L L I D A Q E Q L ...   
1  Lad_MT-sample2  D H S G K T Y G I P D I K P V A R Y N I P Q E ...   
2  Lad_MT-sample3  N W S W Q P Q L E E V K A L F N V L T L Q I F ...   
3  Lad_MT-sample4  D H N G E N A T L N K C R C L I T K A K V Q K ...   
4  Lad_MT-sample5  Q N M R H S K P T N P E R E L L S S Y A S K K ...   

      Labels  
0  ladderane  
1  ladderane  
2  ladderane  
3  ladderane  
4  ladderane  

         Sequence_ID                                          Sequences  \
95   Lad_MT-sample96  K E V K T L K V N E F E K N V Q T M I L P F R ...   
96   Lad_MT-sample97  R R K A E N Q A V G V V N K L L Q T R D R K L ...   
97   Lad_MT-sample98  L S M L E R K G W A V K G S V I R A I F G K H ...   
98   Lad_MT-sample99  F N E Q E V C V P E I T T L I A Q L T P C T F ...   
99  Lad_MT-sample100  G R A R P D V D F A S Q R I S K A I S S D Q F ...   

       Labels  
95  ladde

In [102]:
# Use the sequence_ID column as the index of df_records.
# Guarded so the cell can be re-run: once sequence_ID has become the index it is
# no longer a column, and calling set_index("sequence_ID") again would raise KeyError.

if df_records.index.name != "sequence_ID":
    df_records = df_records.set_index("sequence_ID")
print(df_records.head())
print(df_records.tail())

                                                        Sequences     Labels
Sequence_ID                                                                 
Lad_MT-sample1  L Y K T S S S V V E T F L L L L I D A Q E Q L ...  ladderane
Lad_MT-sample2  D H S G K T Y G I P D I K P V A R Y N I P Q E ...  ladderane
Lad_MT-sample3  N W S W Q P Q L E E V K A L F N V L T L Q I F ...  ladderane
Lad_MT-sample4  D H N G E N A T L N K C R C L I T K A K V Q K ...  ladderane
Lad_MT-sample5  Q N M R H S K P T N P E R E L L S S Y A S K K ...  ladderane
                                                          Sequences     Labels
Sequence_ID                                                                   
Lad_MT-sample96   K E V K T L K V N E F E K N V Q T M I L P F R ...  ladderane
Lad_MT-sample97   R R K A E N Q A V G V V N K L L Q T R D R K L ...  ladderane
Lad_MT-sample98   L S M L E R K G W A V K G S V I R A I F G K H ...  ladderane
Lad_MT-sample99   F N E Q E V C V P E I T T L I A Q L T P C T F ..

In [103]:
# APPEND data to the combined csv, creating it (with a header row) if it is missing or empty

csv_path = "/mnt/storage/grid/home/eric/hmm2bert/data_prep/mel_lad.csv"

# The csv is later read back with pd.read_csv and its columns are looked up by name,
# so the header row has to be written exactly once: on the first write only.
needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

# NOTE: mode="a" always adds rows, so running this cell twice for the same
# dataframe stores the same records twice.
df_records.to_csv(csv_path, mode="a", header=needs_header)


# SAVE dataframe as a NEW csv
#df_records.to_csv("/mnt/storage/grid/home/eric/hmm2bert/data_prep/melanin_ladderane.csv")

# Encode labels

In [16]:
# Load the combined csv and swap the text class labels for integer codes.
mel_lad_csv = "/mnt/storage/grid/home/eric/hmm2bert/data_prep/mel_lad.csv"
df = pd.read_csv(mel_lad_csv)

# The fitted encoder stays bound to `le`; `new_labels` holds one integer code per row of df
le = preprocessing.LabelEncoder()
new_labels = le.fit_transform(df['labels'])

# Same three columns as the input, with the encoded labels in place of the text labels
new_label_dict = dict(
    sequence_ID=df['sequence_ID'],
    sequences=df['sequences'],
    labels=new_labels,
)
labelled_df = pd.DataFrame(data=new_label_dict)

In [17]:
# Index the encoded dataframe by sequence_ID and save it as a new csv.
# Guarded so the cell can be re-run: after the first run sequence_ID is the index,
# not a column, and a second set_index("sequence_ID") would raise KeyError.
if labelled_df.index.name != "sequence_ID":
    labelled_df = labelled_df.set_index("sequence_ID")

labelled_df.to_csv("/mnt/storage/grid/home/eric/hmm2bert/data_prep/mel_lad_labelled.csv")