# 4. Custom sequence SASA prediction

## Setup

In [1]:
import math
import os
import pickle
import sys; sys.path.append('../..')

# import abnumber
import numpy as np
import pandas as pd
import subprocess

import bin.params as p
import bin.utils as u
import bin.feature_generators as fg

from Bio import SeqIO
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from tqdm.notebook import tqdm

In [2]:
# Directory with the pickled, already trained models, located under the project data dir (p.DATA_DIR).
TRAINED_MODELS_DIR_PATH = f'{p.DATA_DIR}/pickles/trained-test-models'

# Echo the resolved path as the cell output.
(TRAINED_MODELS_DIR_PATH, )

('../../data/pickles/trained-test-models',)

In [3]:
# Pipeline used by the "load the trained model" cell: together these two values select
# the file <FEATURES>_<MODEL_NAME>.p inside TRAINED_MODELS_DIR_PATH.
# MODEL_NAME is spelled with the exact on-disk capitalisation ('randomForestN30') so the
# file is also found on case-sensitive file systems.
MODEL_NAME = 'randomForestN30'
FEATURES = 'lco_cont_window_r3_all_H'
# Options forwarded to fg.generate when features are built for prediction.
PARAMS = {'compress': False, 'preserve_seq_ids': True}

In [4]:
# List the pickled models currently available in the trained-models directory.
# The model-loading code in this notebook expects names of the form <features>_<model_name>.p.
os.listdir(TRAINED_MODELS_DIR_PATH)

['lco_cont_window_r3_all_H_randomForestN30.p',
 'lco_whole_sequence_all_H_BLmeansamerespos.p',
 'lco_cont_window_r0_all_H_randomForestN30.p',
 'lco_whole_sequence_all_H_BLknnwholeseqn10.p',
 'lco_cont_window_r4_all_H_randomForestN30.p',
 'lco_cont_window_r4_all_H_randomForestN5.p',
 'lco_cont_window_r2_all_H_randomForestN30.p',
 'lco_cont_window_r3_all_H_randomForestN5.p',
 'lco_whole_sequence_all_H_BLmediansamerespos.p',
 'lco_whole_sequence_all_H_BLavgpos.p',
 'lco_whole_sequence_all_H_BLknnwholeseqn3.p',
 'lco_cont_window_r1_all_H_randomForestN5.p',
 'lco_cont_window_r2_all_H_randomForestN5.p',
 'lco_cont_window_r1_all_H_randomForestN30.p']

In [5]:
# Load the 'test' dataset split as the pair (X_orig, Y_orig) returned by u.load_dataset.
# NOTE(review): the log printed by this cell reports chains L, while every pipeline used
# later in this notebook is an "_H" one - confirm that this split is still needed here.
X_orig, Y_orig = u.load_dataset('test')

load_dataset: test, metadata file path: ../../data/csv/metadata/metadata_L.csv, chains: L, shape: (643, 19)
load_dataset: test, X file path: ../../data/csv/fasta_aligned_cleaned/fasta_aho_L.csv, chains: L, shape: (405, 154)
load_dataset: test, Y file path: ../../data/csv/sasa_aligned/sasa_L.csv, chains: L, shape: (405, 154)


In [6]:
# Unpickle the model selected by FEATURES and MODEL_NAME from the trained-models directory.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
model = None
model_file_path = f'{TRAINED_MODELS_DIR_PATH}/{FEATURES}_{MODEL_NAME}.p'
with open(model_file_path, mode='rb') as model_fh:
    print('model loaded from:', model_file_path)
    model = pickle.load(model_fh)

model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r3_all_H_randomforestN30.p


## Define your sequence

In [7]:
# either provide a FASTA file
# or just pass the sequence directly

In [19]:
def get_model(model_name, features):
    """Load the pickled trained model of one (features, model_name) pipeline.

    The model is expected at ``{TRAINED_MODELS_DIR_PATH}/{features}_{model_name}.p``.
    When no file with exactly that name exists, the directory is searched for a
    single file whose name is equal when letter case is ignored. This lets a
    name such as ``randomforestN30`` resolve to ``..._randomForestN30.p`` on a
    case-sensitive file system as well.

    Args:
        model_name: model part of the pickle file name, e.g. ``'randomForestN30'``.
        features: feature-set part of the pickle file name,
            e.g. ``'lco_cont_window_r3_all_H'``.

    Returns:
        The object stored in the pickle file.

    Raises:
        FileNotFoundError: if the models directory or the model file is missing.
    """
    print('model_name:', model_name, '| features:', features)

    file_name = f'{features}_{model_name}.p'
    model_file_path = f'{TRAINED_MODELS_DIR_PATH}/{file_name}'
    if not os.path.isfile(model_file_path):
        # Fall back to a case-insensitive match, but only when it is unambiguous.
        wanted = file_name.lower()
        candidates = [name for name in os.listdir(TRAINED_MODELS_DIR_PATH)
                      if name.lower() == wanted]
        if len(candidates) == 1:
            model_file_path = f'{TRAINED_MODELS_DIR_PATH}/{candidates[0]}'

    print('model loaded from:', model_file_path)
    # NOTE: pickle.load can execute arbitrary code - only load model files you trust.
    with open(model_file_path, 'rb') as trained_model_file:
        return pickle.load(trained_model_file)

def save_fasta(seq_id, seq_data, seq_desc=''):
    """Write a single sequence to ``seq_<seq_id>.fasta`` in the working directory.

    Args:
        seq_id: identifier of the record; also used to build the file name.
        seq_data: the sequence itself, as a string.
        seq_desc: optional description stored with the record.

    Returns:
        Path of the FASTA file that was written.
    """
    input_file_path = f"seq_{seq_id}.fasta"

    # Wrap the raw string in a Biopython record and serialise it.
    record = SeqRecord(Seq(seq_data), id=seq_id, description=seq_desc)
    SeqIO.write(record, input_file_path, "fasta")

    print(f"FASTA file saved as {input_file_path}")
    return input_file_path

def _run_anarci(fasta_path, anarci_output_path, scheme):
    """Number the sequences of `fasta_path` with ANARCI and return the heavy-chain CSV path."""
    heavy_csv_path = f'{anarci_output_path}_H.csv'
    # Remove output left over from an earlier run, so that a failed or empty
    # ANARCI run can never be mistaken for a fresh result.
    if os.path.exists(heavy_csv_path):
        os.remove(heavy_csv_path)

    # An argument list (no shell) keeps paths with spaces or shell metacharacters intact.
    anarci_args = ['anarci', '-i', fasta_path, '-o', anarci_output_path,
                   '--csv', f'--scheme={scheme}']
    print('ANARCI command:', ' '.join(anarci_args))
    result = subprocess.run(anarci_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        raise RuntimeError(
            f'ANARCI failed with exit code {result.returncode}: {result.stderr.strip()}')
    if not os.path.exists(heavy_csv_path):
        raise FileNotFoundError(
            f'ANARCI did not write {heavy_csv_path}; no heavy-chain sequence was numbered')
    return heavy_csv_path


def predict(fasta_path, anarci_output_path, model_name='randomforestN30', features='lco_cont_window_r3_all_H'):
    """Predict per-position SASA values for the heavy-chain sequences of a FASTA file.

    Steps: load the trained model, number the sequences with ANARCI (scheme
    ``p.FINAL_NUMBERING_SCHEME``), align the numbered table to the position
    columns of the ``test_new_234`` heavy-chain dataset, generate features with
    ``fg.generate`` and run the model.

    Args:
        fasta_path: path of the input FASTA file.
        anarci_output_path: output prefix handed to ANARCI; the numbered heavy
            chains are read from ``<anarci_output_path>_H.csv``. An existing
            file of that name is deleted before ANARCI runs.
        model_name: model part of the trained pipeline name.
        features: feature-set part of the trained pipeline name.

    Returns:
        DataFrame indexed by sequence id with one column per numbering
        position, rounded to two decimals. Predictions equal to -1 are
        returned as NaN.

    Raises:
        RuntimeError: if ANARCI exits with a non-zero status.
        FileNotFoundError: if ANARCI (or its heavy-chain output) is missing.
    """
    model = get_model(model_name, features)

    # Read the ANARCI output by its known name instead of guessing that it is
    # the most recently modified file in the working directory.
    numbered_csv_path = _run_anarci(fasta_path, anarci_output_path, p.FINAL_NUMBERING_SCHEME)
    print('ANARCI output file:', numbered_csv_path)
    numbered_df = pd.read_csv(numbered_csv_path, index_col=0)
    print('numbered_df.shape:', numbered_df.shape)

    # Keep only the numbering-position columns; the sequence ids stay in the index.
    position_columns = u.get_position_columns(numbered_df)
    cols_to_remove = [c for c in numbered_df if c not in position_columns]
    numbered_df = numbered_df.drop(columns=cols_to_remove)
    sequence_ids = numbered_df.index
    print(numbered_df.head(n=1))

    # The reference dataset defines the column layout the features are built from.
    Xr, Yr = u.load_dataset('test_new_234', chains='H')
    Xr.index = Xr['Id']; Xr = Xr.drop(columns=['Id'])
    print(Xr.head(n=1))

    # Positions known to the reference data but absent from this input become gaps.
    # Iterating over Xr.columns (not a set) keeps the order deterministic.
    missing_cols = [c for c in Xr.columns if c not in numbered_df.columns]
    print('columns to add:', missing_cols)
    for col in missing_cols:
        numbered_df[col] = '-'
    numbered_df = u.sort_numbering_columns(numbered_df).reset_index() # (BLmeansaremerespos)
    print('numbered_df.shape:', numbered_df.shape)
    X_custom = numbered_df
    print(X_custom.head(n=1))

    # transform
    X_custom_final, _, _ = fg.generate(X_custom, Y=None, c=None,
                                       model_name=model_name, features=features, params=PARAMS)

    last_column = X_custom_final.columns[-1]
    print('Removing last column from X_custom_final. it probably contains IDs')
    print(X_custom_final[last_column])
    X_custom_final = X_custom_final.drop(columns=[last_column])

    print('X_custom_final right before the prediction:', X_custom_final.head(n=1))

    predictions = model.predict(X_custom_final)
    predictions[predictions == -1] = np.nan
    print('len(predictions):', len(predictions))

    if isinstance(predictions, np.ndarray):
        N_SEQUENCES = X_custom.shape[0]
        ids = list(sequence_ids)
        print('predictions type: np.ndarray | N_SEQUENCES:', N_SEQUENCES, '| len(ids):', len(ids))
        # Start from an all-NaN frame that has the reference position columns.
        # Seeding it from rows of Yr would expose true SASA values of unrelated
        # sequences in any cell that is not overwritten, and would fail for
        # inputs with more sequences than Yr has rows.
        Y_pred = pd.DataFrame(np.nan, index=ids, columns=Yr.columns.drop('Id'))
        # The flat array is position-major: all sequences for the first
        # position, then all sequences for the second position, and so on.
        for i, value in tqdm(enumerate(predictions), total=len(predictions),
                             desc='Processing individual predictions...'):
            x_index, seq_index = divmod(i, N_SEQUENCES)
            pos_id = numbered_df.columns[x_index + 1] # starting from 1 as 0 is 'Id'
            Y_pred.loc[ids[seq_index], pos_id] = value
        predictions = Y_pred
    else:
        # dataframe
        assert isinstance(predictions, pd.DataFrame)
        print('predictions type: pd.DataFrame')
        assert predictions.index[0] == 0
        predictions.index = sequence_ids

    return predictions.round(2)

def predict_single_sequence(seq_id, seq_data, seq_desc='', model_name='randomforestN30', features='lco_cont_window_r3_all_H'):
    """Predict SASA values for one sequence given as a plain string.

    The sequence is first written to ``seq_<seq_id>.fasta`` via ``save_fasta``;
    that file is then passed to ``predict`` with the ANARCI output prefix
    ``seq_<seq_id>_numbered``.

    Args:
        seq_id: identifier of the sequence; used in the generated file names.
        seq_data: the sequence itself.
        seq_desc: optional description stored in the FASTA record.
        model_name: model part of the trained pipeline name.
        features: feature-set part of the trained pipeline name.

    Returns:
        Whatever ``predict`` returns for the generated FASTA file.
    """
    return predict(
        save_fasta(seq_id, seq_data, seq_desc),
        f"seq_{seq_id}_numbered",
        model_name,
        features,
    )

In [20]:
# Example: predict SASA for one sequence given as a plain string, using the
# 'BLmeansamerespos' model with the 'lco_whole_sequence_all_H' features.
sequence = 'VQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLEWIGYISPGNGDIKYNEKFKGKATLTADKSSSTAYMQLNSLTSEDSAVYLCKRGYYVDYWGQGTTLTVSSAKTTPPSVYPLAPSMVTLGCLVKGYFPEPVTVTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVPSSTWPSETVTCNVAHPASSTKVDKKIE'
predict_single_sequence("roman_seq", sequence, '', 'BLmeansamerespos', 'lco_whole_sequence_all_H')

FASTA file saved as seq_roman_seq.fasta
model_name: BLmeansamerespos | features: lco_whole_sequence_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_whole_sequence_all_H_BLmeansamerespos.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file pat

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
roman_seq,,33.06,54.88,5.95,54.57,7.57,42.21,,67.0,55.04,...,61.65,19.12,8.42,,40.87,3.1,18.14,5.22,22.0,71.71


In [21]:
# Example: predict from a FASTA file with the 'BLmeansamerespos' model.
# NOTE(review): no cell shown in this notebook writes seq_seq1.fasta - presumably it
# must already exist in the working directory; confirm.
predict("seq_seq1.fasta", "seq_seq1_numbered", 'BLmeansamerespos', 'lco_whole_sequence_all_H')

model_name: BLmeansamerespos | features: lco_whole_sequence_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_whole_sequence_all_H_BLmeansamerespos.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa_H.csv, chains: H, shape: (888, 165)
  

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq1,,33.06,54.88,5.95,54.57,7.57,42.21,,67.0,55.04,...,61.65,19.12,8.42,,40.87,3.1,18.14,5.22,22.0,71.71
seq2,,33.06,54.88,5.95,54.57,7.57,42.21,,67.0,55.04,...,61.65,19.12,8.42,,40.87,3.1,18.14,5.22,22.0,71.71


In [22]:
# Same FASTA file, this time with predict's default model_name and features.
predict("seq_seq1.fasta", "seq_seq1_numbered")

model_name: randomforestN30 | features: lco_cont_window_r3_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r3_all_H_randomforestN30.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa_H.csv, chains: H, shape: (888, 165)
    

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
seq1,,58.31,64.64,5.53,54.27,7.97,32.6,,65.54,44.88,...,61.28,17.62,6.69,,41.71,3.05,20.44,5.24,21.74,72.09
seq2,,58.31,64.64,5.53,54.27,7.97,32.6,,65.54,44.88,...,61.28,17.62,6.69,,41.71,3.05,20.44,5.24,21.74,72.09


In [25]:
# Smoke test: call both prediction entry points once for every trained pipeline
# to check that nothing raises.
pipeline_names = [
    'lco_cont_window_r3_all_H_randomForestN30.p',
    'lco_whole_sequence_all_H_BLmeansamerespos.p',
    'lco_cont_window_r0_all_H_randomForestN30.p',
    'lco_whole_sequence_all_H_BLknnwholeseqn10.p',
    'lco_cont_window_r4_all_H_randomForestN30.p',
    'lco_cont_window_r4_all_H_randomForestN5.p',
    'lco_cont_window_r2_all_H_randomForestN30.p',
    'lco_cont_window_r3_all_H_randomForestN5.p',
    'lco_whole_sequence_all_H_BLmediansamerespos.p',
    'lco_whole_sequence_all_H_BLavgpos.p',
    'lco_whole_sequence_all_H_BLknnwholeseqn3.p',
    'lco_cont_window_r1_all_H_randomForestN5.p',
    'lco_cont_window_r2_all_H_randomForestN5.p',
    'lco_cont_window_r1_all_H_randomForestN30.p',
]
# The same example sequence is used for every pipeline.
sequence = 'VQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLEWIGYISPGNGDIKYNEKFKGKATLTADKSSSTAYMQLNSLTSEDSAVYLCKRGYYVDYWGQGTTLTVSSAKTTPPSVYPLAPSMVTLGCLVKGYFPEPVTVTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVPSSTWPSETVTCNVAHPASSTKVDKKIE'
for pipeline_name in pipeline_names:
    # File names look like <features>_<model_name>.p - split at the last underscore.
    features, model_name = pipeline_name.rsplit('_', 1)
    if model_name.endswith('.p'):
        model_name = model_name[:-2]
    print(model_name, features)
    predict("seq_seq1.fasta", "seq_seq1_numbered", model_name, features)
    predict_single_sequence("roman_seq", sequence, '', model_name, features)

randomForestN30 lco_cont_window_r3_all_H
model_name: randomForestN30 | features: lco_cont_window_r3_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r3_all_H_randomForestN30.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN30 | features: lco_cont_window_r3_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r3_all_H_randomForestN30.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path:

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

BLmeansamerespos lco_whole_sequence_all_H
model_name: BLmeansamerespos | features: lco_whole_sequence_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_whole_sequence_all_H_BLmeansamerespos.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/s

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN30 | features: lco_cont_window_r0_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r0_all_H_randomForestN30.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path:

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

BLknnwholeseqn10 lco_whole_sequence_all_H
model_name: BLknnwholeseqn10 | features: lco_whole_sequence_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_whole_sequence_all_H_BLknnwholeseqn10.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/s

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN30 | features: lco_cont_window_r4_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r4_all_H_randomForestN30.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path:

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

randomForestN5 lco_cont_window_r4_all_H
model_name: randomForestN5 | features: lco_cont_window_r4_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r4_all_H_randomForestN5.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa_H.

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN5 | features: lco_cont_window_r4_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r4_all_H_randomForestN5.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: .

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

randomForestN30 lco_cont_window_r2_all_H
model_name: randomForestN30 | features: lco_cont_window_r2_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r2_all_H_randomForestN30.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN30 | features: lco_cont_window_r2_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r2_all_H_randomForestN30.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path:

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

randomForestN5 lco_cont_window_r3_all_H
model_name: randomForestN5 | features: lco_cont_window_r3_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r3_all_H_randomForestN5.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa_H.

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN5 | features: lco_cont_window_r3_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r3_all_H_randomForestN5.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: .

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

BLmediansamerespos lco_whole_sequence_all_H
model_name: BLmediansamerespos | features: lco_whole_sequence_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_whole_sequence_all_H_BLmediansamerespos.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN5 | features: lco_cont_window_r1_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r1_all_H_randomForestN5.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: .

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

randomForestN5 lco_cont_window_r2_all_H
model_name: randomForestN5 | features: lco_cont_window_r2_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r2_all_H_randomForestN5.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa_H.

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN5 | features: lco_cont_window_r2_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r2_all_H_randomForestN5.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: .

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]

randomForestN30 lco_cont_window_r1_all_H
model_name: randomForestN30 | features: lco_cont_window_r1_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r1_all_H_randomForestN30.p
ANARCI command: anarci -i seq_seq1.fasta -o seq_seq1_numbered --csv --scheme=aho
ROMAN newest file: seq_seq1_numbered_H.csv
newest_file_df.shape: (2, 161)
      1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147 148  \
Id                                  ...                                       
seq1  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   S   

     149  
Id        
seq1   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa

Processing individual predictions...:   0%|          | 0/328 [00:00<?, ?it/s]

FASTA file saved as seq_roman_seq.fasta
model_name: randomForestN30 | features: lco_cont_window_r1_all_H
model loaded from: ../../data/pickles/trained-test-models/lco_cont_window_r1_all_H_randomForestN30.p
ANARCI command: anarci -i seq_roman_seq.fasta -o seq_roman_seq_numbered --csv --scheme=aho
ROMAN newest file: seq_roman_seq_numbered_H.csv
newest_file_df.shape: (1, 161)
           1  2  3  4  5  6  7  8  9 10  ... 140 141 142 143 144 145 146 147  \
Id                                       ...                                   
roman_seq  -  V  Q  L  Q  E  S  -  D  A  ...   G   Q   G   T   T   L   T   V   

          148 149  
Id                 
roman_seq   S   S  

[1 rows x 149 columns]
load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path:

Processing individual predictions...:   0%|          | 0/164 [00:00<?, ?it/s]