In [37]:
import pathlib
import torch

from esm import FastaBatchedDataset, pretrained


# lets do PLS
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [76]:
import numpy as np

In [38]:
model_name = 'esm2_t33_650M_UR50D'

In [96]:
import os
import torch
import pandas as pd

def load_pt_files_to_dataframe(folder_path,max_samples):
    # Initialize an empty DataFrame
    df = pd.DataFrame(columns=["entry_id", "label", "mean_representation"])
    n = 0
    # Loop through all files in the specified folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".pt"):
            # Construct full file path
            file_path = os.path.join(folder_path, filename)
            # Load the PyTorch file (.pt)
            data = torch.load(file_path)
            # Append data to the DataFrame
            newrow = pd.DataFrame({
                "entry_id": data["entry_id"],
                "label": data["label"],
                "mean_representation": [np.array(data["mean_representations"][33].tolist())]
            })
            df = pd.concat([df,newrow], ignore_index=True)
            n += 1 
        if n> max_samples:
            break
    print(f"number of samples process for {cas}: {n}")
    return df

# Example usage
#folder_path = path
#dataframe = load_pt_files_to_dataframe(folder_path)
# print(dataframe)


In [97]:
caslist = ['cas1','cas2','cas3','cas4','cas5','cas6','cas7','cas8','cas9','cas10','cas11','cas12','cas13']


In [116]:
training_list = []
validation_list =[]
for cas in caslist: 
    
    casfolder = f"/home/salaris/protein_model/data/{cas}/"
    
    training_fasta_file = pathlib.Path(casfolder + cas + '_training.fasta')
    validation_fasta_file = pathlib.Path(casfolder + cas + '_validation.fasta')
    
    training_embedding_folder = pathlib.Path(casfolder  + "_" +model_name + "_" + 'embeddings/' +  'training/')
    validation_embedding_folder = pathlib.Path(casfolder  + "_" +model_name + "_" + 'embeddings/' +   'validation/')
    print(training_embedding_folder, validation_embedding_folder)
    print(training_fasta_file, validation_fasta_file)
   

    _training_df = load_pt_files_to_dataframe(training_embedding_folder,max_samples=1000)
    _validation_df = load_pt_files_to_dataframe(validation_embedding_folder,max_samples=1000)
    training_list.append(_training_df)
    validation_list.append(_validation_df)
    training_df = pd.concat(training_list, ignore_index = True )
    validation_df = pd.concat(validation_list, ignore_index = True )

/home/salaris/protein_model/data/cas1/_esm2_t33_650M_UR50D_embeddings/training /home/salaris/protein_model/data/cas1/_esm2_t33_650M_UR50D_embeddings/validation
/home/salaris/protein_model/data/cas1/cas1_training.fasta /home/salaris/protein_model/data/cas1/cas1_validation.fasta
number of samples process for cas1: 1001
number of samples process for cas1: 1001
/home/salaris/protein_model/data/cas2/_esm2_t33_650M_UR50D_embeddings/training /home/salaris/protein_model/data/cas2/_esm2_t33_650M_UR50D_embeddings/validation
/home/salaris/protein_model/data/cas2/cas2_training.fasta /home/salaris/protein_model/data/cas2/cas2_validation.fasta
number of samples process for cas2: 1001
number of samples process for cas2: 500
/home/salaris/protein_model/data/cas3/_esm2_t33_650M_UR50D_embeddings/training /home/salaris/protein_model/data/cas3/_esm2_t33_650M_UR50D_embeddings/validation
/home/salaris/protein_model/data/cas3/cas3_training.fasta /home/salaris/protein_model/data/cas3/cas3_validation.fasta
num

# build the model:

In [117]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,balanced_accuracy_score,f1_score

from sklearn.preprocessing import LabelEncoder


# Example data loading
# Assuming 'df' is your DataFrame containing the features and the label
# df = pd.read_csv('path_to_your_data.csv')  # Load your data here

# Splitting the data into features and target
# Splitting the data into features and target
X = np.array(training_df['mean_representation'].tolist())
y = training_df['label']

# Encoding categorical target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a Gradient Boosting Classifier
# gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# modelx = GradientBoostingClassifier()
modelx = RandomForestClassifier()

# Train the model
modelx.fit(X_train, y_train)

# Predict on the testing set
y_pred = modelx.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
bacc = balanced_accuracy_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred,average='weighted')
print(f'Accuracy: {accuracy}')
print(f'balanced accuracy:{bacc}' )
print(f'f1 score:{f1}' )

print('Confusion Matrix:')
print(conf_mat)


Accuracy: 0.9837925445705025
balanced accuracy:0.97849752899133
f1 score:0.9838121116051421
Confusion Matrix:
[[191   0   0   0   0   1   0   0   0   0   2   0   0]
 [  1  17   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0  16   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  26   0   0   0   0   0   0   1   0   0]
 [  0   0   0   0 134   0   0   1   0   0   0   0   0]
 [  0   0   0   0   0 197   0   0   0   0   2   0   0]
 [  0   0   0   0   0   0 198   4   0   0   0   0   0]
 [  2   0   0   0   0   1   2 186   0   0   0   0   0]
 [  0   0   0   0   0   2   0   0 206   0   0   0   0]
 [  0   0   0   0   1   2   0   0   0 200   0   0   0]
 [  0   0   0   0   1   2   0   0   0   1 204   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  45   0]
 [  0   0   0   0   3   0   0   0   0   0   0   0 201]]
