In [None]:
import pathlib
import torch

from esm import FastaBatchedDataset, pretrained

In [None]:
import os
import pandas as pd

In [None]:
import requests

def fetch_protein_data(base_term, variants=['','a','b','c','d'], topn=200):
    """Search the NCBI protein database by title for a term and its suffixed variants.

    One ESearch request is issued per suffix in ``variants``, using the search
    term ``"{base_term}{suffix}[title]"``. The ID lists of all successful
    requests are concatenated in request order; duplicates are not removed.

    Parameters
    ----------
    base_term : str
        Stem of the search term, e.g. ``'cas1'``.
    variants : iterable of str
        Suffixes appended to ``base_term``. The default list is only read,
        never mutated.
    topn : int
        Value sent as ``retmax`` with every request.

    Returns
    -------
    tuple of (list, list)
        ``(all_results, all_search)``: the collected ID values and the decoded
        JSON payload of every request that returned HTTP 200.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    all_results = []
    all_search = []
    for variant in variants:
        term = f"{base_term}{variant}"

        params = {
            'db': 'protein',
            'term': f"{term}[title]",  # Searches only in the titles
            'retmax': topn,
            'retmode': 'json'
        }

        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(base_url, params=params, timeout=60)
        print(f"Status Code: {response.status_code}")
        if response.status_code != 200:
            # Non-200 answers are reported via the status print and skipped.
            continue

        search_data = response.json()
        # Tolerate a 200 payload that lacks 'esearchresult'/'idlist' instead of
        # aborting the whole collection run with a KeyError.
        id_list = search_data.get('esearchresult', {}).get('idlist', [])
        all_results.extend(id_list)
        all_search.append(search_data)
    return all_results,all_search
# jj,qq  =fetch_protein_data('cas1', variants=['','a','b','c','d'], topn=200)

In [None]:
from Bio import Entrez
from Bio import SeqIO

def download_protein_sequences(protein_ids, email, output_file='protein_sequences.fasta'):
    """Fetch protein records from NCBI one ID at a time and write them to a FASTA file.

    Parameters
    ----------
    protein_ids : iterable
        NCBI protein identifiers; one ``Entrez.efetch`` request is made per ID.
    email : str
        Assigned to ``Entrez.email`` (NCBI uses it for usage monitoring).
    output_file : str
        Destination path. It is opened in write mode, so an existing file is
        overwritten.

    Raises
    ------
    Exception
        Anything raised by ``Entrez.efetch`` or ``SeqIO.read`` propagates.
        Records written before the failure remain in ``output_file``.
    """
    # Set the email address to be used by NCBI for usage monitoring
    Entrez.email = email

    with open(output_file, 'w') as out_file:
        for protein_id in protein_ids:
            handle = Entrez.efetch(db='protein', id=protein_id, rettype='fasta', retmode='text')
            try:
                # The record is fully parsed here, so the handle can be
                # released before the record is written out.
                record = SeqIO.read(handle, 'fasta')
            finally:
                # Close the network handle even when parsing fails.
                handle.close()
            SeqIO.write(record, out_file, 'fasta')

    print(f"Sequences have been saved to {output_file}")

# Example usage
# protein_ids = ['YP_009724390', 'NP_000240']  # Add your list of NCBI protein IDs here
# Placeholder contact address; the visible download call further down passes its own literal instead.
email = 'your.email@example.com'  # Replace with your email
# download_protein_sequences(protein_ids, email, '/path/to/output.fasta')


In [None]:
# The Cas families to query: 'cas1' through 'cas13'.
caslist = [f"cas{number}" for number in range(1, 14)]
# caslist = ['cas9']

In [None]:
# Map every Cas family name to the protein IDs found for it and its
# 'a'-'d' suffixed variants (up to 5000 IDs requested per search).
all_cas_ids = {}
for cas in caslist:
    xx, _ = fetch_protein_data(cas, variants=['', 'a', 'b', 'c', 'd'], topn=5000)
    all_cas_ids[cas] = xx

In [None]:
# Report how many IDs were collected for each family.
for cas in caslist:
    print(f"{cas} {len(all_cas_ids[cas])}")

In [None]:


# Download one FASTA file per Cas family into that family's own folder.
for cas in caslist:
    casfolder = f"/home/salaris/protein_model/data2/{cas}/"
    casfasta = f'{casfolder}/{cas}_sequence.fasta'
    xx = all_cas_ids[cas]

    print(casfolder)
    # casfolder ends with '/', so dirname() yields the folder path itself.
    os.makedirs(os.path.dirname(casfolder), exist_ok=True)

    print(casfasta)
    download_protein_sequences(
        protein_ids=xx,
        email='sam.salari@roche.com',
        output_file=casfasta,
    )


In [None]:
# NOTE(review): this lists .../data/ while the surrounding cells read and write .../data2/ — confirm which directory is intended.
! ls /home/salaris/protein_model/data/

In [None]:
## Build the combined dataset from the downloaded FASTA files (it is split into training and validation sets further down):

In [None]:
# Gather every downloaded record as one dict per sequence, labelled with
# the Cas family whose FASTA file it came from.
data_list = []
for cas in caslist:
    casfolder = f"/home/salaris/protein_model/data2/{cas}/"
    print(casfolder)

    casfasta = f'{casfolder}/{cas}_sequence.fasta'
    # One entry per FASTA record; all fields are stored as plain strings.
    data_list.extend(
        {
            "seq": str(record.seq),
            "description": str(record.description),
            "record_id": str(record.id),
            "record_name": str(record.name),
            "class": cas,
        }
        for record in SeqIO.parse(casfasta, "fasta")
    )
        

In [None]:
# Turn the per-record dicts into a table with one row per sequence.
all_data_df = pd.DataFrame(data=data_list)

In [None]:
import datetime

# Timestamp (date plus hour) that tags this run's output file.
current_datetime = datetime.datetime.now()
date_string = f"{current_datetime:%Y%m%d_%H}"

print(date_string)
all_data_filename = f'/home/salaris/protein_model/data2/all_data_{date_string}.csv'

In [None]:
# Keep only the first row for each distinct sequence, then save the
# de-duplicated table as a tab-separated file.
all_data_df = all_data_df.drop_duplicates(subset=['seq'], keep='first')
all_data_df.to_csv(all_data_filename, sep='\t')

In [None]:
# (rows, columns) of the table at this point.
all_data_df.shape

## Split the dataset into training and validation sets:

In [None]:
# all_data_filename = "/home/salaris/protein_model/data/all_data_20240629_09.csv"

# all_data_df = pd.read_csv(all_data_filename,sep = '\t')


In [None]:
# Check the table size right before splitting (useful if the table was reloaded from disk instead).
all_data_df.shape

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Hold out 10% of the rows as the validation partition, stratified on the
# 'class' column and with a fixed seed so the split is repeatable.
target = all_data_df['class']
train, validation = train_test_split(
    all_data_df,
    test_size=0.1,
    stratify=target,
    random_state=42,
)

# `train` and `validation` now hold the two partitions.


In [None]:
# (rows, columns) of the training and validation partitions.
train.shape,validation.shape

In [None]:
# `train` is used for training and testing;
# `validation` is held back for the final validation.
#
# Both output names are derived from the dataset file name by inserting a
# suffix before its extension. Plain concatenation used to yield names such
# as '...all_data_<date>.csvtrain_test.csv'.
_all_data_base, _all_data_ext = os.path.splitext(all_data_filename)

train_filename = f'{_all_data_base}_train_test{_all_data_ext}'
train.to_csv(train_filename, sep='\t')

validation_filename = f'{_all_data_base}_final_validation{_all_data_ext}'
validation.to_csv(validation_filename, sep='\t')

In [None]:
# Show the paths the two partitions were written to.
(train_filename, validation_filename)

In [None]:
# Display the training partition for a visual sanity check.
train


In [None]:
#this function will split the original fasta into train or validation fasta files that can be later read using the esms fasta2dataset function 
from Bio import SeqIO

def split_fasta_by_ids(fasta_file, train_ids, test_ids, train_outfile, test_outfile):
    """
    Splits a FASTA file into training and validation files based on provided collections of IDs.

    Each record whose ``id`` is in ``train_ids`` goes to ``train_outfile`` and
    each record whose ``id`` is in ``test_ids`` goes to ``test_outfile``, in
    the order they appear in the input. The two checks are independent: an ID
    present in both collections is written to both files, and records matching
    neither are dropped.

    Parameters:
    - fasta_file: Path to the input FASTA file.
    - train_ids: Iterable of IDs for the training set (list, set, generator, ...).
    - test_ids: Iterable of IDs for the test set (list, set, generator, ...).
    - train_outfile: Path to the output FASTA file for the training set.
    - test_outfile: Path to the output FASTA file for the validation set.
    """
    # Sets give constant-time membership tests and make one-shot iterables
    # safe; an iterator tested with `in` would be exhausted after one record.
    train_id_set = set(train_ids)
    test_id_set = set(test_ids)

    # Single pass over the input; only the selected records are kept in memory.
    train_records = []
    test_records = []
    for record in SeqIO.parse(fasta_file, 'fasta'):
        if record.id in train_id_set:
            train_records.append(record)
        if record.id in test_id_set:
            test_records.append(record)

    # Write the records to separate fasta files
    SeqIO.write(train_records, train_outfile, 'fasta')
    SeqIO.write(test_records, test_outfile, 'fasta')

# # Example usage
# train_ids = ['id1', 'id3']  # Example training IDs
# test_ids = ['id2', 'id4']   # Example test IDs

# split_fasta_by_ids('path/to/your/input.fasta', train_ids, test_ids, 'training.fasta', 'validation.fasta')


In [None]:
# The ID collections are identical for every family, so build them once
# instead of on every iteration. Passing sets keeps the per-record `in`
# checks inside split_fasta_by_ids constant-time.
train_ids = train.record_id.to_list()
validation_ids = validation.record_id.to_list()
train_id_set = set(train_ids)
validation_id_set = set(validation_ids)

# NOTE(review): IDs are not filtered by family here, so a record ID that occurs
# in several families' FASTA files is written for each of them — confirm this is intended.
for cas in caslist:
    casfolder = f"/home/salaris/protein_model/data2/{cas}/"
    casfasta = f'{casfolder}/{cas}_sequence.fasta'
    training_fasta_file = casfolder + cas + '_training.fasta'
    validation_fasta_file = casfolder + cas + '_validation.fasta'
    split_fasta_by_ids(casfasta, train_id_set, validation_id_set, training_fasta_file, validation_fasta_file)
