In [1]:
import pandas as pd

from enlarge_msa import blast
from enlarge_msa import mmseqs2

# Prepare a DataFrame from an Alphafold input CSV file

In [None]:
# Build the AlphaFold DataFrame from the input CSV; it feeds the BLAST steps below.
AF_df = blast.process_csv("b.csv")

# Set the database used for the BLAST search


In [None]:
# Path of the UniRef90 FASTA database passed to the BLAST helpers.
db = "/shared/banks/uniref90/uniref90.fasta"

# Run BLASTp searches for the sequences in a DataFrame


# Steps:

1. **Prepare FASTA Files**
    - Create FASTA files for each receptor (rec) and peptide (pep) sequence.

2. **Run BLASTp**
    - Execute BLASTp for each FASTA file.
    - Save the outputs in CSV format.

3. **Run BLAST Analysis to Prepare MMseq DataFrame**
    - For each CSV file:
        - Add the sequences for each retrieved ID using `blastdbcmd`.
    - Optional:
        - Save the modified DataFrames for the receptor or peptide using `generate_tmp_pep_df` and `generate_tmp_rec_df` **(original BLAST results + the sequence of each ID, retrieved with `blastdbcmd`)**
    - Search for the sequence with maximum coverage and length for each receptor and peptide sequence.
    - Prepare the MMseq DataFrame:
        - For each sequence, report the original receptor or peptide sequence along with the newly retrieved receptor or peptide sequence.


# We Need to Specify:

- **DataFrame**: The Alphafold DataFrame prepared using process_csv function.
- **Database**: The database to use for the BLASTp search.
- **E-value Cutoff**: The threshold for the E-value to filter the BLASTp results.
- **Number of Threads**: The number of threads to use for the BLASTp search.
- **Peptide or Receptor**: Indicate whether the BLASTp search is being performed on peptide sequences or receptor sequences.
- **Max Runs**: The maximum number of runs to perform.


In [None]:
# Run BLASTp for both the peptide and the receptor sequences of AF_df.
# NOTE(review): the positional values 10 and 12 are presumably the E-value
# cutoff and the number of threads — confirm against blast.blastp's signature.
blast.blastp(
    AF_df,
    db,
    10,
    12,
    pep=True,
    rec=True,
)

- **Outputs**: the BLAST results are saved in CSV format.

# Perform BLAST analysis

# We Need to Specify:

- **DataFrame**: The Alphafold DataFrame prepared using process_csv function.
- **Database**: The database to use for the BLASTp search.
- **Peptide or Receptor** (`pep`, `rec`): Indicate whether the BLAST analysis and the preparation of the MMseq DataFrame are performed on peptide sequences, receptor sequences, or both.
- **generate_tmp_pep_df / generate_tmp_rec_df**: Whether to save the processed DataFrame of each sequence (original BLAST results + the sequence of each ID, retrieved with `blastdbcmd`).
- **generate_mmseq_df**: Whether to save the MMseq DataFrame in CSV format.

In [None]:
# Analyse the BLAST results for both peptide and receptor sequences and build
# the DataFrame consumed by the mmseqs2 step. The intermediate per-sequence
# DataFrames are not saved (generate_tmp_*_df=False).
# The input is AF_df, the frame built by process_csv; no variable named `df`
# exists in this notebook, so passing it fails on a fresh kernel.
mmseq_df = blast.blast_analysis(
    AF_df,
    db,
    pep=True,
    rec=True,
    generate_tmp_pep_df=False,
    generate_tmp_rec_df=False,
)

- **Output**: DataFrame containing the original and the new sequences of the receptor and peptide for each PDB structure.

# Run mmseqs2

In [None]:
# For every structure in mmseq_df, build the original/new sequence lists and
# run mmseqs2 on them.
mmseq_columns = [
    "pdb",
    "original_rec_seq",
    "original_pep_seq",
    "new_rec_seq",
    "new_pep_seq",
]
for pdb, original_rec_seq, original_pep_seq, new_rec_seq, new_pep_seq in zip(
    *(mmseq_df[column] for column in mmseq_columns)
):
    # Skip structures where the new receptor or the new peptide sequence is missing.
    if pd.isna(new_rec_seq) or pd.isna(new_pep_seq):
        continue

    # The receptor columns are parsed as stringified lists ("['SEQ1', 'SEQ2']"):
    # drop the brackets and quotes, then split on ", ".
    # original_seq_list := [original receptor sequence(s)..., original_pep_seq]
    original_seq_list = original_rec_seq.strip("[]").replace("'", "").split(", ")
    original_seq_list.append(original_pep_seq)

    # new_seq_list := [new receptor sequence(s)..., new_pep_seq]
    new_seq_list = new_rec_seq.strip("[]").replace("'", "").split(", ")
    new_seq_list.append(new_pep_seq)

    # Insert each original sequence (receptor and peptide) into its new sequence.
    # A distinct index name is used so no outer loop variable is shadowed.
    for seq_idx, new_seq in enumerate(new_seq_list):
        new_seq_list[seq_idx] = mmseqs2.insert_seq(original_seq_list[seq_idx], new_seq)

    # Run mmseqs2; results are written under out_dir.
    mmseqs2.create_full_alignement(
        pdb,
        original_seq_list,
        new_seq_list,
        out_dir="tmp",
    )



**Outputs**: unpaired and paired alignments → next step: run AlphaFold

# Other hidden options

### Run blastp for fasta file

**Simple BLASTp Search**

You can run a simple BLASTp search starting from a FASTA file using the `blastp_launch` function. The function takes the following parameters:

- **fasta**: Path to the input FASTA file.
- **db**: The BLAST database to use for the search.
- **output**:  output file where the results will be saved.
- **num_threads**: The number of threads to use for the BLASTp search.
- **evalue**: The E-value cutoff for filtering the results.
- **max_runs**: The maximum number of runs to perform.


In [None]:
# Example: BLASTp search for a single FASTA file; results go to "out_seq_.csv".
# The function is documented in this notebook as `blastp_launch`; the previous
# spelling `blasp_launch` looked like a typo.
# NOTE(review): confirm the name against enlarge_msa.blast, and the order of the
# two positional 10s (num_threads, evalue) against its signature.
fasta_path = "folder1/seq.fa"
db = "/shared/banks/uniref90/uniref90.fasta"
blast.blastp_launch(fasta_path, db, "out_seq_.csv", 10, 10, max_runs=5)

### Run blastdbcmd : Fetch an amino acid sequence from the database

**Retrieve Sequences Using `blastdbcmd`**

You can retrieve sequences from a BLAST database using the `blastdbcmd` function. This function takes the following parameters:

- **query**: The query identifier(s) for the sequences to be retrieved.
- **db**: The BLAST database from which to retrieve the sequences.


In [None]:
# Example: fetch the sequence of a single identifier from the database.
query = "UniRef100_Q8DI95"
db = "/shared/banks/uniref90/uniref90.fasta"
# No trailing colon: a ':' after the call is a SyntaxError.
seq = blast.blastdbcmd(query, db)
