In [1]:
import pandas as pd
import Bio as bio

In [2]:
# Load the tab-separated input table that holds the protein accessions.
df = pd.read_csv("in.txt", delimiter="\t")

In [3]:
# Take the accession column of the input table and unpack it into a plain list.
col1 = df.loc[:, "accesion"]
lista = [*col1]
lista

['ACL94627.1', 'BCM90541.1', 'BCM93164.1', 'AIR61364.1']

In [4]:
# GENOMIC INFORMATION
import os

from Bio import SeqIO, Entrez
import pandas as pd

# Configure the Entrez contact e-mail and API key.
Entrez.email = "scminat@utn.edu.ec"
# The API key is a credential, so it is read from the NCBI_API_KEY environment
# variable instead of being committed with the notebook. When the variable is
# unset this assigns None, i.e. no key is configured.
# NOTE(review): the key that used to be hardcoded here has been published with
# the notebook and should be revoked in the NCBI account settings.
Entrez.api_key = os.environ.get("NCBI_API_KEY")

# Fetch a protein record and find the genome accession / locus tag that encode it
def get_protein_info(protein_accession):
    """Look up a protein in Entrez and report where it is encoded.

    Fetches the GenBank-format record for ``protein_accession`` from the
    ``protein`` database and reads the qualifiers of its first ``CDS``
    feature.

    Args:
        protein_accession: Protein accession to fetch, e.g. ``"ACL94627.1"``.

    Returns:
        A 4-tuple ``(genome_accession, locus_tag, ec_number, protein_sequence)``.
        ``genome_accession`` is the sequence identifier taken from the
        ``coded_by`` qualifier, ``locus_tag`` is the ``locus_tag`` qualifier,
        ``ec_number`` is always ``None`` (this function never populates it),
        and ``protein_sequence`` is the ``translation`` qualifier or ``"N/A"``
        when the CDS feature has none. Every element is ``None`` when the
        record has no ``CDS`` feature.

    Raises:
        Whatever ``Entrez.efetch`` or ``SeqIO.read`` raise (network errors,
        unparsable or empty records).
    """
    handle = Entrez.efetch(db="protein", id=protein_accession, rettype="gb", retmode="text")
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the record cannot be parsed.
        handle.close()

    genome_accession = None
    locus_tag = None
    ec_number = None  # placeholder kept for the return shape; never filled in
    protein_sequence = None

    for feature in record.features:
        if feature.type != "CDS":
            continue

        genome_accession = feature.qualifiers.get("coded_by", [None])[0]
        if genome_accession:
            # coded_by looks like "ACC:1..9", but the location may be wrapped,
            # e.g. "complement(ACC:1..9)" or "join(ACC:1..3,ACC:7..9)".
            # Keep the text before the first ':' and drop any operator prefix
            # up to the last '(' so only the accession remains.
            genome_accession = genome_accession.split(":")[0].rsplit("(", 1)[-1].strip()

        locus_tag = feature.qualifiers.get("locus_tag", [None])[0]
        # NOTE(review): if protein records do not carry a translation qualifier
        # on the CDS feature this is always "N/A" - confirm whether
        # record.seq should be used instead.
        protein_sequence = feature.qualifiers.get("translation", ["N/A"])[0]
        break  # only the first CDS feature is considered

    return genome_accession, locus_tag, ec_number, protein_sequence

# Fetch a genome record and pull out the gene / CDS that carry a given locus tag
def get_genome_info(genome_accession, locus_tag):
    """Locate a locus in a nucleotide record and extract its sequences.

    Fetches the GenBank-format record for ``genome_accession`` from the
    ``nuccore`` database, then uses the first ``gene`` feature and the first
    ``CDS`` feature whose ``locus_tag`` qualifier equals ``locus_tag``.

    Args:
        genome_accession: Nucleotide accession to fetch, e.g. ``"CP001340.1"``.
        locus_tag: Locus tag to look for among the record's features.

    Returns:
        A 5-tuple ``(locus_tag, gene_start, gene_end, nucleotide_sequence,
        translated_protein_sequence)``. ``gene_start`` / ``gene_end`` are the
        integer bounds of the gene location, ``nucleotide_sequence`` is the
        gene sequence extracted through the feature (so strand and compound
        locations are honoured), and ``translated_protein_sequence`` is the
        CDS ``translation`` qualifier, or ``"N/A"`` when the CDS has none.
        Fields whose feature is not found stay ``None``.

    Raises:
        Whatever ``Entrez.efetch`` or ``SeqIO.read`` raise (network errors,
        unparsable or empty records).
    """
    handle = Entrez.efetch(db="nuccore", id=genome_accession, rettype="gb", retmode="text")
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the record cannot be parsed.
        handle.close()

    gene_start = None
    gene_end = None
    nucleotide_sequence = None
    translated_protein_sequence = None
    gene_found = False
    cds_found = False

    # Single pass: remember the first matching gene and the first matching CDS.
    for feature in record.features:
        if feature.type not in ("gene", "CDS"):
            continue
        if feature.qualifiers.get("locus_tag", [None])[0] != locus_tag:
            continue

        if feature.type == "gene" and not gene_found:
            gene_start = int(feature.location.start)
            gene_end = int(feature.location.end)
            # Slicing record.seq[start:end] would return the forward strand
            # even for reverse-strand genes; extract() applies the location's
            # strand and joins compound locations.
            nucleotide_sequence = feature.extract(record.seq)
            gene_found = True
        elif feature.type == "CDS" and not cds_found:
            translated_protein_sequence = feature.qualifiers.get("translation", ["N/A"])[0]
            cds_found = True

        if gene_found and cds_found:
            break

    return locus_tag, gene_start, gene_end, nucleotide_sequence, translated_protein_sequence

# Main entry point: collect genome data for every protein accession and save it
def extract_genome_data(protein_accessions, output_path='out.csv'):
    """Build a table of genome information for a list of protein accessions.

    For each accession, ``get_protein_info`` supplies the genome accession and
    locus tag, and ``get_genome_info`` supplies the gene coordinates and
    sequences. Accessions without a genome accession or locus tag, and
    accessions whose processing raises, are reported with ``print`` and
    skipped so that one failure does not abort the whole run.

    Args:
        protein_accessions: Iterable of protein accessions.
        output_path: Where the resulting table is written as CSV. Defaults to
            ``'out.csv'``.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully processed
        accession. The columns are always present, even when no accession
        could be processed.
    """
    columns = [
        "Protein Accession",
        "Genome Accession",
        "Locus Tag",
        "Gene Start",
        "Gene End",
        "Nucleotide Sequence",
        "Translated Protein Sequence",
    ]
    combined_data = []

    for protein_accession in protein_accessions:
        try:
            genome_accession, locus_tag, _ec_number, _protein_sequence = get_protein_info(protein_accession)

            if not (genome_accession and locus_tag):
                print(f"No se encontró la accesión del genoma completo o el locus tag para {protein_accession}")
                continue

            locus, gene_start, gene_end, nucleotide_sequence, translated_protein_sequence = get_genome_info(genome_accession, locus_tag)

            combined_data.append({
                "Protein Accession": protein_accession,
                "Genome Accession": genome_accession,
                "Locus Tag": locus,
                "Gene Start": gene_start,
                "Gene End": gene_end,
                # Keep a missing sequence as a real missing value; str(None)
                # would store the literal text "None" in the table.
                "Nucleotide Sequence": None if nucleotide_sequence is None else str(nucleotide_sequence),
                "Translated Protein Sequence": translated_protein_sequence,
            })
        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"Error al procesar {protein_accession}: {e}")

    # Passing the column list keeps the header stable for an empty result.
    combined_df = pd.DataFrame(combined_data, columns=columns)
    combined_df.to_csv(output_path, index=False)
    print(f"Datos del genoma guardados en '{output_path}'")
    return combined_df


In [5]:
# Example usage: run the extraction over the accession list built in an earlier cell
protein_accessions = lista  # List of protein accessions
extract_genome_data(protein_accessions)

Datos del genoma guardados en 'out.csv'


Unnamed: 0,Protein Accession,Genome Accession,Locus Tag,Gene Start,Gene End,Nucleotide Sequence,Translated Protein Sequence
0,ACL94627.1,CP001340.1,CCNA_01162,1269937,1272241,ATGGTTTTGAAGACGAAGACCGGCGGCCCCGTGGGGTCGAGCCGAC...,MVLKTKTGGPVGSSRRGFLSGAAALAGLAVTAPPAFARASGRIEAL...
1,BCM90541.1,AP024152.1,IAD21_02395,2687530,2688790,ATGTCATCATCTATCTTCGAGTGCGAACACGCACCGCCGACAACCA...,MSSSIFECEHAPPTTNSLFKSFWMGGFECATHRRHDGRRLDCTAST...
2,BCM93164.1,AP024152.1,IAD21_05052,5785902,5787246,ATGGCCAAACAATTTCTGGACATCGTGAAAAAAAAGCTCGGCGAGA...,MAKQFLDIVKKKLGESDYSGDEHGGASGTDGSGLPDSYPGNFLFAT...
3,AIR61364.1,CP009458.1,LH23_12085,2535111,2537409,ATGAAATGGCTCTGTTCTGTAGGGATTGCCGTAAGCCTGGCGCTGC...,MKWLCSVGIAVSLALQPALADEMFGPHPLTPEARDAYVTDLLKKMT...
