# %% [markdown]
# ## 1. Processamento do Arquivo GFF
# Extrair coordenadas genômicas (start, end) para os genes filtrados

In [15]:
# %%
import pandas as pd

# Carregar o arquivo GFF (substitua pelo seu caminho)
gff_path = "/Users/rvincenzi/Documents/USP/Doutorado/Tese/genomas_geral/genomica_comparativa/eggnog/p4d.fna.gff"
gff_cols = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
gff_df = pd.read_csv(gff_path, sep="\t", comment="#", header=None, names=gff_cols)

# Filtrar apenas genes (ajuste 'type' conforme seu GFF)
gene_df = gff_df[gff_df["type"].isin(["gene", "CDS"])].copy()

# Extrair o ID do gene dos atributos (ex: "ID=gene0001;Name=kdpA")
gene_df["gene_id"] = gene_df["attributes"].str.extract(r"ID=([^;]+)")

# Mostrar estrutura
print(f"Total de genes no GFF: {len(gene_df)}")
gene_df.head(3)

Total de genes no GFF: 3362


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,gene_id
0,NODE_1_length_1650131_cov_5.385719,Prodigal:002006,CDS,330.0,1175.0,.,-,0,ID=JDFEPHJD_00001;inference=ab initio predicti...,JDFEPHJD_00001
1,NODE_1_length_1650131_cov_5.385719,Prodigal:002006,CDS,1310.0,2326.0,.,-,0,ID=JDFEPHJD_00002;eC_number=1.1.1.86;Name=ilvC...,JDFEPHJD_00002
2,NODE_1_length_1650131_cov_5.385719,Prodigal:002006,CDS,2494.0,3375.0,.,+,0,ID=JDFEPHJD_00003;Name=oxyR_1;gene=oxyR_1;infe...,JDFEPHJD_00003


# %% [markdown]
# ## 2. Cruzamento com os Genes de Interesse
# Vincular as coordenadas aos genes salt-in/salt-out

In [16]:
# %%
# Carregar sua lista filtrada (resultado do passo anterior)
filtered_genes = pd.read_csv("p4d_genes_salt_filtrados.csv")

# Juntar com o GFF usando o ID do gene
merged_df = pd.merge(
    filtered_genes,
    gene_df[["gene_id", "seqid", "start", "end", "strand"]],
    left_on="query",  # Coluna com IDs no seu arquivo filtrado
    right_on="gene_id",
    how="left"
)

# Verificar genes sem coordenadas
missing_coords = merged_df[merged_df["start"].isna()]
print(f"\nGenes sem coordenadas no GFF: {len(missing_coords)}")
if len(missing_coords) > 0:
    print(missing_coords[["query", "Preferred_name"]].head())


Genes sem coordenadas no GFF: 0


In [17]:
# %%
# Criar arquivo no formato esperado pelo Proksee
proksee_df = merged_df.dropna(subset=["start"]).copy()
proksee_df["color"] = proksee_df["strategy"].map({
    "salt-in": "#3498db",  # Azul
    "salt-out": "#e74c3c"  # Vermelho
})
proksee_df["label"] = proksee_df["gene_found"]  # Ou use "Preferred_name"

# Selecionar colunas e salvar
proksee_export = proksee_df[["gene_id", "seqid", "start", "end", "strand", "color", "label"]]
proksee_export.to_csv("p4d_proksee_salt_genes.tsv", sep="\t", index=False)

print(f"✅ Arquivo gerado")
print("Exemplo das primeiras linhas:")
proksee_export.head()

✅ Arquivo gerado
Exemplo das primeiras linhas:


Unnamed: 0,gene_id,seqid,start,end,strand,color,label
0,JDFEPHJD_00029,NODE_1_length_1650131_cov_5.385719,30539.0,31675.0,-,#e74c3c,proB
1,JDFEPHJD_00399,NODE_1_length_1650131_cov_5.385719,431809.0,433095.0,-,#e74c3c,proA
2,JDFEPHJD_00429,NODE_1_length_1650131_cov_5.385719,461140.0,461748.0,+,#e74c3c,betI
3,JDFEPHJD_00430,NODE_1_length_1650131_cov_5.385719,461790.0,463274.0,+,#e74c3c,betB
4,JDFEPHJD_00999,NODE_1_length_1650131_cov_5.385719,1134976.0,1137774.0,+,#3498db,nhAB
