In [1]:
import pandas as pd

# Function to read a PTT file
def read_ptt_data(file_path):
    """Load a tab-separated PTT protein table into a DataFrame.

    The first three lines of the file are skipped and the remaining rows
    are read without a header row; the nine column names are assigned
    explicitly.

    Args:
        file_path: Path of the PTT file to open in text mode.

    Returns:
        A pandas DataFrame with the columns Locations, Directions, Sizes,
        ProteinIDs, GeneNames, Synonyms, Codes, COGs and Descriptions.
    """
    column_names = [
        "Locations",
        "Directions",
        "Sizes",
        "ProteinIDs",
        "GeneNames",
        "Synonyms",
        "Codes",
        "COGs",
        "Descriptions",
    ]
    with open(file_path, 'rt') as handle:
        return pd.read_csv(
            handle,
            sep='\t',
            skiprows=3,
            header=None,
            names=column_names,
        )


# Function to find operons
def find_operons(ptt_df, max_gap=50):
    """Group consecutive same-strand genes of a PTT table into operons.

    Two consecutive rows are linked when their 'Directions' values are
    equal and the start of the second gene minus the end of the first is
    at most ``max_gap``. Runs of two or more linked genes are reported;
    single genes are dropped.

    Args:
        ptt_df: DataFrame with a 'Locations' column holding 'start..end'
            strings and a 'Directions' column. Rows are compared in their
            existing order (presumably sorted by coordinate; this function
            does not sort them).
        max_gap: Largest allowed distance between the end of one gene and
            the start of the next. Defaults to 50, the previously
            hard-coded value.

    Returns:
        A list of operons, each a list of row Series in table order.
        An empty table yields an empty list.

    Raises:
        ValueError: If a 'Locations' entry does not contain integer
            coordinates.
    """
    # An empty table has no first row to seed the scan with.
    if len(ptt_df) == 0:
        return []

    rows = [ptt_df.iloc[i] for i in range(len(ptt_df))]
    operon_list = []
    tmp_operon = [rows[0]]

    for previous_gene, current_gene in zip(rows, rows[1:]):
        end_prev = int(previous_gene['Locations'].split('..')[-1])
        start_curr = int(current_gene['Locations'].split('..')[0])

        same_strand = current_gene['Directions'] == previous_gene['Directions']
        if same_strand and (start_curr - end_prev) <= max_gap:
            tmp_operon.append(current_gene)
            continue

        # The run is broken: keep it only if it holds more than one gene,
        # then start a new run at the current gene.
        if len(tmp_operon) > 1:
            operon_list.append(tmp_operon)
        tmp_operon = [current_gene]

    # Flush the run that was still open when the table ended.
    if len(tmp_operon) > 1:
        operon_list.append(tmp_operon)

    return operon_list

# Function to process PTT files
def process_ptt_files(file_paths):
    """Print the operons detected in each of the given PTT files.

    For every path the file is loaded, operons are detected, and a
    summary line followed by one line per operon is printed. Genes whose
    'GeneNames' value is '-' are left out of the printed list, so an
    operon made up only of such genes prints with an empty name list.

    Args:
        file_paths: Iterable of PTT file paths.
    """
    for path in file_paths:
        operons_found = find_operons(read_ptt_data(path))
        print(f"Found {len(operons_found)} operons in {path}.")

        for number, operon in enumerate(operons_found, start=1):
            named_genes = [
                gene['GeneNames']
                for gene in operon
                if gene['GeneNames'] != '-'
            ]
            print(f"Operon {number}: {', '.join(named_genes)}")

# PTT files to analyse. The paths are passed unchanged to
# process_ptt_files, so they are resolved relative to the current
# working directory.
ptt_file_paths = [
    "Halobacterium_NRC1.ptt",
    "Synechocystis_PCC6803_uid159873.ptt",
    "B_subtilis_168.ptt",
    "E_coli_K12_MG1655.ptt"
]

# Detect operons in each PTT file and print them. This runs at module
# level, i.e. as soon as this code is executed.
process_ptt_files(ptt_file_paths)

# Function to read a GFF file
def read_gff_data(gff_path):
    """Parse the feature lines of a GFF file into a DataFrame.

    Comment lines (starting with '#') and blank lines are skipped. Lines
    with fewer than eight tab-separated columns are skipped as well, so
    that non-feature content such as sequence data following a
    '##FASTA' directive does not abort the parse.

    The attributes column is split on ';' into 'key=value' pairs. Only
    the first '=' separates key from value, so values that themselves
    contain '=' are kept intact; items without '=' are ignored.

    Args:
        gff_path: Path of the GFF file to read.

    Returns:
        A pandas DataFrame with the columns SeqID, Start, End, Strand,
        ID, LocusTag and Product. Missing ID, locus_tag or product
        attributes are reported as 'Unknown'.

    Raises:
        ValueError: If the start or end column of a feature line is not
            an integer.
    """
    gff_records = []
    with open(gff_path, 'r') as file:
        for line in file:
            if line.startswith('#') or not line.strip():
                continue

            fields = line.strip().split('\t')
            if len(fields) < 8:
                # Not a feature record (e.g. FASTA header or sequence).
                continue

            # Stripping the line drops a trailing empty attributes
            # column, so treat a missing ninth column as "no attributes".
            raw_attributes = fields[8] if len(fields) > 8 else ''

            attribute_dict = {}
            for item in raw_attributes.split(';'):
                key, separator, value = item.strip().partition('=')
                if separator:
                    attribute_dict[key] = value

            gff_records.append([
                fields[0],
                int(fields[3]),
                int(fields[4]),
                fields[6],
                attribute_dict.get('ID', 'Unknown'),
                attribute_dict.get('locus_tag', 'Unknown'),
                attribute_dict.get('product', 'Unknown'),
            ])

    gff_df = pd.DataFrame(
        gff_records,
        columns=['SeqID', 'Start', 'End', 'Strand', 'ID', 'LocusTag', 'Product'],
    )
    return gff_df

# Function to identify operons from GFF data
def find_operons_in_gff(gff_df, max_gap=50):
    """Group consecutive same-strand GFF features into operons.

    Two consecutive rows are linked when they have the same 'Strand',
    the 'Start' of the second minus the 'End' of the first is at most
    ``max_gap``, and, if the frame has a 'SeqID' column, they lie on the
    same sequence. Runs of two or more linked rows are reported.

    Args:
        gff_df: DataFrame with 'Start', 'End' and 'Strand' columns and
            optionally 'SeqID'. Rows are compared in their existing
            order; this function does not sort them.
        max_gap: Largest allowed distance between the end of one feature
            and the start of the next. Defaults to 50, the previously
            hard-coded value.

    Returns:
        A list of operons, each a list of row Series in table order.
        An empty frame yields an empty list.
    """
    # An empty frame has no first row to seed the scan with.
    if len(gff_df) == 0:
        return []

    # Coordinates on different sequences are not comparable, so features
    # are only linked within one SeqID when that column is available.
    has_seq_id = 'SeqID' in gff_df.columns

    rows = [gff_df.iloc[i] for i in range(len(gff_df))]
    gff_operons = []
    current_gff_operon = [rows[0]]

    for prev_gene, curr_gene in zip(rows, rows[1:]):
        same_sequence = (not has_seq_id) or curr_gene['SeqID'] == prev_gene['SeqID']
        linked = (
            same_sequence
            and curr_gene['Strand'] == prev_gene['Strand']
            and (curr_gene['Start'] - prev_gene['End']) <= max_gap
        )
        if linked:
            current_gff_operon.append(curr_gene)
            continue

        if len(current_gff_operon) > 1:
            gff_operons.append(current_gff_operon)
        # Always restart the run here. Restarting only after a multi-gene
        # run would leave a stale single gene at the head of the next run.
        current_gff_operon = [curr_gene]

    # Flush the run that was still open when the table ended.
    if len(current_gff_operon) > 1:
        gff_operons.append(current_gff_operon)

    return gff_operons

# Function to print the operons found in a GFF file
def display_operons_gff(operons):
    """Print an operon count followed by one line per operon.

    Each gene is rendered as its 'LocusTag' followed by its 'Product'
    in parentheses; the genes of an operon are separated by ', '.

    Args:
        operons: Sequence of operons, each an iterable of mappings that
            provide string 'LocusTag' and 'Product' entries.
    """
    print(f"Found {len(operons)} operons.")

    for number, operon in enumerate(operons, start=1):
        descriptions = []
        for gene in operon:
            descriptions.append(gene['LocusTag'] + " (" + gene['Product'] + ")")
        operon_genes = ', '.join(descriptions)
        print(f"Operon {number}: {operon_genes}")

# Function to process a GFF file and find operons
def process_gff_file(gff_file_path):
    """Read a GFF file, detect its operons and print them.

    Args:
        gff_file_path: Path of the GFF file to analyse.
    """
    display_operons_gff(find_operons_in_gff(read_gff_data(gff_file_path)))

# GFF file to analyse. The path is passed unchanged to process_gff_file,
# so it is resolved relative to the current working directory.
gff_file_path = "2088090036.gff"

# Read the GFF file, detect operons and print them. This runs at module
# level, i.e. as soon as this code is executed.
process_gff_file(gff_file_path)




Found 396 operons in Halobacterium_NRC1.ptt.
Operon 1: yvrO
Operon 2: glmS, graD5, graD2
Operon 3: 
Operon 4: 
Operon 5: 
Operon 6: 
Operon 7: 
Operon 8: 
Operon 9: 
Operon 10: 
Operon 11: 
Operon 12: moeA2, moeA1
Operon 13: pimT1
Operon 14: rmeM, rmeS
Operon 15: rmeR
Operon 16: 
Operon 17: 
Operon 18: trp1
Operon 19: 
Operon 20: hpyA, aup
Operon 21: 
Operon 22: 
Operon 23: 
Operon 24: kdgK, mutL
Operon 25: gdhB, alkK
Operon 26: cat1, trkA1
Operon 27: lpl, xthA
Operon 28: pepB1
Operon 29: 
Operon 30: 
Operon 31: 
Operon 32: 
Operon 33: 
Operon 34: guaAa
Operon 35: 
Operon 36: rpl37ae
Operon 37: truD
Operon 38: dcd
Operon 39: fbr
Operon 40: 
Operon 41: 
Operon 42: 
Operon 43: 
Operon 44: 
Operon 45: crtI3
Operon 46: 
Operon 47: 
Operon 48: 
Operon 49: 
Operon 50: trpC, trpB, trpA
Operon 51: ids, pykA
Operon 52: gadD, caaX, ppsA
Operon 53: smc1
Operon 54: 
Operon 55: 
Operon 56: nfi
Operon 57: 
Operon 58: nusG, secE
Operon 59: ftsZ1
Operon 60: aroE, trpE2, trpG2, ilvE1
Operon 61: 
Operon