In [5]:
import pandas as pd
import json
import os
import re
from collections import OrderedDict

# Directory that receives one JSON file per workbook sheet.
# exist_ok=True makes re-running this cell safe when the directory
# already exists.
GENERATED_FILES_DIR = 'generated_files'
os.makedirs(GENERATED_FILES_DIR, exist_ok=True)

# Open the workbook once; the same ExcelFile handle is used below to list
# the sheet names and to read each sheet.
file_path = 'Genscript codon DB.xlsx'
excel_data = pd.ExcelFile(file_path)

# Function to extract and clean the species name from the sheet name
def clean_species_name(sheet_name):
    """Turn a worksheet title into a token usable inside a file name.

    Surrounding whitespace is dropped, each space becomes an underscore,
    and every remaining run of non-word characters is replaced by a single
    underscore. Underscores that end up adjacent are not merged.
    """
    trimmed = sheet_name.strip()
    underscored = "_".join(trimmed.split(" "))
    return re.sub(r'\W+', '_', underscored)

# Function to sort the triplets by amino acid and then by triplet
def sort_triplets(triplets):
    """Return the entries of ``triplets`` as an ``OrderedDict`` in sorted order.

    Entries are ordered by the first element of each value (the amino
    acid) and, within the same amino acid, by the triplet key itself.
    The input mapping is left unchanged.
    """
    def amino_acid_then_triplet(entry):
        codon, fields = entry
        return fields[0], codon

    ordered = OrderedDict()
    for codon, fields in sorted(triplets.items(), key=amino_acid_then_triplet):
        ordered[codon] = fields
    return ordered

# Function to process each part of the data
def process_part(part, result):
    """Parse one five-field codon record and store it in ``result``.

    Args:
        part: Iterable of exactly five values, in this order: triplet,
            amino acid, fraction, frequency, number.
        result: Dict holding a ``"triplet"`` mapping. It is updated in
            place with ``triplet -> [amino_acid, fraction, frequency,
            number]``; an existing entry for the same triplet is replaced.

    A record whose triplet or amino acid is missing (as judged by
    ``pd.notna``) is skipped without touching ``result``.

    Raises:
        ValueError: If ``part`` does not unpack into five values, or if
            the number field is not a whole number.
    """
    triplet, amino_acid, fraction, frequency, number = part
    if pd.isna(triplet) or pd.isna(amino_acid):
        return

    # Store codons in RNA notation: every "T" becomes "U".
    triplet = triplet.strip().replace("T", "U")
    amino_acid = amino_acid.strip()
    fraction = float(fraction)
    frequency = float(frequency)

    # The count may arrive as text with thousands separators ("1,234") or
    # as a float ("1234.0") when pandas has upcast an integer column that
    # contains blank cells. Accept both, but still reject values that are
    # not whole numbers.
    count_text = str(number).replace(',', '').strip()
    try:
        number = int(count_text)
    except ValueError:
        as_float = float(count_text)
        if not as_float.is_integer():
            raise
        number = int(as_float)

    result["triplet"][triplet] = [amino_acid, fraction, frequency, number]

# Process each sheet in the Excel file: one sheet becomes one JSON file.
for sheet_name in excel_data.sheet_names:
    df = pd.read_excel(excel_data, sheet_name=sheet_name)

    # Every codon parsed from this sheet is collected under the "triplet" key.
    result = {"triplet": {}}

    # Each row carries two records side by side: columns 0-4 and columns 5-9.
    # .iloc keeps the slicing positional whatever the column labels are, and
    # the explicit upper bound keeps the right-hand record at five fields
    # even if the sheet has extra trailing columns.
    for index, row in df.iterrows():
        left_part = row.iloc[:5]
        right_part = row.iloc[5:10]

        process_part(left_part, result)
        process_part(right_part, result)

    # Sort the triplets
    sorted_result = {"triplet": sort_triplets(result["triplet"])}

    # Convert the dictionary to JSON format
    json_output = json.dumps(sorted_result, indent=2, separators=(',', ': '))

    # Generate the file name from the sanitized sheet name
    species_name = clean_species_name(sheet_name)
    json_file_name = f"Genscript_codon_frequency_table_{species_name}.json"
    json_path = os.path.join(GENERATED_FILES_DIR, json_file_name)

    # Write with an explicit encoding so the output does not depend on the
    # platform's default locale.
    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json_output)

    print(f"Generated JSON file: {json_file_name}")

# Report the directory actually used rather than a hard-coded copy of its name.
print(f"JSON files have been generated and saved to the '{GENERATED_FILES_DIR}' directory.")


Generated JSON file: Genscript_codon_frequency_table_Cricetulus_griseus__CHO_.json
Generated JSON file: Genscript_codon_frequency_table_Escherichia_coli.json
Generated JSON file: Genscript_codon_frequency_table_Yeast.json
Generated JSON file: Genscript_codon_frequency_table_Insect.json
Generated JSON file: Genscript_codon_frequency_table_C_elegans.json
Generated JSON file: Genscript_codon_frequency_table_Drosophila_melanogaster.json
Generated JSON file: Genscript_codon_frequency_table_Human.json
Generated JSON file: Genscript_codon_frequency_table_Mouse.json
Generated JSON file: Genscript_codon_frequency_table_Rat.json
Generated JSON file: Genscript_codon_frequency_table_Pig.json
Generated JSON file: Genscript_codon_frequency_table_Pichia_pastoris.json
Generated JSON file: Genscript_codon_frequency_table_Arabidopsis_thaliana.json
Generated JSON file: Genscript_codon_frequency_table_Streptomyces.json
Generated JSON file: Genscript_codon_frequency_table_Zea_mays.json
Generated JSON file: