Help me write an meta info reader in python. Use a class to define the meta info reader. The class should have
the following methods:
- While read single meta file, set the name to None
- value:name. method to create name of a meta file with its properties, for example: If the meta file have properity num_aln, taxa_name, and I could use set_name(self, rule), while rule is "{num_aln}_{taxa_name}"
- value:path. when appling the class, record the absolute path of the meta file.
Another class to manage a set of meta files:
- method to link a list of meta info files
- method to read specific property from all meta files to form a dictionary
- method to read specific properties from all meta files to form a table
- method to rejoin the meta files into a new meta file, with every single properity was mapping into a dictionary with {metafilename: metafilevalue} (But here we don't require all the meta files to have the same propertiy, so just join that with existing values, and also have a parameter "force" to set those not existing meta file's value to None. And a parameter "record_path", to add value:path as a new property of the metafile). At the end, the new metafile's format would like {properity1:{metafile1:properity1_value_in_metafile1, metafile2:properity1_value_in_metafile2,metafile3:properity1_value_in_metafile3}, properity2:{metafile1:properity2_value_in_metafile1, metafile2:properity2_value_in_metafile2,metafile3:properity2_value_in_metafile3}}

In [59]:
import os
import json

class MetaInfoReader:
    def __init__(self, file_path):
        self.file_path = os.path.abspath(file_path)
        self.name = None
        self.properties = self._read_properties()

    def _read_properties(self):
        # Read the JSON file and load it as a dictionary
        with open(self.file_path, 'r') as file:
            properties = json.load(file)
        return properties

    def set_name(self, rule):
        try:
            self.name = rule.format(**self.properties)
        except KeyError as e:
            print(f"Error: Missing property {e} for naming rule.")

    def get_property(self, property_name):
        return self.properties.get(property_name)

    def __repr__(self):
        return f"MetaInfoReader(name={self.name}, path={self.file_path})"

class MetaInfoManager:
    def __init__(self):
        self.meta_files = []

    def add_meta_file(self, meta_info_reader):
        self.meta_files.append(meta_info_reader)

    def add_meta_files(self, meta_info_readers):
        self.meta_files.extend(meta_info_readers)

    def get_property_dict(self, property_name):
        property_dict = {}
        for meta_file in self.meta_files:
            property_dict[meta_file.name] = meta_file.get_property(property_name)
        return property_dict

    def get_property_table(self, property_names):
        table = {}
        for meta_file in self.meta_files:
            row = {prop: meta_file.get_property(prop) for prop in property_names}
            table[meta_file.name] = row
        return table

    def rejoin_meta_files(self, force=False, record_path=False):
        merged_properties = {}
        for meta_file in self.meta_files:
            for key, value in meta_file.properties.items():
                if key not in merged_properties:
                    merged_properties[key] = {}
                merged_properties[key][meta_file.name] = value
    
        # Set missing properties to None if force=True
        if force:
            all_keys = set(merged_properties.keys())
            for meta_file in self.meta_files:
                for key in all_keys:
                    if key not in meta_file.properties:
                        merged_properties[key][meta_file.name] = None
    
        if record_path:
            merged_properties['path'] = {}
            for meta_file in self.meta_files:
                merged_properties['path'][meta_file.name] = meta_file.file_path


        return merged_properties


In [60]:
file1 = MetaInfoReader('/home/tim/project/bacterialQ/Result_nova/method_test/p__Acidobacteriota/free_subtree_topo/p__Acidobacteriota_100_1500/meta.json')
file2 = MetaInfoReader('/home/tim/project/bacterialQ/Result_nova/method_test/p__Acidobacteriota/free_subtree_topo/p__Acidobacteriota_150_1000/meta.json')

# Set a custom name for the meta file using its properties
file1.set_name("{num_aln}_{taxa_name}")
file2.set_name("{num_aln}_{taxa_name}")


In [61]:
metaset = MetaInfoManager()
metaset.add_meta_files([file1, file2])

In [62]:
metaset.rejoin_meta_files(force=True, record_path=True)

{'num_aln': {'1500_p__Acidobacteriota': 1500, '1000_p__Acidobacteriota': 1000},
 'prop_aln': {'1500_p__Acidobacteriota': None,
  '1000_p__Acidobacteriota': None},
 'taxa_name': {'1500_p__Acidobacteriota': 'p__Acidobacteriota',
  '1000_p__Acidobacteriota': 'p__Acidobacteriota'},
 'taxa_scale': {'1500_p__Acidobacteriota': 'phylum',
  '1000_p__Acidobacteriota': 'phylum'},
 'train_loc_path': {'1500_p__Acidobacteriota': '/home/tim/project/bacterialQ/alignment/r220/train',
  '1000_p__Acidobacteriota': '/home/tim/project/bacterialQ/alignment/r220/train'},
 'test_loc_path': {'1500_p__Acidobacteriota': '/home/tim/project/bacterialQ/alignment/r220/test',
  '1000_p__Acidobacteriota': '/home/tim/project/bacterialQ/alignment/r220/test'},
 'taxa_file': {'1500_p__Acidobacteriota': '/home/tim/project/bacterialQ/data/r220/combined_df.csv',
  '1000_p__Acidobacteriota': '/home/tim/project/bacterialQ/data/r220/combined_df.csv'},
 'ref_tree': {'1500_p__Acidobacteriota': '/home/tim/project/bacterialQ/data/r

In [106]:
import os

def find_meta_json_files(directory):
    meta_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file == "meta.json":
                meta_files.append(os.path.join(root, file))
    return meta_files

# directory_path = "/home/tim/project/bacterialQ/Result_nova/method_test/p__Gemmatimonadota"
# directory_path = "/home/tim/project/bacterialQ/Result_nova/method_test/p__Acidobacteriota"
directory_path = "/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota"
meta_files = find_meta_json_files(directory_path)

In [107]:
meta_files

['/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_100_1000/meta.json',
 '/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_100_1500/meta.json',
 '/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_150_1500/meta.json',
 '/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_50_2000/meta.json',
 '/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_200_1500/meta.json',
 '/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_150_2000/meta.json',
 '/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_100_2000/meta.json',
 '/home/tim/project/bacterialQ/Result_nova/method_test/p__Spirochaetota/fix_subtree_topo/p__Spirochaetota_200_10

In [108]:
def convert_suffix(s):
    s_lower = s.lower()
    if s_lower.endswith("_false"):
        return s[:-6] + "_nc"
    elif s_lower.endswith("_true"):
        return s[:-5] + "_tc"
    return s


In [109]:
meta_store = MetaInfoManager()
for meta_file in meta_files:
    reader = MetaInfoReader(meta_file)
    reader.set_name("{taxa_name}_{tree_size_upper_lim}_{num_aln}_{fix_subtree_topology}")
    print(reader.name)
    reader.name = convert_suffix(reader.name)
    meta_store.add_meta_file(reader)

p__Spirochaetota_100_1000_True
p__Spirochaetota_100_1500_True
p__Spirochaetota_150_1500_True
p__Spirochaetota_50_2000_True
p__Spirochaetota_200_1500_True
p__Spirochaetota_150_2000_True
p__Spirochaetota_100_2000_True
p__Spirochaetota_200_1000_True
p__Spirochaetota_150_1000_True
p__Spirochaetota_50_1000_True
p__Spirochaetota_50_1500_True
p__Spirochaetota_100_1000_False
p__Spirochaetota_100_1500_False
p__Spirochaetota_150_1500_False
p__Spirochaetota_50_2000_False
p__Spirochaetota_200_1500_False
p__Spirochaetota_100_2000_False
p__Spirochaetota_200_1000_False
p__Spirochaetota_150_1000_False
p__Spirochaetota_50_1000_False
p__Spirochaetota_50_1500_False


In [110]:
joined_meta = meta_store.rejoin_meta_files(record_path=True)

In [111]:
sorted(joined_meta['final_tree_ll'].items(), key=lambda x: x[1], reverse=True)

[('p__Spirochaetota_200_1000_nc', -25479754.099),
 ('p__Spirochaetota_200_1500_tc', -25479987.704),
 ('p__Spirochaetota_200_1000_tc', -25480587.968),
 ('p__Spirochaetota_150_1000_tc', -25483736.982),
 ('p__Spirochaetota_150_1000_nc', -25484320.421),
 ('p__Spirochaetota_150_1500_nc', -25485311.1),
 ('p__Spirochaetota_150_2000_tc', -25485923.432),
 ('p__Spirochaetota_150_1500_tc', -25487650.428),
 ('p__Spirochaetota_100_2000_nc', -25493734.457),
 ('p__Spirochaetota_100_1500_nc', -25494896.521),
 ('p__Spirochaetota_100_1000_tc', -25494948.38),
 ('p__Spirochaetota_100_2000_tc', -25495048.729),
 ('p__Spirochaetota_100_1000_nc', -25495826.082),
 ('p__Spirochaetota_100_1500_tc', -25496303.506),
 ('p__Spirochaetota_50_1000_tc', -25505486.838),
 ('p__Spirochaetota_50_2000_nc', -25506645.382),
 ('p__Spirochaetota_50_2000_tc', -25507674.856),
 ('p__Spirochaetota_50_1500_nc', -25509580.266),
 ('p__Spirochaetota_50_1500_tc', -25510269.968),
 ('p__Spirochaetota_50_1000_nc', -25512070.159)]

In [112]:
sorted(joined_meta['training_tree_ll'].items(), key=lambda x: x[1], reverse=True)

[('p__Spirochaetota_200_1500_nc', [-21396967.716]),
 ('p__Spirochaetota_200_1000_nc', [-21398173.195]),
 ('p__Spirochaetota_200_1500_tc', [-21398418.231]),
 ('p__Spirochaetota_200_1000_tc', [-21398950.31]),
 ('p__Spirochaetota_150_1500_nc', [-21402845.881]),
 ('p__Spirochaetota_150_2000_tc', [-21403441.208]),
 ('p__Spirochaetota_150_1500_tc', [-21404773.581]),
 ('p__Spirochaetota_150_1000_nc', [-21405333.284]),
 ('p__Spirochaetota_150_1000_tc', [-21406358.149]),
 ('p__Spirochaetota_100_2000_nc', [-21410041.507]),
 ('p__Spirochaetota_100_1500_nc', [-21410901.303]),
 ('p__Spirochaetota_100_2000_tc', [-21411214.471]),
 ('p__Spirochaetota_100_1500_tc', [-21411515.648]),
 ('p__Spirochaetota_100_1000_nc', [-21412363.398]),
 ('p__Spirochaetota_100_1000_tc', [-21413688.52]),
 ('p__Spirochaetota_50_1500_nc', [-21421332.082]),
 ('p__Spirochaetota_50_1500_tc', [-21421437.393]),
 ('p__Spirochaetota_50_2000_nc', [-21421600.723]),
 ('p__Spirochaetota_50_1000_tc', [-21421685.71]),
 ('p__Spirochaetota

In [113]:
import pandas as pd 

In [114]:
df = pd.DataFrame(joined_meta['test_partition_result']).T

# 重置索引，将原索引变为'name'列
df.reset_index(inplace=True)
df.rename(columns={'index': 'name'}, inplace=True)

In [115]:
df[['name','log_likelihood', 'BIC', 'cpu_time']].sort_values(by='BIC', ascending=True)

Unnamed: 0,name,log_likelihood,BIC,cpu_time
5,p__Spirochaetota_200_1000_nc,-4045798.674,8118915.054,771452.0
0,p__Spirochaetota_100_1000_tc,-4046837.1424,8121106.6599,1565570.0
3,p__Spirochaetota_100_1000_nc,-4046853.0081,8121120.75,187821.0
4,p__Spirochaetota_100_1500_nc,-4047006.1517,8121321.1887,1507950.0
1,p__Spirochaetota_50_1000_tc,-4047605.6552,8122626.0441,1123650.0
6,p__Spirochaetota_50_1000_nc,-4047905.9104,8123226.5545,1233980.0
7,p__Spirochaetota_50_1500_nc,-4048052.7567,8123414.3988,686475.0
2,p__Spirochaetota_50_1500_tc,-4048171.563,8123616.7285,922102.0


In [116]:
joined_meta['final_test_partition']

{'p__Spirochaetota_100_1000_tc': {'p__Spirochaetota_100_1000_2': 14,
  'Q.PFAM': 3,
  'Q.INSECT': 3},
 'p__Spirochaetota_50_1000_tc': {'p__Spirochaetota_50_1000_2': 13,
  'Q.PFAM': 3,
  'Q.INSECT': 3,
  'Q.YEAST': 1},
 'p__Spirochaetota_50_1500_tc': {'p__Spirochaetota_50_1500_2': 13,
  'Q.PFAM': 3,
  'Q.INSECT': 3,
  'Q.YEAST': 1},
 'p__Spirochaetota_100_1000_nc': {'p__Spirochaetota_100_1000_2': 14,
  'Q.PFAM': 3,
  'Q.INSECT': 3},
 'p__Spirochaetota_100_1500_nc': {'p__Spirochaetota_100_1500_2': 14,
  'Q.PFAM': 3,
  'Q.INSECT': 3},
 'p__Spirochaetota_200_1000_nc': {'p__Spirochaetota_200_1000_1': 15,
  'Q.PFAM': 3,
  'Q.INSECT': 2},
 'p__Spirochaetota_50_1000_nc': {'p__Spirochaetota_50_1000_2': 13,
  'Q.PFAM': 3,
  'Q.INSECT': 3,
  'Q.YEAST': 1},
 'p__Spirochaetota_50_1500_nc': {'p__Spirochaetota_50_1500_2': 13,
  'Q.PFAM': 3,
  'Q.INSECT': 3,
  'Q.YEAST': 1}}

In [117]:
{key1:{name1:value1, name2:value2}, key2:{name1:value1, name2:value2}, key3:{name1:dict1, name2:dict2}} 
Where dict 1 / dict 2 may like:
{dict_subkey1: value3, dict_subkey2: value4, dict_sub1:dict_sub1...}

--> pandas sheet
|name | key1 | key2 | dict1.key()_dict_subkey1| dict1.key()_dict_subkey2| ...|
|name1|value1|value1| value3                 | value4                 | ...|
|name2|value2|value2| value3                 | value4                 | ...|

That is to recursively flatten the nested dictionary into a pandas sheet, and represent the name of the variable as dictkey_subdictkey...
But only flatten that the values is dict, float, int, str, set, bool, None, but not list.

SyntaxError: invalid syntax (2092509985.py, line 1)

In [119]:
import pandas as pd

def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, (float, int, str, set, bool, type(None))):
            items.append((new_key, v))
        # Skip lists
    return dict(items)

def nested_dict_to_dataframe(nested_dict):
    flattened_data = {}
    for key, sub_dict in nested_dict.items():
        for sub_key, value in sub_dict.items():
            if sub_key not in flattened_data:
                flattened_data[sub_key] = {}
            flattened_data[sub_key][key] = flatten_dict(value)
    
    # Convert to DataFrame
    df = pd.DataFrame(flattened_data).T
    return df

# Example usage
nested_dict = {
    'key1': {'name1': 'value1', 'name2': 'value2'},
    'key2': {'name1': 'value1', 'name2': 'value2'},
    'key3': {'name1': {'dict_subkey1': 'value3', 'dict_subkey2': 'value4'}, 'name2': {'dict_subkey1': 'value3', 'dict_subkey2': 'value4'}}
}

df = nested_dict_to_dataframe(nested_dict)
print(df)

AttributeError: 'str' object has no attribute 'items'