In [5]:
from pathlib import Path
import pandas as pd
import os

def describe_directory(base_path, preview_limit=0):
    base_path = Path(base_path).resolve()
    print(f"🔍 Scanning: {base_path}\n")

    all_files = sorted(base_path.rglob("*"))
    file_data = []

    for f in all_files:
        if f.is_file():
            size_kb = f.stat().st_size / 1024
            rel_path = f.relative_to(base_path)
            file_data.append({
                "Path": str(rel_path),
                "Size (KB)": round(size_kb, 2),
                "Extension": f.suffix.lower()
            })
    
    df = pd.DataFrame(file_data)
    if df.empty:
        print("🚫 No files found.")
    else:
        display(df)

    # Optional preview of CSV/XLSX files
    if preview_limit > 0:
        for file in df[df['Extension'].isin(['.csv', '.xlsx'])]['Path']:
            abs_path = base_path / file
            print(f"\n📄 Preview: {file}")
            try:
                if file.endswith('.csv'):
                    preview = pd.read_csv(abs_path, nrows=preview_limit)
                elif file.endswith('.xlsx'):
                    preview = pd.read_excel(abs_path, nrows=preview_limit)
                display(preview)
            except Exception as e:
                print(f"⚠️ Could not preview: {e}")

# Run it on your full project directory (not just /data)
describe_directory(BASE_DIR, preview_limit=5)


🔍 Scanning: /home/rohit/Desktop/Projects/C-Elegans



Unnamed: 0,Path,Size (KB),Extension
0,CElegansNeuroML/.git/HEAD,0.02,
1,CElegansNeuroML/.git/config,0.30,
2,CElegansNeuroML/.git/description,0.07,
3,CElegansNeuroML/.git/hooks/applypatch-msg.sample,0.47,.sample
4,CElegansNeuroML/.git/hooks/commit-msg.sample,0.88,.sample
...,...,...,...
3214,file_index_master.csv,631.59,.csv
3215,merged_edges.csv,1420.74,.csv
3216,merged_edges_with_relatedness.csv,1555.43,.csv
3217,merged_lineage_table.csv,575.13,.csv



📄 Preview: c302/c302/NeuronConnectFormatted.xlsx


Unnamed: 0,Neuron 1,Neuron 2,Type,Nbr
0,ADAL,ADAR,EJ,1
1,ADAL,ADEL,Rp,1
2,ADAL,ADFL,Rp,1
3,ADAL,ADFL,EJ,1
4,ADAL,AIAL,Rp,1



📄 Preview: c302/c302/data/Bentley_et_al_2016_expression.csv


Unnamed: 0,Entity1,Relationship,Entity2,Evidence,Evidence URL
0,ADAL,Neuropeptide,PDF-1,"Janssen et al., 2008",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
1,ADAR,Neuropeptide,PDF-1,"Janssen et al., 2008",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,ADEL,Neuropeptide,PDF-1,"Janssen et al., 2008",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
3,ADEL,Neurotransmitter,Dopamine,"McDonald et al., 2007",https://www.ncbi.nlm.nih.gov/pubmed/18094261
4,ADEL,Neurotransmitter,Dopamine,"Suo et al., 2003",https://www.ncbi.nlm.nih.gov/pubmed/12887685



📄 Preview: c302/c302/data/NeuronConnectFormatted.xlsx


Unnamed: 0,Neuron 1,Neuron 2,Type,Nbr
0,ADAL,ADAR,EJ,1
1,ADAL,ADEL,Rp,1
2,ADAL,ADFL,Rp,1
3,ADAL,ADFL,EJ,1
4,ADAL,AIAL,Rp,1



📄 Preview: c302/c302/data/SI 5 Connectome adjacency matrices.xlsx


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,PHARYNX,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 447,Unnamed: 448,Unnamed: 449,Unnamed: 450,Unnamed: 451,Unnamed: 452,Unnamed: 453,Unnamed: 454,Unnamed: 455,Unnamed: 456
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,I1L,I1R,I2L,I2R,I3,I4,I5,...,VC06,vm2AL,vm2AR,vm1AL,vm1AR,vm1PL,vm1PR,vm2PL,vm2PR,
2,PHARYNX,,I1L,,,10,,3,,2,...,,,,,,,,,,I1L
3,,,I1R,,,,6,1,,2,...,,,,,,,,,,I1R
4,,,I2L,2,,,3,,13,2,...,,,,,,,,,,I2L



📄 Preview: c302/c302/data/aconnectome_white_1986_A.csv


Unnamed: 0,pre post type synapses
0,ADAL\tADFL\telectrical\t1
1,ADAL\tAIBL\tchemical\t1
2,ADAL\tAIBR\tchemical\t2
3,ADAL\tASHL\telectrical\t1
4,ADAL\tAVAR\tchemical\t2



📄 Preview: c302/c302/data/aconnectome_white_1986_L4.csv


Unnamed: 0,pre\tpost\ttype\tsynapses
0,ADAL\tADFL\telectrical\t2
1,ADAL\tAIBR\tchemical\t1
2,ADAL\tASHL\telectrical\t6
3,ADAL\tAVAL\tchemical\t2
4,ADAL\tAVAR\tchemical\t1



📄 Preview: c302/c302/data/aconnectome_white_1986_whole.csv


Unnamed: 0,pre\tpost\ttype\tsynapses
0,ADAL\tADFL\telectrical\t1
1,ADAL\tAIBL\tchemical\t1
2,ADAL\tAIBR\tchemical\t2
3,ADAL\tASHL\telectrical\t1
4,ADAL\tAVAR\tchemical\t2



📄 Preview: c302/c302/data/herm_full_edgelist.csv


Unnamed: 0,Source,Target,Weight,Type
0,I1L,I2L,10,chemical
1,I1L,I3,3,chemical
2,I1L,I5,2,chemical
3,I1L,I6,1,chemical
4,I1L,M2L,3,chemical



📄 Preview: c302/c302/data/herm_full_edgelist_MODIFIED.csv


Unnamed: 0,Source,Target,Weight,Type
0,I1L,I2L,10,chemical
1,I1L,I3,3,chemical
2,I1L,I5,2,chemical
3,I1L,I6,1,chemical
4,I1L,M2L,3,chemical



📄 Preview: c302/c302/data/synapse_count_matrices.xlsx


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Pre,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 173,Unnamed: 174,Unnamed: 175,Unnamed: 176,Unnamed: 177,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,Unnamed: 182
0,,,,Sensory,,,,,,,...,,,,,,,,,,
1,,,,ADFL,ADFR,ADLL,ADLR,AFDL,AFDR,ALML,...,PVNL,PVNR,PVQL,PVQR,RICL,RICR,RID,RIS,RMGL,RMGR
2,,,,,,,,,,,...,,,,,,,,,,
3,Post,Sensory,ADFL,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,,ADFR,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



📄 Preview: c302/c302/data/witvliet_2020_1.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,ADAR,electrical,1
1,ADAL,AIBR,chemical,1
2,ADAL,ASHL,electrical,9
3,ADAL,ASKL,electrical,1
4,ADAL,AVAL,chemical,1



📄 Preview: c302/c302/data/witvliet_2020_2.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,ADAR,electrical,1
1,ADAL,AIBR,chemical,1
2,ADAL,ASHL,electrical,2
3,ADAL,AVAR,chemical,2
4,ADAL,AVBL,chemical,1



📄 Preview: c302/c302/data/witvliet_2020_3.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,ADAR,electrical,1
1,ADAL,ADFL,electrical,1
2,ADAL,ASHL,electrical,1
3,ADAL,ASKL,electrical,1
4,ADAL,AVAL,chemical,1



📄 Preview: c302/c302/data/witvliet_2020_4.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,ADAR,electrical,1
1,ADAL,AIBR,chemical,1
2,ADAL,AIZL,chemical,1
3,ADAL,ASHL,electrical,1
4,ADAL,ASKL,electrical,1



📄 Preview: c302/c302/data/witvliet_2020_5.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,ADFL,electrical,1
1,ADAL,ASHL,chemical,1
2,ADAL,ASHL,electrical,3
3,ADAL,ASKL,electrical,1
4,ADAL,AVAL,chemical,1



📄 Preview: c302/c302/data/witvliet_2020_6.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,AIAL,chemical,1
1,ADAL,AIBL,chemical,1
2,ADAL,ASHL,electrical,2
3,ADAL,AVAL,chemical,3
4,ADAL,AVAR,chemical,1



📄 Preview: c302/c302/data/witvliet_2020_7.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,AIAL,chemical,2
1,ADAL,AIBL,chemical,2
2,ADAL,AIBR,chemical,1
3,ADAL,AVAL,chemical,1
4,ADAL,AVBL,chemical,7



📄 Preview: c302/c302/data/witvliet_2020_8.xlsx


Unnamed: 0,pre,post,type,synapses
0,ADAL,ADAR,electrical,1
1,ADAL,ASHL,electrical,1
2,ADAL,AVAL,chemical,2
3,ADAL,AVAR,chemical,1
4,ADAL,AVBL,chemical,16



📄 Preview: celegans_connectome_data/consensus_connectome_full_nofunc.csv


Unnamed: 0,from_neuron,to_neuron,from_pos,to_pos,from_type,to_type,mean_gap_weight,mean_chem_weight,uncertainty,data_sources
0,ADAL,ADAL,"(0.03, 94.34, 10.31)","(0.03, 94.34, 10.31)",inter,inter,,6.0,,['cook2019']
1,ADAL,ADAR,"(0.03, 94.34, 10.31)","(-1.43, 91.22, -11.64)",inter,inter,2.0,,,"['chklovskii', 'cook2019', 'funconn', 'openwor..."
2,ADAL,ADEL,"(0.03, 94.34, 10.31)","(-0.12, 87.97, 10.5)",inter,sensory,,,,[]
3,ADAL,ADER,"(0.03, 94.34, 10.31)","(-0.58, 83.23, -11.47)",inter,sensory,,,,['funconn']
4,ADAL,ADFL,"(0.03, 94.34, 10.31)","(8.48, 54.18, 7.0)",inter,sensory,2.666667,,,"['chklovskii', 'cook2019', 'openworm', 'white1..."



📄 Preview: celegans_connectome_data/consensus_connectome_full_withfunc.csv


Unnamed: 0,from_neuron,to_neuron,from_pos,to_pos,from_type,to_type,mean_gap_weight,mean_chem_weight,uncertainty,functional_weight,data_sources
0,ADAL,ADAL,"(0.03, 94.34, 10.31)","(0.03, 94.34, 10.31)",inter,inter,,6.0,,,['cook2019']
1,ADAL,ADAR,"(0.03, 94.34, 10.31)","(-1.43, 91.22, -11.64)",inter,inter,2.0,,,0.091616,"['chklovskii', 'cook2019', 'funconn', 'openwor..."
2,ADAL,ADEL,"(0.03, 94.34, 10.31)","(-0.12, 87.97, 10.5)",inter,sensory,,,,,[]
3,ADAL,ADER,"(0.03, 94.34, 10.31)","(-0.58, 83.23, -11.47)",inter,sensory,,,,0.014166,['funconn']
4,ADAL,ADFL,"(0.03, 94.34, 10.31)","(8.48, 54.18, 7.0)",inter,sensory,2.666667,,,,"['chklovskii', 'cook2019', 'openworm', 'white1..."



📄 Preview: celegans_connectome_data/consensus_connectome_trimmed_nofunc.csv


Unnamed: 0,from_neuron,to_neuron,from_pos,to_pos,from_type,to_type,mean_gap_weight,mean_chem_weight,uncertainty,data_sources
0,ADAL,ADAL,"(0.03, 94.34, 10.31)","(0.03, 94.34, 10.31)",inter,inter,,6.0,,['cook2019']
1,ADAL,ADAR,"(0.03, 94.34, 10.31)","(-1.43, 91.22, -11.64)",inter,inter,2.0,,,"['chklovskii', 'cook2019', 'funconn', 'openwor..."
2,ADAL,ADFL,"(0.03, 94.34, 10.31)","(8.48, 54.18, 7.0)",inter,sensory,2.666667,,,"['chklovskii', 'cook2019', 'openworm', 'white1..."
3,ADAL,ADLL,"(0.03, 94.34, 10.31)","(12.05, 59.13, 3.86)",inter,sensory,,2.0,,['cook2019']
4,ADAL,AIAL,"(0.03, 94.34, 10.31)","(-4.67, 66.74, 0.66)",inter,inter,,2.0,,"['cook2019', 'witvliet2020_7']"



📄 Preview: celegans_connectome_data/consensus_connectome_trimmed_withfunc.csv


Unnamed: 0,from_neuron,to_neuron,from_pos,to_pos,from_type,to_type,mean_gap_weight,mean_chem_weight,uncertainty,functional_weight,data_sources
0,ADAL,ADAL,"(0.03, 94.34, 10.31)","(0.03, 94.34, 10.31)",inter,inter,,6.0,,,['cook2019']
1,ADAL,ADAR,"(0.03, 94.34, 10.31)","(-1.43, 91.22, -11.64)",inter,inter,2.0,,,0.091616,"['chklovskii', 'cook2019', 'funconn', 'openwor..."
2,ADAL,ADER,"(0.03, 94.34, 10.31)","(-0.58, 83.23, -11.47)",inter,sensory,,,,0.014166,['funconn']
3,ADAL,ADFL,"(0.03, 94.34, 10.31)","(8.48, 54.18, 7.0)",inter,sensory,2.666667,,,,"['chklovskii', 'cook2019', 'openworm', 'white1..."
4,ADAL,ADLL,"(0.03, 94.34, 10.31)","(12.05, 59.13, 3.86)",inter,sensory,,2.0,,,['cook2019']



📄 Preview: file_index_master.csv


Unnamed: 0,neuron,filename,full_path,extension,modality,directory
0,unknown,c_elegans_connectome_simplified.graphml,/home/rohit/Desktop/Projects/C-Elegans/c_elega...,.graphml,other,/home/rohit/Desktop/Projects/C-Elegans
1,unknown,connectome.pdf,/home/rohit/Desktop/Projects/C-Elegans/connect...,.pdf,other,/home/rohit/Desktop/Projects/C-Elegans
2,unknown,celegans.ipynb,/home/rohit/Desktop/Projects/C-Elegans/celegan...,.ipynb,other,/home/rohit/Desktop/Projects/C-Elegans
3,unknown,NeuronLineage_Part2.xls,/home/rohit/Desktop/Projects/C-Elegans/NeuronL...,.xls,other,/home/rohit/Desktop/Projects/C-Elegans
4,unknown,c_elegans_connectome.graphml,/home/rohit/Desktop/Projects/C-Elegans/c_elega...,.graphml,other,/home/rohit/Desktop/Projects/C-Elegans



📄 Preview: merged_edges.csv


Unnamed: 0,Source,Target,Type,SynapseCount,SourceFile
0,ADAL,ADAR,electrical,1,witvliet_2020_2.xlsx
1,ADAL,AIBR,chemical,1,witvliet_2020_2.xlsx
2,ADAL,ASHL,electrical,2,witvliet_2020_2.xlsx
3,ADAL,AVAR,chemical,2,witvliet_2020_2.xlsx
4,ADAL,AVBL,chemical,1,witvliet_2020_2.xlsx



📄 Preview: merged_edges_with_relatedness.csv


Unnamed: 0,Source,Target,Type,SynapseCount,SourceFile,Relatedness
0,AINL,CEPshVR,chemical,1,witvliet_2020_2.xlsx,17.0
1,AINR,CEPshDL,chemical,1,witvliet_2020_2.xlsx,16.0
2,ALA,CEPshVL,chemical,1,witvliet_2020_2.xlsx,14.0
3,ALA,CEPshVR,chemical,3,witvliet_2020_2.xlsx,18.0
4,ASIR,CEPshDR,chemical,1,witvliet_2020_2.xlsx,14.0



📄 Preview: merged_lineage_table.csv


Unnamed: 0,NeuronA,NeuronB,Relatedness
0,ADAL,ADAR,17.0
1,ADAL,ADEL,1.0
2,ADAL,ADER,17.0
3,ADAL,ADFL,18.0
4,ADAL,ADFR,16.0


In [6]:
# 📍 Key merged datasets
MERGED_EDGES_PATH = BASE_DIR / "merged_edges.csv"
MERGED_EDGES_WITH_REL_PATH = BASE_DIR / "merged_edges_with_relatedness.csv"
LINEAGE_PATH = BASE_DIR / "merged_lineage_table.csv"

# 📍 Alternative raw sources (for future validation)
CELEGANS_RAW_FOLDER = BASE_DIR / "c302/c302/data/"


In [7]:
# Load main files
edges_df = pd.read_csv(MERGED_EDGES_PATH)
edges_rel_df = pd.read_csv(MERGED_EDGES_WITH_REL_PATH)
lineage_df = pd.read_csv(LINEAGE_PATH)

# Quick inspections
display(edges_df.head())
display(edges_rel_df.head())
display(lineage_df.head())

print(f"🧠 Unique neurons in edges: {len(set(edges_df['Source']) | set(edges_df['Target']))}")
print(f"🧬 Lineage pairs: {len(lineage_df)}")


Unnamed: 0,Source,Target,Type,SynapseCount,SourceFile
0,ADAL,ADAR,electrical,1,witvliet_2020_2.xlsx
1,ADAL,AIBR,chemical,1,witvliet_2020_2.xlsx
2,ADAL,ASHL,electrical,2,witvliet_2020_2.xlsx
3,ADAL,AVAR,chemical,2,witvliet_2020_2.xlsx
4,ADAL,AVBL,chemical,1,witvliet_2020_2.xlsx


Unnamed: 0,Source,Target,Type,SynapseCount,SourceFile,Relatedness
0,AINL,CEPshVR,chemical,1,witvliet_2020_2.xlsx,17.0
1,AINR,CEPshDL,chemical,1,witvliet_2020_2.xlsx,16.0
2,ALA,CEPshVL,chemical,1,witvliet_2020_2.xlsx,14.0
3,ALA,CEPshVR,chemical,3,witvliet_2020_2.xlsx,18.0
4,ASIR,CEPshDR,chemical,1,witvliet_2020_2.xlsx,14.0


Unnamed: 0,NeuronA,NeuronB,Relatedness
0,ADAL,ADAR,17.0
1,ADAL,ADEL,1.0
2,ADAL,ADER,17.0
3,ADAL,ADFL,18.0
4,ADAL,ADFR,16.0


🧠 Unique neurons in edges: 1533
🧬 Lineage pairs: 38781


In [8]:
import yaml

catalog = {
    "merged_edges": {
        "path": str(MERGED_EDGES_PATH),
        "rows": len(edges_df),
        "columns": list(edges_df.columns),
        "description": "Merged chemical and electrical synapses across Witvliet datasets"
    },
    "merged_edges_with_relatedness": {
        "path": str(MERGED_EDGES_WITH_REL_PATH),
        "rows": len(edges_rel_df),
        "columns": list(edges_rel_df.columns),
        "description": "Edges merged with lineage-based relatedness scores"
    },
    "merged_lineage_table": {
        "path": str(LINEAGE_PATH),
        "rows": len(lineage_df),
        "columns": list(lineage_df.columns),
        "description": "Pairwise lineage distances from developmental tree"
    }
}

with open(BASE_DIR / "data_catalog.yml", "w") as f:
    yaml.dump(catalog, f, sort_keys=False)

print("✅ Data catalog created → data_catalog.yml")


✅ Data catalog created → data_catalog.yml


In [9]:
# Union of all unique neurons across source/target and lineage
unique_neurons = set(edges_df['Source']) | set(edges_df['Target']) | set(lineage_df['NeuronA']) | set(lineage_df['NeuronB'])

with open(BASE_DIR / "unique_neuron_names.txt", "w") as f:
    for name in sorted(unique_neurons):
        f.write(name + "\n")

print(f"🧠 {len(unique_neurons)} unique neurons saved to unique_neuron_names.txt")


🧠 1632 unique neurons saved to unique_neuron_names.txt


In [10]:
# 📦 Required imports
import os
import pandas as pd
import yaml
from pathlib import Path
from tqdm import tqdm

# 🧠 Heuristic classification based on file contents
def classify_data_type(columns, extension):
    colset = set([col.strip().lower() for col in columns])

    if 'source' in colset and 'target' in colset:
        return 'edges'
    if 'neurona' in colset and 'neuronb' in colset:
        return 'lineage'
    if 'from_neuron' in colset and 'to_neuron' in colset:
        return 'coordinates_edges'
    if 'neuron 1' in colset and 'neuron 2' in colset:
        return 'formatted_edges'
    if 'pre' in colset and 'post' in colset and 'type' in colset:
        return 'legacy_edges'
    if 'entity1' in colset and 'relationship' in colset:
        return 'expression_data'
    if extension in ['.nml', '.neuroml']:
        return 'neuroml'
    if extension == '.graphml':
        return 'graphml'
    return 'unknown'

# 📂 Define the base directory to scan
BASE_DIR = Path('/home/rohit/Desktop/Projects/C-Elegans')

# 📋 Data collection
file_records = []

for root, dirs, files in os.walk(BASE_DIR):
    for file in files:
        path = Path(root) / file
        size_kb = path.stat().st_size / 1024
        ext = path.suffix.lower()

        record = {
            'filename': file,
            'full_path': str(path),
            'size_kb': round(size_kb, 2),
            'extension': ext,
            'data_type': None,
            'num_rows': None,
            'num_columns': None,
            'column_names': None,
        }

        # Try to parse with pandas to extract structure
        if ext in ['.csv', '.tsv', '.xls', '.xlsx']:
            try:
                if ext == '.csv':
                    df = pd.read_csv(path, nrows=5)
                elif ext == '.tsv':
                    df = pd.read_csv(path, sep='\t', nrows=5)
                else:
                    df = pd.read_excel(path, nrows=5)

                record['num_rows'], record['num_columns'] = df.shape
                record['column_names'] = df.columns.tolist()
                record['data_type'] = classify_data_type(df.columns, ext)
            except Exception as e:
                record['data_type'] = 'unreadable'
        elif ext in ['.nml', '.neuroml', '.graphml']:
            record['data_type'] = classify_data_type([], ext)

        file_records.append(record)

# 📄 Save to CSV
inventory_df = pd.DataFrame(file_records)
inventory_df.to_csv("structured_data_inventory.csv", index=False)

# 📝 Save to YAML (for indexing)
yaml_catalog = {
    'files': [
        {
            'path': rec['full_path'],
            'extension': rec['extension'],
            'size_kb': rec['size_kb'],
            'data_type': rec['data_type'],
            'num_rows': rec['num_rows'],
            'num_columns': rec['num_columns'],
        }
        for rec in file_records
    ]
}

with open("data_catalog.yml", "w") as f:
    yaml.dump(yaml_catalog, f)

print("✅ Scan complete.")
print(f"🗃️  Total files scanned: {len(file_records)}")
print("📄 Output saved to: structured_data_inventory.csv and data_catalog.yml")


✅ Scan complete.
🗃️  Total files scanned: 3221
📄 Output saved to: structured_data_inventory.csv and data_catalog.yml


In [11]:
import os

base_dir = "/home/rohit/Desktop/Projects/C-Elegans/data"

subdirs = [
    "connectivity/raw", "connectivity/processed",
    "lineage/raw", "lineage/processed",
    "neuroml/cell_library", "neuroml/circuits",
    "metadata", "temp"
]

for sub in subdirs:
    os.makedirs(os.path.join(base_dir, sub), exist_ok=True)

print("✅ Directory structure created.")


✅ Directory structure created.


In [13]:
import pandas as pd

inventory_path = "/home/rohit/Desktop/Projects/C-Elegans/structured_data_inventory.csv"
inventory = pd.read_csv(inventory_path)

print("📄 Columns in structured_data_inventory.csv:")
print(inventory.columns.tolist())


📄 Columns in structured_data_inventory.csv:
['filename', 'full_path', 'size_kb', 'extension', 'data_type', 'num_rows', 'num_columns', 'column_names']


In [14]:
import pandas as pd
import os

# Load the inventory
inventory = pd.read_csv("/home/rohit/Desktop/Projects/C-Elegans/structured_data_inventory.csv")
base_data_dir = "/home/rohit/Desktop/Projects/C-Elegans/data"

# Heuristic category assignment
def classify_row(row):
    name = row['filename'].lower()
    path = row['full_path'].lower()

    if "witvliet" in name or "white" in name or "herm" in name or "adjacency" in name:
        return "connectivity/raw"
    elif "merged_edges" in name or "consensus_connectome" in name:
        return "connectivity/processed"
    elif "lineage" in name or "relatedness" in name:
        return "lineage/raw" if "merged" not in name else "lineage/processed"
    elif name.endswith(".nml") or "cell" in name or "neuron" in name:
        return "neuroml/cell_library"
    elif "circuit" in name or "network" in name:
        return "neuroml/circuits"
    elif "file_index" in name or "data_catalog" in name:
        return "metadata"
    else:
        return "temp"

# Construct the manifest
manifest = inventory.copy()
manifest['original_path'] = manifest['full_path']
manifest['category'] = manifest.apply(classify_row, axis=1)
manifest['canonical_path'] = manifest.apply(
    lambda row: os.path.join(base_data_dir, row['category'], row['filename']),
    axis=1
)
manifest['notes'] = manifest['extension'].apply(lambda ext: "🧪 CSV" if ext == '.csv' else "")

# Final columns
manifest = manifest[['original_path', 'filename', 'canonical_path', 'category', 'notes']]

# Save
os.makedirs("/home/rohit/Desktop/Projects/C-Elegans/utils", exist_ok=True)
manifest_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/data_restructure_manifest.csv"
manifest.to_csv(manifest_path, index=False)

print(f"✅ Saved manifest → {manifest_path}")


✅ Saved manifest → /home/rohit/Desktop/Projects/C-Elegans/utils/data_restructure_manifest.csv


In [15]:
import os
import shutil
import pandas as pd

# Load the manifest
manifest_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/data_restructure_manifest.csv"
manifest = pd.read_csv(manifest_path)

# Move each file
for idx, row in manifest.iterrows():
    src = row['original_path']
    dst = row['canonical_path']
    dst_dir = os.path.dirname(dst)

    try:
        os.makedirs(dst_dir, exist_ok=True)
        if os.path.exists(src):
            shutil.move(src, dst)
            print(f"✅ Moved: {src} → {dst}")
        else:
            print(f"⚠️ Missing: {src}")
    except Exception as e:
        print(f"❌ Error moving {src} → {dst}: {e}")


✅ Moved: /home/rohit/Desktop/Projects/C-Elegans/connectome.pdf → /home/rohit/Desktop/Projects/C-Elegans/data/temp/connectome.pdf
✅ Moved: /home/rohit/Desktop/Projects/C-Elegans/merged_edges_with_relatedness.csv → /home/rohit/Desktop/Projects/C-Elegans/data/connectivity/processed/merged_edges_with_relatedness.csv
✅ Moved: /home/rohit/Desktop/Projects/C-Elegans/celegans.ipynb → /home/rohit/Desktop/Projects/C-Elegans/data/temp/celegans.ipynb
✅ Moved: /home/rohit/Desktop/Projects/C-Elegans/NeuronLineage_Part2.xls → /home/rohit/Desktop/Projects/C-Elegans/data/lineage/raw/NeuronLineage_Part2.xls
✅ Moved: /home/rohit/Desktop/Projects/C-Elegans/merged_lineage_table.csv → /home/rohit/Desktop/Projects/C-Elegans/data/lineage/processed/merged_lineage_table.csv
✅ Moved: /home/rohit/Desktop/Projects/C-Elegans/merged_edges.csv → /home/rohit/Desktop/Projects/C-Elegans/data/connectivity/processed/merged_edges.csv
✅ Moved: /home/rohit/Desktop/Projects/C-Elegans/celegans.pdf → /home/rohit/Desktop/Project

In [16]:
import os
import pandas as pd

# Load manifest
manifest_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/data_restructure_manifest.csv"
manifest = pd.read_csv(manifest_path)

# Verify each canonical path
manifest['file_exists'] = manifest['canonical_path'].apply(lambda x: os.path.exists(x))

# Report missing files
missing = manifest[~manifest['file_exists']]
if not missing.empty:
    print("⚠️ Missing files:")
    print(missing[['original_path', 'canonical_path']])
else:
    print("✅ All files successfully moved and verified.")

# Save clean manifest
clean_manifest_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/data_restructure_manifest_verified.csv"
manifest.to_csv(clean_manifest_path, index=False)

print(f"📦 Verified manifest saved to: {clean_manifest_path}")


✅ All files successfully moved and verified.
📦 Verified manifest saved to: /home/rohit/Desktop/Projects/C-Elegans/utils/data_restructure_manifest_verified.csv


In [18]:
import os
import pandas as pd

# Define base directory
base_dir = "/home/rohit/Desktop/Projects/C-Elegans/data"

# Recursively scan for files
file_records = []
for root, dirs, files in os.walk(base_dir):
    for name in files:
        full_path = os.path.join(root, name)
        rel_path = os.path.relpath(full_path, base_dir)
        size_kb = round(os.path.getsize(full_path) / 1024, 2)
        ext = os.path.splitext(name)[1].lower()
        file_records.append({
            "filename": name,
            "relative_path": rel_path,
            "full_path": full_path,
            "extension": ext,
            "size_kb": size_kb
        })

# Create DataFrame
file_df = pd.DataFrame(file_records)

# Save to CSV
scan_output_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/postmove_directory_inventory.csv"
file_df.to_csv(scan_output_path, index=False)

# Preview
print("✅ Directory re-scan complete.")
print(f"📦 Inventory saved to: {scan_output_path}")
print("\n📂 Top 10 entries:")
print(file_df.head(10))


✅ Directory re-scan complete.
📦 Inventory saved to: /home/rohit/Desktop/Projects/C-Elegans/utils/postmove_directory_inventory.csv

📂 Top 10 entries:
                   filename                  relative_path  \
0     .test.pharyngealC.mep     temp/.test.pharyngealC.mep   
1             ASJR.java.xml             temp/ASJR.java.xml   
2            URXL.morph.xml            temp/URXL.morph.xml   
3      LEMS_c302_A_Syns.xml      temp/LEMS_c302_A_Syns.xml   
4           SMDDR.morph.xml           temp/SMDDR.morph.xml   
5             ADFR.java.xml             temp/ADFR.java.xml   
6  .test.pharC1.jnmlnrn.omt  temp/.test.pharC1.jnmlnrn.omt   
7               M4.java.xml               temp/M4.java.xml   
8   Serotonin_Glutamate.xml   temp/Serotonin_Glutamate.xml   
9            VA10.morph.xml            temp/VA10.morph.xml   

                                           full_path extension  size_kb  
0  /home/rohit/Desktop/Projects/C-Elegans/data/te...      .mep     0.17  
1  /home/rohit/Deskt

In [19]:
import pandas as pd
import os
import json

# Load inventory
inventory_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/postmove_directory_inventory.csv"
df = pd.read_csv(inventory_path)

# Filter for data files
data_files = df[df['extension'].isin(['.csv', '.xls', '.xlsx'])]

# Extract column name previews
column_schema = {}

for _, row in data_files.iterrows():
    path = row['full_path']
    ext = row['extension']
    try:
        if ext == ".csv":
            cols = pd.read_csv(path, nrows=1).columns.tolist()
        elif ext in [".xls", ".xlsx"]:
            cols = pd.read_excel(path, nrows=1).columns.tolist()
        else:
            continue
        column_schema[row['relative_path']] = cols
    except Exception as e:
        column_schema[row['relative_path']] = f"⚠️ Error reading file: {str(e)}"

# Save to JSON
output_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/canonical_column_names.json"
with open(output_path, "w") as f:
    json.dump(column_schema, f, indent=2)

print(f"✅ Canonical column schema saved to:\n{output_path}")


✅ Canonical column schema saved to:
/home/rohit/Desktop/Projects/C-Elegans/utils/canonical_column_names.json


In [21]:
import pandas as pd
import os

# === Paths ===
base_dir = "/home/rohit/Desktop/Projects/C-Elegans"
utils_dir = os.path.join(base_dir, "utils")

pre_move_path = os.path.join("structured_data_inventory.csv")
post_move_path = os.path.join(utils_dir, "postmove_directory_inventory.csv")
report_path = os.path.join(utils_dir, "data_validation_report.csv")

# === Load inventories ===
pre = pd.read_csv(pre_move_path)
post = pd.read_csv(post_move_path)

# === Normalize for comparison ===
pre["basename"] = pre["full_path"].apply(os.path.basename)
post["basename"] = post["Path"].apply(os.path.basename)
pre["size_kb"] = pre["size_kb"].round(2)
post["Size (KB)"] = post["Size (KB)"].round(2)

# === Match on basename and size ===
pre["status"] = pre.apply(lambda row: (
    "✅ Moved"
    if ((post["basename"] == row["basename"]) & 
        (post["Size (KB)"] == row["size_kb"])).any()
    else "❌ Missing"
), axis=1)

# === Detect extra (new) files ===
extra = post[~post["basename"].isin(pre["basename"])].copy()
extra["status"] = "🆕 New (not in original inventory)"

# === Combine report ===
report = pd.concat([
    pre[["basename", "size_kb", "status"]],
    extra[["basename", "Size (KB)", "status"]].rename(columns={"Size (KB)": "size_kb"})
], ignore_index=True)

# === Save and show ===
report.to_csv(report_path, index=False)
print(f"📦 Validation report saved to:\n{report_path}")
report.head(10)


KeyError: 'Path'

In [22]:
import pandas as pd

postmove_path = "/home/rohit/Desktop/Projects/C-Elegans/utils/postmove_directory_inventory.csv"
post = pd.read_csv(postmove_path)

print("🧾 Columns in post-move inventory:")
print(post.columns.tolist())


🧾 Columns in post-move inventory:
['filename', 'relative_path', 'full_path', 'extension', 'size_kb']


In [24]:
import pandas as pd
import os

# === Paths ===
base_dir = "/home/rohit/Desktop/Projects/C-Elegans"
utils_dir = os.path.join(base_dir, "utils")

pre_move_path = "structured_data_inventory.csv"
post_move_path = os.path.join(utils_dir, "postmove_directory_inventory.csv")
report_path = os.path.join(utils_dir, "data_validation_report.csv")

# === Load inventories ===
pre = pd.read_csv(pre_move_path)
post = pd.read_csv(post_move_path)

# === Normalize for comparison ===
pre["basename"] = pre["full_path"].apply(os.path.basename)
post["basename"] = post["full_path"].apply(os.path.basename)
pre["size_kb"] = pre["size_kb"].round(2)
post["size_kb"] = post["size_kb"].round(2)

# === Match on basename and size ===
pre["status"] = pre.apply(lambda row: (
    "✅ Moved"
    if ((post["basename"] == row["basename"]) & 
        (post["size_kb"] == row["size_kb"])).any()
    else "❌ Missing"
), axis=1)

# === Detect extra (new) files ===
extra = post[~post["basename"].isin(pre["basename"])].copy()
extra["status"] = "🆕 New (not in original inventory)"
extra = extra.rename(columns={"size_kb": "size_kb"})  # For uniformity

# === Combine report ===
report = pd.concat([
    pre[["basename", "size_kb", "status"]],
    extra[["basename", "size_kb", "status"]]
], ignore_index=True)

# === Save report ===
report.to_csv(report_path, index=False)
print(f"✅ Data validation report saved:\n{report_path}")

# Preview first few mismatches
report[report["status"] != "✅ Moved"].head(10)


✅ Data validation report saved:
/home/rohit/Desktop/Projects/C-Elegans/utils/data_validation_report.csv


Unnamed: 0,basename,size_kb,status
2,celegans.ipynb,2766.29,❌ Missing
12,data_catalog.yml,0.74,❌ Missing
14,.openworm.yml,0.56,❌ Missing
16,.gitignore,4.29,❌ Missing
18,README.md,15.48,❌ Missing
19,.test.pharyngealC.mep,0.12,❌ Missing
20,LEMS_c302_A_Syns.xml,5.24,❌ Missing
22,c302_C1_Full.net.nml,1516.94,❌ Missing
23,c302_B_Oscillator.net.nml,29.42,❌ Missing
24,LEMS_c302_C_Oscillator.xml,10.15,❌ Missing


In [26]:
import os
import shutil
import pandas as pd

base_path = "/home/rohit/Desktop/Projects/C-Elegans"
missing_files = [
    "LEMS_c302_A_Syns.xml",
    "LEMS_c302_C_Oscillator.xml",
    "c302_B_Oscillator.net.nml",
    "c302_C1_Full.net.nml"
]
nml_target_dir = os.path.join(base_path, "data", "nml")
os.makedirs(nml_target_dir, exist_ok=True)

# Step 1: Move files
moved = []
for fname in missing_files:
    for root, _, files in os.walk(base_path):
        if fname in files:
            src = os.path.join(root, fname)
            dst = os.path.join(nml_target_dir, fname)
            shutil.move(src, dst)
            moved.append((fname, src, dst))
            break

# Step 2: Load and update manifest
manifest_path = os.path.join(base_path, "utils", "data_restructure_manifest_verified.csv")
manifest = pd.read_csv(manifest_path)

# New rows for moved files
new_rows = pd.DataFrame([{
    "basename": fname,
    "original_path": src,
    "canonical_path": dst,
    "category": "nml",
    "status": "✅ Moved"
} for fname, src, dst in moved])

# Concatenate and save
manifest = pd.concat([manifest, new_rows], ignore_index=True)
manifest.to_csv(manifest_path, index=False)

# Final print
print("✅ Moved the following NML/LEMS files to /data/nml/:")
for f in moved:
    print(f" - {f[0]}")

print(f"\n📦 Updated manifest saved to:\n{manifest_path}")


✅ Moved the following NML/LEMS files to /data/nml/:
 - LEMS_c302_A_Syns.xml
 - LEMS_c302_C_Oscillator.xml
 - c302_B_Oscillator.net.nml
 - c302_C1_Full.net.nml

📦 Updated manifest saved to:
/home/rohit/Desktop/Projects/C-Elegans/utils/data_restructure_manifest_verified.csv


In [27]:
import os
import pandas as pd

# === Paths ===
base_path = "/home/rohit/Desktop/Projects/C-Elegans"
manifest_path = os.path.join(base_path, "utils", "data_restructure_manifest_verified.csv")

# === Load manifest ===
manifest = pd.read_csv(manifest_path)
manifest["basename"] = manifest["canonical_path"].apply(os.path.basename)

# === Recursively walk the directory ===
file_records = []
for root, _, files in os.walk(base_path):
    for f in files:
        full_path = os.path.join(root, f)
        rel_path = os.path.relpath(full_path, base_path)
        file_records.append({
            "basename": f,
            "relative_path": rel_path,
            "full_path": full_path,
            "size_kb": os.path.getsize(full_path) / 1024
        })

scanned_df = pd.DataFrame(file_records)

# === Merge with manifest ===
merged = pd.merge(manifest, scanned_df, on="basename", how="outer", indicator=True)

# === Tag result ===
merged["status"] = merged["_merge"].map({
    "both": "✅ Verified",
    "left_only": "❌ Missing in Filesystem",
    "right_only": "🆕 Not in Manifest"
})

# === Export results ===
output_path = os.path.join(base_path, "utils", "final_verification_report.csv")
merged.to_csv(output_path, index=False)

# === Summary ===
summary = merged["status"].value_counts().to_dict()
print("✅ Post-Move Scan Complete:")
print(f" - Verified: {summary.get('✅ Verified', 0)}")
print(f" - Missing: {summary.get('❌ Missing in Filesystem', 0)}")
print(f" - Unexpected: {summary.get('🆕 Not in Manifest', 0)}")
print(f"\n📄 Report saved to:\n{output_path}")


✅ Post-Move Scan Complete:
 - Verified: 3226
 - Missing: 0
 - Unexpected: 6

📄 Report saved to:
/home/rohit/Desktop/Projects/C-Elegans/utils/final_verification_report.csv
