In [None]:
# prompt: mount drive, and Chdir to MRSA Dataset

# Colab-only setup: mount Google Drive and make the project folder the working
# directory, so that later cells can refer to files by relative name
# (e.g. 'Master_Table.csv', 'fcgr-0.1-py3-none-any.whl').
from google.colab import drive
# Mount point used by every absolute path in this notebook.
drive.mount('/content/drive')
# IPython magic: changes the kernel's working directory (a plain `!cd` would not persist).
%cd /content/drive/MyDrive/MRSA datasets
# List the folder contents as a quick check that the expected files are present.
!ls


Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/13LGqe__ULhjG0hZmpgxz8WqE0fNXsVNQ/MRSA datasets
 abricate	        Datasets_fcgr.zip	   'Metadata Tables'
 Conversion_Table.csv   Data_Statistics.csv	    models
'Data Labels'	        fcgr-0.1-py3-none-any.whl   Models
 Datasets	        full_drug_summary.csv
 Datasets_fcgr	        Master_Table.csv


In [None]:
# Install the packages the conversion pipeline imports (`Bio.SeqIO` and `fcgr`).
# NOTE(review): no version is pinned for biopython, so a re-run may pull a different release.
!pip install biopython
# Local wheel, resolved relative to the current working directory — this cell
# must run after the cell that changes into the project folder.
!pip install fcgr-0.1-py3-none-any.whl

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Processing ./fcgr-0.1-py3-none-any.whl
Collecting collection (from fcgr==0.1)
  Downloading collection-0.1.6.tar.gz (5.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: collection
  Building wheel for collection (setup.py) ... [?25l[?25hdone
  Created wheel for collection: filename=collection-0.1.6-py3-none-any.whl size=5098 sha256=e21c0a166876f7ca2d3100a07a5850355c7b790912cf20028cf806f72f71b416
  Stored in directory: /root/.cache/pip/wheels/1c/b3/7a/6dd2fd1aa32817819c8c91da9a6b5919148da6eb5855ab0986
Successfully built c

In [None]:
# prompt: load Master_Table.csv and print columns

import pandas as pd

# Assuming Master_Table.csv is in the current directory
# If not, replace 'Master_Table.csv' with the full path
try:
  # low_memory=False makes pandas infer each column's dtype from the whole
  # file instead of chunk by chunk, so a wide table does not end up with
  # columns whose values have mixed types.
  df = pd.read_csv('Master_Table.csv', low_memory=False)
  print(df.columns)
except FileNotFoundError:
  # The path is relative, so this usually means the working directory is wrong.
  print("Error: Master_Table.csv not found in the current directory.")
except pd.errors.ParserError:
  print("Error: Could not parse Master_Table.csv. Please check the file format.")
except Exception as e:
  # Best-effort exploration cell: report anything else instead of raising.
  print(f"An unexpected error occurred: {e}")


  df = pd.read_csv('Master_Table.csv')


Index(['Species', 'Dataset', 'File Name', 'Streptomycin', 'Amikacin/Kanamycin',
       'Macrolides', 'Vancomycin', 'Ceftriaxone', 'Oxacillin', 'Clindamycin',
       'Fusidic acid', 'Beta-lactam', 'Erythromycin', 'Linezolid',
       'Methicillin', 'Gentamicin', 'Kanamycin/Tobramycin',
       'Trimethoprim-Sulfamethoxazole', 'Fosfomycin', 'Ciprofloxacin',
       'Tetracycline', 'Streptothricin', 'Nitrofurantoin', 'Fluoroquinolones',
       'Chloramphenicol', 'Penicillin', 'Trimethoprim', 'Lincosamide',
       'Daptomycin', 'Cefoxitin', 'Amikacin', 'Ampicillin-Sulbactam',
       'Cefepime', 'Cefotaxime', 'Ceftazidime', 'Colistin', 'Doripenem',
       'Imipenem', 'Levofloxacin', 'Meropenem', 'Minocycline', 'Moxifloxacin',
       'Polymyxin_B', 'Tigecycline', 'Tobramycin', 'Ampicillin',
       'Amoxicillin-Clavulanic acid', 'Azithromycin', 'Sulfisoxazole',
       'Kanamycin', 'Nalidixic acid', 'Ceftiofur', 'Amoxicillin',
       'Capreomycin', 'Clarithromycin', 'Clofazimine', 'Cycloserine',


In [None]:
#!/usr/bin/env python3
"""
FCGR Conversion Pipeline with Progress Logging (updated to use raw File Name)

- Reads Master_Table.csv with columns 'Species', 'Dataset', 'File Name'
- Ensures Conversion_Table.csv exists and includes all master samples
- Mirrors Datasets/ to Datasets_fcgr/
- Uses multiprocessing to generate FCGR matrices
- Loads FASTA via Biopython SeqIO without appending extra extensions
- Tracks progress with print statements
"""
import os
from pathlib import Path
import pandas as pd
import numpy as np
from multiprocessing import Pool
import fcgr  # your existing fcgr module
from Bio import SeqIO

# ---- CONFIG ----
# Project root on the mounted Google Drive; every other path is derived from it.
ROOT_DIR        = Path("/content/drive/MyDrive/MRSA datasets")
# Master sample list; main() reads it and uses its 'Species', 'Dataset' and 'File Name' columns.
MASTER_PATH     = ROOT_DIR / "Master_Table.csv"
# Per-sample status table: created by ensure_conversion_table(), rewritten by update_conversion().
CONVERSION_PATH = ROOT_DIR / "Conversion_Table.csv"
# Input tree, addressed as Datasets/<Species>/<Dataset>/<File Name>.
DATASETS_DIR    = ROOT_DIR / "Datasets"
# Output tree mirroring DATASETS_DIR; process_sample() writes <File Name>.npy here.
FCGR_DIR        = ROOT_DIR / "Datasets_fcgr"
# k-mer length passed to fcgr.chaos_frequency_matrix.
KMER_LENGTH     = 8
# Number of processes in the multiprocessing Pool created by main().
MAX_WORKERS     = 6  # adjust as needed

# ---- FASTA LOADER ----
def load_all_sequences(filepath):
    """
    Read every record of a FASTA file and return them as a single string.

    The file is parsed with Biopython's SeqIO in "fasta" format. Each record's
    sequence is upper-cased and the pieces are joined in file order with no
    separator between records.

    Args:
        filepath: Location of the .fasta/.fna file, passed straight to SeqIO.parse.

    Returns:
        str: The concatenated, upper-cased sequence ("" if no records were found).
    """
    chunks = []
    for rec in SeqIO.parse(filepath, "fasta"):
        chunks.append(str(rec.seq).upper())
    return "".join(chunks)

# ---- CONVERSION TABLE ----
def ensure_conversion_table(master_df):
    """
    Build the conversion-status table covering every sample in the master table.

    If CONVERSION_PATH does not exist yet, a new table is created from the
    master table's key columns with every sample marked 'not converted', and
    it is written to CONVERSION_PATH. Otherwise the stored table is left-merged
    onto the master keys, so samples that are new in the master table appear
    with status 'not converted'. In that case nothing is written here.

    Args:
        master_df: DataFrame containing at least the columns 'Species',
            'Dataset' and 'File Name'.

    Returns:
        pandas.DataFrame: One row per master-table row, with the key columns,
        a 'conversion_progress' column and any other columns stored in the
        existing table.
    """
    key_cols = ['Species', 'Dataset', 'File Name']
    print(f"[INFO] Loading/Creating conversion table at {CONVERSION_PATH}")
    if not CONVERSION_PATH.exists():
        df = master_df[key_cols].copy()
        df['conversion_progress'] = 'not converted'
        df.to_csv(CONVERSION_PATH, index=False)
        print(f"[INFO] Created new Conversion_Table.csv with {len(df)} entries")
        return df

    existing = pd.read_csv(CONVERSION_PATH)
    # Keep a single status row per sample before merging. A left merge emits
    # one output row per matching pair, so duplicated keys in the stored table
    # would multiply rows, and because the merged table is later written back
    # the growth would compound on every run.
    existing_unique = existing.drop_duplicates(subset=key_cols, keep='last')
    merged = master_df[key_cols].merge(existing_unique, on=key_cols, how='left')
    # Samples absent from the stored table have no status yet.
    merged['conversion_progress'] = merged['conversion_progress'].fillna('not converted')
    print(f"[INFO] Loaded existing Conversion_Table.csv with {len(existing)} entries; merged to {len(merged)} entries")
    return merged

# ---- DIR MIRROR ----
def create_fcgr_dir_structure():
    """
    Recreate the two-level folder layout of DATASETS_DIR under FCGR_DIR.

    For every <species>/<dataset> directory pair found in DATASETS_DIR, the
    matching FCGR_DIR/<species>/<dataset> directory is created if missing.
    Plain files at either level are ignored, and no files are copied.
    """
    print(f"[INFO] Mirroring directory structure from {DATASETS_DIR} to {FCGR_DIR}")
    FCGR_DIR.mkdir(exist_ok=True)
    for species_dir in DATASETS_DIR.iterdir():
        if not species_dir.is_dir():
            continue
        for dataset_dir in species_dir.iterdir():
            if not dataset_dir.is_dir():
                continue
            target = FCGR_DIR / species_dir.name / dataset_dir.name
            target.mkdir(parents=True, exist_ok=True)
    print(f"[INFO] Directory structure ready.")

# ---- TASK BUILDER ----
def build_tasks(df):
    """
    Collect the samples that still need to be converted.

    Args:
        df: Conversion table with the columns 'Species', 'Dataset',
            'File Name' and 'conversion_progress'.

    Returns:
        list[tuple]: One (species, dataset, file_name) tuple, in table order,
        for each row whose status is 'not converted' or 'converting'.
    """
    pending_states = ('not converted', 'converting')
    tasks = []
    for _, record in df.iterrows():
        if record['conversion_progress'] not in pending_states:
            continue
        tasks.append((record['Species'], record['Dataset'], record['File Name']))
    print(f"[INFO] Found {len(tasks)} samples to process.")
    return tasks

# ---- PROCESS SINGLE SAMPLE ----
def process_sample(species, dataset, file_name):
    """
    Convert one FASTA file into an FCGR frequency matrix stored as a .npy file.

    The input is DATASETS_DIR/<species>/<dataset>/<file_name> and the output is
    FCGR_DIR/<species>/<dataset>/<file_name>.npy. Any existing output is
    deleted first, so an earlier result is never mistaken for the result of
    this run.

    Every failure (including a failed delete or a file with no sequence data)
    is caught and reported through the returned status instead of being
    raised, so that one bad sample cannot abort the whole worker pool.

    Args:
        species: Species folder name.
        dataset: Dataset folder name inside the species folder.
        file_name: FASTA file name, including its extension.

    Returns:
        tuple: (species, dataset, file_name, status) where status is
        'converted' on success or 'error: <message>' on failure.
    """
    # Use raw file_name (including extension) without adding .fasta
    fasta_path = DATASETS_DIR / species / dataset / file_name
    out_path   = FCGR_DIR   / species / dataset / f"{file_name}.npy"
    worker_id = os.getpid()
    print(f"[WORKER {worker_id}] Starting {species}/{dataset}/{file_name}")
    try:
        # Kept inside the try block: an exception escaping a worker would make
        # the parent's result collection raise and discard all other results.
        if out_path.exists():
            os.remove(out_path)
        seq_str = load_all_sequences(str(fasta_path))
        # A file that yields no sequence must not be recorded as 'converted'.
        if not seq_str:
            raise ValueError(f"no sequence data found in {fasta_path}")
        freq_matrix, _ = fcgr.chaos_frequency_matrix(
            fasta_string=seq_str,
            kmer_length=KMER_LENGTH,
            chaos_game_kmer_array=None,
            pseudo_count=False
        )
        np.save(out_path, freq_matrix)
        print(f"[WORKER {worker_id}] Completed {species}/{dataset}/{file_name}")
        return (species, dataset, file_name, 'converted')
    except Exception as e:
        print(f"[WORKER {worker_id}] Error {species}/{dataset}/{file_name}: {e}")
        return (species, dataset, file_name, f'error: {e}')

# ---- UPDATE TABLE ----
def update_conversion(conversion_df, results):
    """
    Record conversion outcomes in the table and save it to CONVERSION_PATH.

    Args:
        conversion_df: Conversion table; its 'conversion_progress' column is
            modified in place.
        results: Iterable of (species, dataset, file_name, status) tuples as
            returned by process_sample.

    Every row matching a result's three key values receives that status; the
    whole table is then written to CONVERSION_PATH without the index.
    """
    print(f"[INFO] Updating conversion table with results...")
    for species, dataset, file_name, outcome in results:
        same_species = conversion_df['Species'] == species
        same_dataset = conversion_df['Dataset'] == dataset
        same_file = conversion_df['File Name'] == file_name
        row_mask = same_species & same_dataset & same_file
        conversion_df.loc[row_mask, 'conversion_progress'] = outcome
    conversion_df.to_csv(CONVERSION_PATH, index=False)
    print(f"[INFO] Conversion table updated.")

# ---- MAIN ----
def main():
    """
    Run the full FCGR conversion pipeline.

    Steps: read the master table, build or refresh the conversion table,
    mirror the dataset folder layout, then convert every pending sample in a
    pool of MAX_WORKERS processes.

    Statuses are written to the conversion table in batches while the pool is
    running, and once more on the way out (even after an error or interrupt).
    An aborted session therefore keeps the progress made so far instead of
    redoing every sample on the next run.
    """
    # Number of finished samples to accumulate before saving the table.
    checkpoint_every = 100

    print(f"[INFO] Reading master table from {MASTER_PATH}")
    # low_memory=False: infer each column's dtype from the whole file rather
    # than per chunk, so columns do not end up holding mixed types.
    master = pd.read_csv(MASTER_PATH, low_memory=False)
    conversion_df = ensure_conversion_table(master)
    create_fcgr_dir_structure()
    tasks = build_tasks(conversion_df)
    if not tasks:
        print("[INFO] Nothing to process.")
        return
    print(f"[INFO] Processing {len(tasks)} samples with {MAX_WORKERS} workers...")
    unsaved = []
    with Pool(MAX_WORKERS) as pool:
        # Submit everything up front so all workers stay busy; results are
        # then collected in submission order.
        handles = [pool.apply_async(process_sample, task) for task in tasks]
        try:
            for handle in handles:
                unsaved.append(handle.get())
                if len(unsaved) >= checkpoint_every:
                    update_conversion(conversion_df, unsaved)
                    unsaved = []
        finally:
            # Flush whatever finished since the last checkpoint.
            if unsaved:
                update_conversion(conversion_df, unsaved)
    print("[INFO] Conversion complete.")

if __name__ == '__main__':
    main()



[INFO] Reading master table from /content/drive/MyDrive/MRSA datasets/Master_Table.csv


  master = pd.read_csv(MASTER_PATH)


[INFO] Loading/Creating conversion table at /content/drive/MyDrive/MRSA datasets/Conversion_Table.csv
[INFO] Loaded existing Conversion_Table.csv with 19138 entries; merged to 20710 entries
[INFO] Mirroring directory structure from /content/drive/MyDrive/MRSA datasets/Datasets to /content/drive/MyDrive/MRSA datasets/Datasets_fcgr
[INFO] Directory structure ready.
[INFO] Found 1556 samples to process.
[INFO] Processing 1556 samples with 6 workers...
[WORKER 1111] Starting Klebsiella pneumoniae/PATRIC_klebsiella/573.12858.fna[WORKER 1114] Starting Klebsiella pneumoniae/PATRIC_klebsiella/573.13080.fna[WORKER 1115] Starting Klebsiella pneumoniae/PATRIC_klebsiella/573.13152.fna[WORKER 1113] Starting Klebsiella pneumoniae/PATRIC_klebsiella/573.13005.fna
[WORKER 1116] Starting Klebsiella pneumoniae/PATRIC_klebsiella/573.13229.fna[WORKER 1112] Starting Klebsiella pneumoniae/PATRIC_klebsiella/573.12933.fna




[WORKER 1116] Completed Klebsiella pneumoniae/PATRIC_klebsiella/573.13229.fna
[WORKER