# Preprocess mwTab Files from Metabolomics Workbench
## Study ID: ST000385 and ST000386

In [1]:
# Importing Modules
import pandas as pd
import numpy as np

In [2]:
# Function to convert mwTab to TSV format
_MWTAB_DATA_START = 'MS_METABOLITE_DATA_START'
_MWTAB_DATA_END = 'MS_METABOLITE_DATA_END'


def _split_cells(raw_line):
    """Split a raw line on tabs, keeping empty cells in every position."""
    # Only the line terminator is removed before splitting: stripping the whole
    # line would also eat leading/trailing tabs, i.e. empty (missing) cells.
    return [cell.strip() for cell in raw_line.rstrip('\r\n').split('\t')]


def _fit_to_width(cells, width, label):
    """Return *cells* padded or trimmed to exactly *width* entries.

    Short rows are padded with empty strings (missing values). Surplus cells
    are tolerated only when they are all empty (e.g. trailing tabs); otherwise
    a ValueError is raised because values could not be assigned to a sample.
    """
    if len(cells) < width:
        return cells + [''] * (width - len(cells))
    if any(cells[width:]):
        raise ValueError(
            f"{label} has {len(cells)} values but {width} samples were declared."
        )
    return cells[:width]


def _health_state(factor_string):
    """Return the 'Health State' value of a factor string, or 'NA' if absent."""
    for part in factor_string.split('|'):
        if 'Health State:' in part:
            return part.split(':', 1)[1].strip()
    return 'NA'


def _read_mwtab_data_block(input_file):
    """Read the MS metabolite data block of an mwTab file.

    Returns a tuple ``(sample_ids, classes, metabolite_names, feature_matrix)``
    where ``feature_matrix`` holds one list of string values per metabolite,
    each exactly ``len(sample_ids)`` long.
    """
    sample_ids, classes, metabolite_names, feature_matrix = [], [], [], []
    in_data_block = False

    with open(input_file, 'r', encoding='utf-8') as f_in:
        for raw_line in f_in:
            marker = raw_line.strip()
            if not marker:
                continue

            if marker == _MWTAB_DATA_START:
                in_data_block = True
                # The two lines right after the start marker are the sample
                # IDs and the per-sample factor strings.
                sample_line = next(f_in, None)
                factor_line = next(f_in, None)
                if sample_line is None or factor_line is None:
                    raise ValueError(
                        f"'{input_file}' ended before the sample and factor "
                        f"lines following {_MWTAB_DATA_START}."
                    )
                sample_ids = _split_cells(sample_line)[1:]
                # Trailing tabs on the samples line do not declare samples.
                while sample_ids and not sample_ids[-1]:
                    sample_ids.pop()
                factors = _fit_to_width(
                    _split_cells(factor_line)[1:], len(sample_ids), "The factors line"
                )
                classes = [_health_state(factor) for factor in factors]
                continue

            if marker == _MWTAB_DATA_END:
                break

            if in_data_block:
                cells = _split_cells(raw_line)
                metabolite_names.append(cells[0])
                feature_matrix.append(
                    _fit_to_width(cells[1:], len(sample_ids), f"Row '{cells[0]}'")
                )

    return sample_ids, classes, metabolite_names, feature_matrix


def parse_mwTab(input_file, output_file):
    """Convert the MS metabolite data block of an mwTab file to a TSV file.

    The output has one row per sample with the columns ``SampleID``, ``class``
    and one column per metabolite. ``class`` is taken from the
    ``Health State:`` entry of the sample's factor string; samples without
    such an entry are labelled 'NA' and are not written. Empty cells in the
    input are kept as empty fields in the output.

    Progress and errors are reported with ``print``; the function returns
    ``None`` and does not raise for a missing, unreadable or malformed file.

    Args:
        input_file: Path of the mwTab text file to read.
        output_file: Path of the tab-separated file to write.
    """
    print(f"Starting to parse '{input_file}'...")

    try:
        sample_ids, classes, metabolite_names, feature_matrix = (
            _read_mwtab_data_block(input_file)
        )
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
        return
    except (OSError, ValueError) as e:
        print(f"An error occurred: {e}")
        return

    print(f"Parsed {len(sample_ids)} total samples and {len(metabolite_names)} features.")

    # Transpose metabolite-major rows into one tuple of values per sample.
    # With no metabolite rows every sample simply has no feature values.
    if feature_matrix:
        per_sample_values = list(zip(*feature_matrix))
    else:
        per_sample_values = [()] * len(sample_ids)

    header = ['SampleID', 'class'] + metabolite_names
    rows_written = 0
    try:
        with open(output_file, 'w', encoding='utf-8') as f_out:
            f_out.write('\t'.join(header) + '\n')
            for sample_id, cls, values in zip(sample_ids, classes, per_sample_values):
                if cls == 'NA':
                    continue
                f_out.write('\t'.join([sample_id, cls, *values]) + '\n')
                rows_written += 1
    except OSError as e:
        print(f"An error occurred: {e}")
        return

    print(f"\nSuccessfully generated '{output_file}'.")
    print(f"Total rows written: {rows_written} (Filtered out {len(sample_ids) - rows_written} 'NA' samples).")
    print(f"Total columns: {len(header)}")

# Usage: parse_mwTab('ST000385_AN000620.txt', 'ST000385_processed.tsv')

In [3]:
# Function to incorporate 10% missing values
def add_missing_values(df, missing_fraction=0.1, random_state=None):
    """Blank out a random subset of feature values in *df*.

    Every cell outside the ``SampleID`` and ``class`` columns is replaced by
    NaN independently with probability *missing_fraction*. The frame is
    modified in place and also returned.

    Args:
        df: DataFrame whose feature columns receive missing values.
        missing_fraction: Probability in [0, 1] that a cell is blanked.
        random_state: Optional seed for a reproducible mask. When ``None``
            the global NumPy random state is used.

    Returns:
        The same DataFrame object, with missing values inserted.

    Raises:
        ValueError: If *missing_fraction* is outside [0, 1].
    """
    if not 0 <= missing_fraction <= 1:
        raise ValueError("missing_fraction must be between 0 and 1.")

    exclude_cols = {'SampleID', 'class'}
    cols = [c for c in df.columns if c not in exclude_cols]
    if not cols:
        return df

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Keep the mask as a plain ndarray so it is applied by position. A
    # DataFrame mask is aligned on row labels instead, which goes wrong when
    # df's index is not a unique 0..n-1 range (e.g. after concat or filtering).
    mask = rng.rand(df.shape[0], len(cols)) < missing_fraction
    df[cols] = df[cols].mask(mask)
    return df

In [4]:
# Function to Downsample a class
def downsample(df, class_, keep_fraction=0.9, random_state=42):
    """Randomly drop part of the rows belonging to one class.

    Rows whose ``class`` column equals *class_* are sampled without
    replacement down to *keep_fraction* of their number; all other rows are
    kept unchanged. The input frame is not modified.

    Args:
        df: DataFrame with a ``class`` column.
        class_: Label of the class to downsample.
        keep_fraction: Fraction in [0, 1] of the *class_* rows to keep.
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the sampled *class_* rows
        first, followed by the remaining rows in their original order.

    Raises:
        ValueError: If *keep_fraction* is outside [0, 1].
    """
    if not 0 <= keep_fraction <= 1:
        raise ValueError("keep_fraction must be between 0 and 1.")

    is_target = df['class'] == class_
    sampled_rows = df[is_target].sample(frac=keep_fraction, random_state=random_state)
    other_rows = df[~is_target]
    return pd.concat([sampled_rows, other_rows], ignore_index=True)

In [5]:
# Convert the ST000385 mwTab export into a tab-separated table
# (samples whose class is 'NA' are left out by parse_mwTab)
parse_mwTab(
    input_file="ST000385_AN000620.txt",
    output_file="ST000385_processed.tsv",
)

Starting to parse 'ST000385_AN000620.txt'...
Parsed 192 total samples and 152 features.

Successfully generated 'ST000385_processed.tsv'.
Total rows written: 172 (Filtered out 20 'NA' samples).
Total columns: 154


In [6]:
# Convert the ST000386 mwTab export into a tab-separated table
# (samples whose class is 'NA' are left out by parse_mwTab)
parse_mwTab(
    input_file="ST000386_AN000621.txt",
    output_file="ST000386_processed.tsv",
)

Starting to parse 'ST000386_AN000621.txt'...
Parsed 180 total samples and 181 features.

Successfully generated 'ST000386_processed.tsv'.
Total rows written: 162 (Filtered out 18 'NA' samples).
Total columns: 183


In [7]:
# Load both processed tab-separated tables into DataFrames
data1, data2 = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("ST000385_processed.tsv", "ST000386_processed.tsv")
)

In [8]:
# Combine the datasets, keeping only the columns present in both (join='inner').
# ignore_index=True gives every row a unique label; otherwise the row labels of
# data1 and data2 repeat, which breaks label-aligned operations on the result.
final_data = pd.concat([data1, data2], axis=0, join='inner', ignore_index=True)

# Remove samples labelled Adenosquamous or Adenocarcnoma (very few samples).
# NOTE(review): 'Adenocarcnoma' is presumably the spelling used in the source
# data -- confirm against the class column before "correcting" it.
rare_classes = ["Adenosquamous", "Adenocarcnoma"]
final_data = final_data[~final_data["class"].isin(rare_classes)]
# Renumber the rows so the index is a gap-free 0..n-1 range again.
final_data = final_data.reset_index(drop=True)

In [9]:
# Blank out a random subset of the feature values to simulate missing data
final_data = add_missing_values(df=final_data)

In [10]:
# Perform downsampling of the Healthy class
final_data = downsample(df=final_data, class_="Healthy")

In [11]:
# Shuffle the rows, renumber them, and write the final data as CSV
final_data = final_data.sample(frac=1).reset_index(drop=True)
# `quoting` takes a csv.QUOTE_* constant, not a bool, so it is left at its
# default (csv.QUOTE_MINIMAL); the written file is the same as with False (0).
final_data.to_csv("metabolomics_data.csv", index=False)

In [12]:
# Final data shape as (number of rows, number of columns)
final_data.shape

(307, 139)