# Save results obtained from MAST webserver to a csv file

## Read data

In [None]:
import os
from pathlib import Path

import xml.etree.ElementTree as ET

import pandas as pd


# Collect one row per sequence entry from every mast.xml found under
# baseDir/<cohort dir>/<representation type dir>/<token dir>/ and put the
# rows into a DataFrame.
data = []

baseDir = Path(os.environ['MEME_SUITE_BASE'], 'data', 'output', '20250130_1130')

# Only descend into directories. The "Save results" cell writes summary.csv
# directly into baseDir, so an unfiltered listing would pick that file up as a
# cohort directory on a re-run and fail with NotADirectoryError.
for cohortDir in (entry for entry in baseDir.iterdir() if entry.is_dir()):
    cohort = cohortDir.name.replace('_cohort', '')
    for representationTypeDir in (entry for entry in cohortDir.iterdir() if entry.is_dir()):
        representationType = representationTypeDir.name.replace('_represented', '')
        for tokenDir in (entry for entry in representationTypeDir.iterdir() if entry.is_dir()):
            # The token directory name is used verbatim as the token.
            token = tokenDir.name
            tree = ET.parse(tokenDir / 'mast.xml')
            # NOTE(review): the 9th child of the XML root is assumed to be the
            # container of the per-sequence elements - confirm against the
            # MAST XML schema of the version that produced these files.
            for sequence in tree.getroot()[8]:
                # The first child of a sequence element carries the scores.
                score = sequence[0]
                data.append([
                    cohort,
                    representationType,
                    token,
                    sequence.attrib['name'],
                    sequence.attrib['comment'],
                    sequence.attrib['length'],
                    score.attrib['strand'],
                    score.attrib['combined_pvalue'],
                    score.attrib['evalue'],
                ])

# All attribute values are kept as the strings read from the XML.
df = pd.DataFrame(
    data,
    columns=[
        'cohort',
        'representation_type',
        'token',
        'name',
        'comment',
        'length',
        'strand',
        'combined_pvalue',
        'evalue',
    ],
)
df

representationType under
representationType over


Unnamed: 0,cohort,representation_type,token,name,comment,length,strand,combined_pvalue,evalue
0,high_score,over,TGATTTTT,b0004|thrC,b0004; upstream from 0 to 200; size: 201; feat...,201,both,8.90e-03,40
1,high_score,over,TGATTTTT,b0153|fhuB,b0153; upstream from 0 to 200; size: 201; feat...,201,both,8.90e-03,40
2,high_score,over,TGATTTTT,b0337|codA,b0337; upstream from 0 to 200; size: 201; feat...,201,both,8.90e-03,40
3,high_score,over,TGATTTTT,b4509|ylcG,b4509; upstream from 0 to 200; size: 201; feat...,201,both,8.90e-03,40
4,high_score,over,TGATTTTT,b0716|ybgO,b0716; upstream from 0 to 200; size: 201; feat...,201,both,8.90e-03,40
...,...,...,...,...,...,...,...,...,...
384,high_score,over,TCCTTTCC,b4055|aphA,b4055; upstream from -401 to 200; size: 602; f...,602,both,1.76e-02,79
385,high_score,over,TCCTTTCC,b0750|nadA,b0750; upstream from -432 to 200; size: 633; f...,633,both,1.85e-02,83
386,high_score,over,TCCTTTCC,b4440|ryfA,b4440; upstream from -515 to 200; size: 716; f...,716,both,2.10e-02,94
387,high_score,over,TCCTTTCC,b0112|aroP,b0112; upstream from -540 to 200; size: 741; f...,741,both,2.17e-02,98


## Save results

In [46]:
import os
from pathlib import Path


# Write the collected rows to summary.csv inside this run's output directory.
baseDir = Path(os.environ['MEME_SUITE_BASE']) / 'data' / 'output' / '20250130_1130'
summaryPath = baseDir / 'summary.csv'

# index=False leaves the DataFrame's row index out of the file.
df.to_csv(summaryPath, index=False)

## Analysis

## Unique tokens with hits

In [47]:
# Distinct values of the 'token' column, in order of first appearance.
df['token'].unique()

array(['TGATTTTT', 'ATTTCCTT', 'AAGAAAAAAA', 'TCCTTTCC'], dtype=object)

## Gene names and counts

In [49]:
# Occurrences of each value in the 'name' column, as a table ordered by count (descending).
nameCounts = df['name'].value_counts().reset_index()
nameCounts.sort_values(by='count', ascending=False)

Unnamed: 0,name,count
8,b1742|ves,2
10,b3407|yhgF,2
9,b4593|ymgI,2
5,b2845|yqeG,2
4,b3187|ispB,2
...,...,...
128,b4410|ecnA,1
127,b1079|flgH,1
126,b1803|yeaX,1
125,b1158|pinE,1


In [52]:
# How many names occur once, twice, ...: count the per-name occurrence counts.
# The intermediate Series is renamed so the resulting column is 'gene_count'.
occurrencesPerName = df['name'].value_counts().rename('gene_count')
occurrencesPerName.value_counts().reset_index()

Unnamed: 0,gene_count,count
0,1,359
1,2,15
