# Analyse results obtained from MAST webserver for ECOLI cohort

## Read data

In [2]:
import os
from pathlib import Path

import xml.etree.ElementTree as ET

import pandas as pd


data = []

baseDir = Path(os.environ['MEME_SUITE_BASE'], 'data', 'output', '20250211_1500')
for cohortDir in os.listdir(baseDir):
    if cohortDir.endswith('_cohort'):
        cohort = cohortDir.replace('_cohort', '')
        for representationTypeDir in os.listdir(Path(baseDir, cohortDir)):
            representationType = representationTypeDir.replace('_represented', '')
            for tokenDir in os.listdir(Path(baseDir, cohortDir, representationTypeDir)):
                token = tokenDir
                # with open(Path(baseDir, cohortDir, representationTypeDir, tokenDir, ))
                tree = ET.parse(Path(baseDir, cohortDir, representationTypeDir, tokenDir, 'mast.xml'))
                for sequences in tree.getroot()[8]:
                    name = sequences.attrib['name']
                    comment = sequences.attrib['comment']
                    length = sequences.attrib['length']
                    strand = sequences[0].attrib['strand']
                    combined_pvalue = sequences[0].attrib['combined_pvalue']
                    evalue = sequences[0].attrib['evalue']
                    data.append([cohort, representationType, token, name, comment, length, strand, combined_pvalue, evalue])

df = pd.DataFrame(data, columns=['cohort', 'representation_type', 'token', 'name', 'comment', 'length', 'strand', 'combined_pvalue', 'evalue'])
df

Unnamed: 0,cohort,representation_type,token,name,comment,length,strand,combined_pvalue,evalue
0,high_score,under,TGGCGCG,b0021|insB1,b0021; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
1,high_score,under,TGGCGCG,b0027|lspA,b0027; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
2,high_score,under,TGGCGCG,b0087|mraY,b0087; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
3,high_score,under,TGGCGCG,b0094|ftsA,b0094; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
4,high_score,under,TGGCGCG,b0102|zapD,b0102; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
...,...,...,...,...,...,...,...,...,...
280,high_score,over,CGTCTGG,b0927|ycbL,b0927; upstream from -26 to 200; size: 227; fe...,227,both,2.11e-02,95
281,high_score,over,CGTCTGG,b3732|atpD,b3732; upstream from -26 to 200; size: 227; fe...,227,both,2.11e-02,95
282,high_score,over,CGTCTGG,b0916|ycaQ,b0916; upstream from -36 to 200; size: 237; fe...,237,both,2.21e-02,99
283,high_score,over,CGTCTGG,b1650|nemA,b1650; upstream from -36 to 200; size: 237; fe...,237,both,2.21e-02,99


In [3]:
df[df.cohort == 'high_score']

Unnamed: 0,cohort,representation_type,token,name,comment,length,strand,combined_pvalue,evalue
0,high_score,under,TGGCGCG,b0021|insB1,b0021; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
1,high_score,under,TGGCGCG,b0027|lspA,b0027; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
2,high_score,under,TGGCGCG,b0087|mraY,b0087; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
3,high_score,under,TGGCGCG,b0094|ftsA,b0094; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
4,high_score,under,TGGCGCG,b0102|zapD,b0102; upstream from 0 to 200; size: 201; feat...,201,both,1.62e-02,73
...,...,...,...,...,...,...,...,...,...
280,high_score,over,CGTCTGG,b0927|ycbL,b0927; upstream from -26 to 200; size: 227; fe...,227,both,2.11e-02,95
281,high_score,over,CGTCTGG,b3732|atpD,b3732; upstream from -26 to 200; size: 227; fe...,227,both,2.11e-02,95
282,high_score,over,CGTCTGG,b0916|ycaQ,b0916; upstream from -36 to 200; size: 237; fe...,237,both,2.21e-02,99
283,high_score,over,CGTCTGG,b1650|nemA,b1650; upstream from -36 to 200; size: 237; fe...,237,both,2.21e-02,99


In [4]:
df[df.cohort == 'low_score']

Unnamed: 0,cohort,representation_type,token,name,comment,length,strand,combined_pvalue,evalue


## Save results

In [6]:
import os
from pathlib import Path


baseDir = Path(os.environ['MEME_SUITE_BASE'], 'data', 'output', '20250211_1500')

df.to_csv(Path(baseDir, 'summary.csv'), index=False)

## Analysis

## Number of unique tokens with hits

In [7]:
df.token.unique()

array(['TGGCGCG', 'CGTCTGG'], dtype=object)

In [8]:
df[['cohort', 'representation_type', 'token', 'name']].groupby(by=['cohort', 'representation_type', 'token']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name
cohort,representation_type,token,Unnamed: 3_level_1
high_score,over,CGTCTGG,112
high_score,under,TGGCGCG,173


## Gene names and counts

In [12]:
df[df.cohort == 'high_score'].name.value_counts().reset_index().rename(columns={'count': 'gene_count'})['gene_count'].value_counts().reset_index()

Unnamed: 0,gene_count,count
0,1,261
1,2,12


In [13]:
df[df.cohort == 'low_score'].name.value_counts().reset_index().rename(columns={'count': 'gene_count'})['gene_count'].value_counts().reset_index()

Unnamed: 0,gene_count,count


In [14]:
df.name.value_counts().reset_index().rename(columns={'count': 'gene_count'})['gene_count'].value_counts().reset_index()

Unnamed: 0,gene_count,count
0,1,261
1,2,12


In [16]:
df[df.cohort == 'high_score'].name.value_counts().reset_index().sort_values(by=['count'], ascending=False)[:12]

Unnamed: 0,name,count
5,b2341|fadJ,2
3,b1316|ycjT,2
1,b3790|wecD,2
0,b3248|yhdE,2
4,b1523|yneG,2
6,b0183|rnhB,2
7,b3952|pflC,2
11,b2907|ubiH,2
8,b0976|hyaE,2
9,b0351|mhpF,2


In [10]:
df[df.cohort == 'low_score'].name.value_counts().reset_index().sort_values(by=['count'], ascending=False)

Unnamed: 0,name,count


In [11]:
df.name.value_counts().reset_index().sort_values(by=['count'], ascending=False)

Unnamed: 0,name,count
5,b2341|fadJ,2
3,b1316|ycjT,2
1,b3790|wecD,2
0,b3248|yhdE,2
4,b1523|yneG,2
...,...,...
94,b2200|ccmB,1
93,b2196|ccmF,1
92,b2195|dsbE,1
91,b2056|wcaD,1
