# Analyse results obtained from MAST webserver for SAUR cohort

## Read data

In [1]:
import os
from pathlib import Path

import xml.etree.ElementTree as ET

import pandas as pd


data = []

baseDir = Path(os.environ['MEME_SUITE_BASE'], 'data', 'output', 'saur_20250212_1700')
for cohortDir in os.listdir(baseDir):
    if cohortDir.endswith('_cohort'):
        cohort = cohortDir.replace('_cohort', '')
        for representationTypeDir in os.listdir(Path(baseDir, cohortDir)):
            representationType = representationTypeDir.replace('_represented', '')
            for tokenDir in os.listdir(Path(baseDir, cohortDir, representationTypeDir)):
                token = tokenDir
                # with open(Path(baseDir, cohortDir, representationTypeDir, tokenDir, ))
                tree = ET.parse(Path(baseDir, cohortDir, representationTypeDir, tokenDir, 'mast.xml'))
                for sequences in tree.getroot()[8]:
                    name = sequences.attrib['name']
                    comment = sequences.attrib['comment']
                    length = sequences.attrib['length']
                    strand = sequences[0].attrib['strand']
                    combined_pvalue = sequences[0].attrib['combined_pvalue']
                    evalue = sequences[0].attrib['evalue']
                    data.append([cohort, representationType, token, name, comment, length, strand, combined_pvalue, evalue])

df = pd.DataFrame(data, columns=['cohort', 'representation_type', 'token', 'name', 'comment', 'length', 'strand', 'combined_pvalue', 'evalue'])
df

Unnamed: 0,cohort,representation_type,token,name,comment,length,strand,combined_pvalue,evalue
0,high_score,under,TTCCTCA,SAB_RS00745|SAB_RS00745,SAB_RS00745; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
1,high_score,under,TTCCTCA,SAB_RS01645|SAB_RS01645,SAB_RS01645; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
2,high_score,under,TTCCTCA,SAB_RS03485|SAB_RS03485,SAB_RS03485; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
3,high_score,under,TTCCTCA,SAB_RS04145|SAB_RS04145,SAB_RS04145; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
4,high_score,under,TTCCTCA,SAB_RS04290|SAB_RS04290,SAB_RS04290; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
...,...,...,...,...,...,...,...,...,...
857,high_score,over,CGCCC,SAB_RS05720|SAB_RS05720,SAB_RS05720; upstream from -27 to 200; size: 2...,228,both,3.51e-02,98
858,high_score,over,CGCCC,SAB_RS05120|SAB_RS05120,SAB_RS05120; upstream from -28 to 200; size: 2...,229,both,3.53e-02,98
859,high_score,over,CGCCC,SAB_RS13835|SAB_RS13835,SAB_RS13835; upstream from -28 to 200; size: 2...,229,both,3.53e-02,98
860,high_score,over,CGCCC,SAB_RS00925|SAB_RS00925,SAB_RS00925; upstream from -29 to 200; size: 2...,230,both,3.54e-02,99


In [2]:
df[df.cohort == 'high_score']

Unnamed: 0,cohort,representation_type,token,name,comment,length,strand,combined_pvalue,evalue
0,high_score,under,TTCCTCA,SAB_RS00745|SAB_RS00745,SAB_RS00745; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
1,high_score,under,TTCCTCA,SAB_RS01645|SAB_RS01645,SAB_RS01645; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
2,high_score,under,TTCCTCA,SAB_RS03485|SAB_RS03485,SAB_RS03485; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
3,high_score,under,TTCCTCA,SAB_RS04145|SAB_RS04145,SAB_RS04145; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
4,high_score,under,TTCCTCA,SAB_RS04290|SAB_RS04290,SAB_RS04290; upstream from 0 to 200; size: 201...,201,both,1.98e-02,55
...,...,...,...,...,...,...,...,...,...
857,high_score,over,CGCCC,SAB_RS05720|SAB_RS05720,SAB_RS05720; upstream from -27 to 200; size: 2...,228,both,3.51e-02,98
858,high_score,over,CGCCC,SAB_RS05120|SAB_RS05120,SAB_RS05120; upstream from -28 to 200; size: 2...,229,both,3.53e-02,98
859,high_score,over,CGCCC,SAB_RS13835|SAB_RS13835,SAB_RS13835; upstream from -28 to 200; size: 2...,229,both,3.53e-02,98
860,high_score,over,CGCCC,SAB_RS00925|SAB_RS00925,SAB_RS00925; upstream from -29 to 200; size: 2...,230,both,3.54e-02,99


In [3]:
df[df.cohort == 'low_score']

Unnamed: 0,cohort,representation_type,token,name,comment,length,strand,combined_pvalue,evalue


## Save results

In [4]:
import os
from pathlib import Path


baseDir = Path(os.environ['MEME_SUITE_BASE'], 'data', 'output', 'saur_20250212_1700')

df.to_csv(Path(baseDir, 'summary.csv'), index=False)

## Analysis

## Number of unique tokens with hits

In [5]:
df.token.unique()

array(['TTCCTCA', 'CCTGCCTT', 'CCTCTTC', 'TTTCTTTTTT', 'CAGTGTT',
       'TGACTTTT', 'TCAGTCT', 'GTGATCTG', 'CCCATG', 'GTTCCC', 'CTTCTTTC',
       'CATCCTT', 'AGCAGCA', 'ACACATACA', 'TGTTTTCT', 'CCCAAATA',
       'TCTCCCA', 'CGCCC'], dtype=object)

In [6]:
df[['cohort', 'representation_type', 'token', 'name']].groupby(by=['cohort', 'representation_type', 'token']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name
cohort,representation_type,token,Unnamed: 3_level_1
high_score,over,ACACATACA,52
high_score,over,AGCAGCA,91
high_score,over,CCCAAATA,28
high_score,over,CGCCC,72
high_score,over,TCTCCCA,55
high_score,over,TGTTTTCT,48
high_score,under,CAGTGTT,44
high_score,under,CATCCTT,48
high_score,under,CCCATG,43
high_score,under,CCTCTTC,71


## Gene names and counts

In [7]:
df[df.cohort == 'high_score'].name.value_counts().reset_index().sort_values(by=['count'], ascending=False)

Unnamed: 0,name,count
4,SAB_RS01070|SAB_RS01070,3
3,SAB_RS06615|SAB_RS06615,3
1,SAB_RS07995|aspS,3
0,SAB_RS09820|SAB_RS09820,3
5,SAB_RS09715|SAB_RS09715,3
...,...,...
321,SAB_RS01725|SAB_RS01725,1
322,SAB_RS11345|SAB_RS11345,1
323,SAB_RS06060|SAB_RS06060,1
324,SAB_RS01305|SAB_RS01305,1


In [8]:
df[df.cohort == 'low_score'].name.value_counts().reset_index().sort_values(by=['count'], ascending=False)

Unnamed: 0,name,count


In [9]:
df.name.value_counts().reset_index().sort_values(by=['count'], ascending=False)

Unnamed: 0,name,count
4,SAB_RS01070|SAB_RS01070,3
3,SAB_RS06615|SAB_RS06615,3
1,SAB_RS07995|aspS,3
0,SAB_RS09820|SAB_RS09820,3
5,SAB_RS09715|SAB_RS09715,3
...,...,...
321,SAB_RS01725|SAB_RS01725,1
322,SAB_RS11345|SAB_RS11345,1
323,SAB_RS06060|SAB_RS06060,1
324,SAB_RS01305|SAB_RS01305,1


In [10]:
df[df.cohort == 'high_score'].name.value_counts().reset_index().rename(columns={'count': 'gene_count'})['gene_count'].value_counts().reset_index()

Unnamed: 0,gene_count,count
0,1,628
1,2,105
2,3,8


In [11]:
df[df.cohort == 'low_score'].name.value_counts().reset_index().rename(columns={'count': 'gene_count'})['gene_count'].value_counts().reset_index()

Unnamed: 0,gene_count,count


In [12]:
df.name.value_counts().reset_index().rename(columns={'count': 'gene_count'})['gene_count'].value_counts().reset_index()

Unnamed: 0,gene_count,count
0,1,628
1,2,105
2,3,8
