In [1]:
import import_ipynb
import part3_dataFilter
import part1_duplicateAnalysis
import pandas as pd

/ceph/projects/179_Oncdon/shawn.loo/workspace/notebooks
size of raw main dataset is:  (4624, 440)
size of raw dataset V107 is:  (4624, 302)
size of raw dataset V142 is:  (4624, 89)
size of raw dataset E975 is:  (4624, 51)
(4624, 144)
(4624, 146)
(4624, 42)
(4624, 42)
(4624, 48)


In [2]:
# Collect the three per-flowcell tables exposed by the part3_dataFilter
# notebook. The list order (V107, V142, E975) matters: later cells pair
# entries of `dataset` with flowcell ids by position.
dataset = [
    part3_dataFilter.V107_sup_df,
    part3_dataFilter.V142_fece_df,
    part3_dataFilter.E975_br_df,
]

# Keep the individual names available as well; they refer to the same objects.
V107_sup_df, V142_fece_df, E975_br_df = dataset

# Metadata collection exposed by the part1_duplicateAnalysis notebook.
metadata_set = part1_duplicateAnalysis.metadata_sets

In [21]:
# NOTE(review): despite the variable name, min(..., key=len) picks the
# SHORTEST taxon_name in dataset[1] (the first one found on ties); use
# max(..., key=len) if the longest name is what is actually wanted.
# NOTE(review): this reads "taxon_name" as a column, so it presumably only
# works after the later cell that calls reset_index() on each dataset - confirm.
longest_string = min(dataset[1]["taxon_name"], key=len)
print(longest_string)
print(len(longest_string))

k__Archaea
10


# Microbiome Filtering

We will be filtering three datasets, stored in the `dataset` list and accessible via:  
`dataset[0]` (flowcell V350218107)  
`dataset[1]` (flowcell V350218142)  
`dataset[2]` (flowcell E100051975)

In microbiome analysis, filtering can be separated into the following categories:

1. Taxonomic Filtering
2. Prevalence Filtering
3. Low Abundance Filtering

## Taxonomic Filtering

Filters the taxa down to a specific taxonomic level.

Done via regex filtering

In [3]:
def taxo_filter(df, taxa_class = "p"):
    """Keep only the rows whose ``taxon_name`` matches the taxonomic-rank pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a string column named ``taxon_name``.
    taxa_class : str, default "p"
        Rank prefix letter interpolated into the regex (the default "p"
        produces the token ``p_``, i.e. phylum and anything below it).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``taxon_name`` matches the pattern.
    """
    # Regex alternation: a literal "/" OR the rank prefix followed by "_".
    # This is the same pattern the `filtering` wrapper in this notebook uses.
    pattern = f"/|{taxa_class}_"

    # Match against the pattern rather than the bare prefix letter: matching
    # on "p" alone would keep every name that merely contains that character.
    # na=False treats missing names as non-matching instead of yielding an
    # NA-containing mask that cannot be used for boolean indexing.
    return df[df["taxon_name"].str.contains(pattern, regex=True, na=False)]

## Prevalence Filtering

Taxa that appear in fewer than 25% of all samples are **removed**.

This removes inconsistent, rare taxa.

Done via pandas' DataFrame handling.

In [4]:
def prevalence_filter(df, threshold = 0.25):
    """Drop taxa that are present in too few samples.

    Every column after the first is treated as a sample column. A taxon (row)
    counts as present in a sample when its value there is greater than zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column is skipped and whose remaining columns hold
        per-sample values.
    threshold : float, default 0.25
        Minimum fraction of samples a taxon must be present in to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` present in at least ``threshold * n_samples`` samples.
    """
    # Everything except the first column is per-sample data.
    sample_values = df.iloc[:, 1:]

    # Minimum number of samples a taxon has to show up in.
    min_samples = threshold * sample_values.shape[1]

    # Per row: in how many samples is the value strictly positive?
    present_in = sample_values.gt(0).sum(axis=1)

    return df[present_in >= min_samples]

## Low Abundance Filtering

Taxa whose total abundance across all samples falls below a chosen quantile (`proportion`) of the per-taxon totals are **removed**.

This also removes inconsistent, rare taxa.

Done via pandas' DataFrame handling.

In [5]:
def abundance_filter(df, proportion = 0.01):
    """Drop the taxa with the lowest total abundance.

    Every column after the first is summed per row to get a total per taxon;
    rows whose total is below the ``proportion`` quantile of those totals are
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column is skipped and whose remaining columns hold
        per-sample abundances.
    proportion : float, default 0.01
        Quantile (between 0 and 1) of the per-taxon totals used as the cut-off.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose total is at or above the cut-off.
    """
    # Total abundance of each taxon over all sample columns.
    taxon_totals = df.iloc[:, 1:].sum(axis=1)

    # Cut-off value at the requested quantile of those totals.
    cutoff = taxon_totals.quantile(proportion)

    return df[taxon_totals.ge(cutoff)]

## Wrapper

The three filters above are combined into a single function.

In [6]:
def filtering(df, taxa_class = "p", threshold = 0.001, proportion = 0.005):
    """Apply the taxonomic, prevalence and low-abundance filters in sequence.

    The first column of ``df`` is skipped for the numeric steps; every other
    column is treated as a sample. A short report of how many rows each step
    removed is printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string ``taxon_name`` column followed by sample columns.
    taxa_class : str, default "p"
        Rank prefix letter interpolated into the taxon-name regex.
    threshold : float, default 0.001
        Minimum fraction of samples a taxon must be non-zero in.
    proportion : float, default 0.005
        Quantile of the per-taxon totals below which taxa are dropped.

    Returns
    -------
    pandas.DataFrame
        The rows that survive all three filters.
    """
    starting_count = len(df)
    print(f"Starting size is {starting_count}")

    # --- Step 1: taxonomic filter -------------------------------------------
    # Regex alternation: a literal "/" OR the rank prefix followed by "_".
    pattern = f"/|{taxa_class}_"
    name_matches = df["taxon_name"].str.contains(pattern)
    df = df[name_matches]

    rows_after_taxon = len(df)
    taxon_filter_count = starting_count - rows_after_taxon
    print(f"Total rows removed for taxon filtering: {taxon_filter_count}")

    # --- Step 2: prevalence filter ------------------------------------------
    sample_values = df.iloc[:, 1:]

    # Number of samples in which each taxon has a strictly positive value.
    present_in = (sample_values > 0).sum(axis = 1)

    # Required number of samples: `threshold` as a fraction of all samples.
    min_samples = threshold * sample_values.shape[1]
    df = df[present_in >= min_samples]

    rows_after_prevalence = len(df)
    prevalence_filter_count = rows_after_taxon - rows_after_prevalence
    print(f"Total rows removed for prevalence filtering: {prevalence_filter_count}")

    # --- Step 3: low-abundance filter ---------------------------------------
    # Per-taxon total over all samples, cut at the `proportion` quantile.
    taxon_totals = df.iloc[:, 1:].sum(axis=1)
    cutoff = taxon_totals.quantile(proportion)
    cleaned_data = df[taxon_totals >= cutoff]

    final_count = len(cleaned_data)
    total_removed = rows_after_prevalence - final_count
    print(f"Total rows removed for abundance filtering: {total_removed}")

    print(f"The final rows remaining are: {final_count}")

    return cleaned_data

In [7]:
# Applying filter
flowcell_ids = ["V107", "V142", "E975"]
flowcell_df = {}

for i, flowcell_id in enumerate(flowcell_ids):

    # `filtering` skips the first column positionally and reads "taxon_name"
    # as a column, so the index is moved into the columns first (presumably
    # the taxon names live in the index of the imported tables - confirm).
    # The guard keeps this cell safe to re-run: resetting an already-reset
    # table would prepend an extra "index" column and break the positional
    # slicing inside `filtering`.
    if "taxon_name" not in dataset[i].columns:
        dataset[i] = dataset[i].reset_index()

    flowcell_df[flowcell_id] = filtering(dataset[i])


Starting size is 4624
Total rows removed for taxon filtering: 3
Total rows removed for prevalence filtering: 1562
Total rows removed for abundance filtering: 15
The final rows remaining are: 3044
Starting size is 4624
Total rows removed for taxon filtering: 3
Total rows removed for prevalence filtering: 2062
Total rows removed for abundance filtering: 12
The final rows remaining are: 2547
Starting size is 4624
Total rows removed for taxon filtering: 3
Total rows removed for prevalence filtering: 547
Total rows removed for abundance filtering: 20
The final rows remaining are: 4054


#### Level Filtering

Here we create a function to filter our dataset based on what level we want. We will be using **phylum** level for now to get a general picture of the microbiome composition.

We would also want to assign our `taxon_name` as index to ease our downstream process.

In [8]:
def level_filtering(filtering, df):
    """Collapse a taxon table to one taxonomic rank, summing values per taxon.

    Parameters
    ----------
    filtering : str
        Rank name, case-insensitive: one of "kingdom", "phylum", "class",
        "order", "family", "genus" or "species".
    df : pandas.DataFrame
        Table with a string ``taxon_name`` column; all other columns are
        summed within each group. The input is left unmodified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct name found at the requested rank, indexed by
        ``taxon_name``, holding the column-wise sums. Rows of ``df`` whose
        name does not contain the requested rank are dropped.

    Raises
    ------
    ValueError
        If ``filtering`` is not one of the known rank names.
    """
    # Rank name -> prefix token used inside taxon_name strings.
    level_map = {
        "kingdom": "k__",
        "phylum": "p__",
        "class": "c__",
        "order": "o__",
        "family": "f__",
        "genus": "g__",
        "species": "s__"
    }

    level = level_map.get(filtering.lower())
    if level is None:
        # Without this check the regex below would silently become "None(...)".
        raise ValueError(
            f"Unknown taxonomic level {filtering!r}; expected one of {sorted(level_map)}"
        )

    # Capture the text after the rank prefix up to the next "|" separator;
    # names that do not contain the prefix become NaN.
    rank_names = df["taxon_name"].str.extract(fr'{level}([^|]+)', expand=False)

    # Work on a new frame instead of overwriting the caller's column, so the
    # function can be called repeatedly on the same data with the same result.
    relabelled = df.assign(taxon_name=rank_names)

    # Drop rows that have no name at the requested rank.
    filtered_df = relabelled.dropna(subset=["taxon_name"])

    # Sum all rows that share the same rank name.
    return filtered_df.groupby("taxon_name").sum()

In [9]:
# Collapse every filtered flowcell table to phylum level, keyed by flowcell id.
level_filtered_df = {
    flowcell_id: level_filtering("phylum", data)
    for flowcell_id, data in flowcell_df.items()
}

In [10]:
# Display the level-filtered table stored for flowcell "V107".
level_filtered_df["V107"]

Unnamed: 0_level_0,179supA00011a,179supA00012a,179supA00013a,179supA00014a,179supA00015a,179supA00016a,179supA00021a,179supA00022a,179supA00023a,179supA00024a,...,179supD00619a,179supD00620a,179supD00621a,179supD00622a,FF02556036,FF02556045,FF02556056,FF02556063,FF02556074,FF02556082
taxon_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Actinobacteria,139.33832,17.75688,57.40064,66.67967,59.92014,41.4121,53.9798,60.00328,81.93832,64.95976,...,20.46947,0.0,29.57402,21.8806,186.9583,42.07434,160.1147,84.37029,82.2412,31.03692
Bacillota,0.96866,0.0,0.87843,0.0,0.11431,0.27895,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.95326,0.0,0.59815,1.77856,0.29806,0.91787
Bacteria_unclassified,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.04935,0.05908,0.0,0.03962,0.0,0.0434
Bacteroidota,41.47697,82.54599,71.25377,119.44721,37.33268,44.14071,106.1726,105.83091,176.07432,128.71435,...,24.85133,0.0,98.13083,37.4815,62.51763,53.4642,113.92636,156.12505,47.05514,48.78449
Candidatus_Melainabacteria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Euryarchaeota,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.71647,4.89615
Firmicutes,425.20088,440.51066,479.49896,315.58971,534.23975,562.76773,520.58996,492.78964,396.49513,467.32163,...,443.4791,700.0,416.87546,239.04025,283.01856,389.95076,293.25592,321.70049,284.32779,367.073
Lentisphaerae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Proteobacteria,71.19931,148.96325,77.96613,181.51616,54.56997,27.06305,3.46101,32.62042,19.04809,31.16208,...,172.22695,0.0,111.12451,332.9991,65.05415,119.39585,55.52279,33.84425,133.14798,93.64663
Synergistetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,16.19548,0.0,0.0,0.0,0.0


## Grouping data based on weeks

Now that our data is separated by flowcell ID (e.g. V350218107) and filtered accordingly, we want to further group the samples by their respective week.

This allows us to gain more insight into how the absolute abundances of microorganisms change over time since the initial fecal sample.

In [11]:
# Maps flowcell id -> {sample_description: [sample_id, ...]}.
samples_by_week = {}

for idx in range(len(metadata_set)):

    metadata = metadata_set[idx]

    # Sample columns present in this flowcell's level-filtered table.
    sample_columns = level_filtered_df[flowcell_ids[idx]].columns

    # Keep only the metadata rows for those samples, and only the two
    # columns needed to build the grouping.
    in_table = metadata["sample_id"].isin(sample_columns)
    week_lookup = metadata.loc[in_table, ["sample_id", "sample_description"]]

    # One list of sample ids per sample_description value.
    ids_by_description = (
        week_lookup
        .groupby("sample_description")["sample_id"]
        .apply(list)
        .to_dict()
    )

    samples_by_week[flowcell_ids[idx]] = ids_by_description
    

In [12]:
# Handling duplicates to be included into the same week: the sample ids listed
# under each "_b" key are appended to the matching "_a" key, then the "_b" key
# is removed.
# NOTE: not re-runnable as-is - a second run raises KeyError because the "_b"
# keys are already gone.
for flowcell, kept_key, merged_key in (
    ("V107", "cul_wk6_a", "cul_wk6_b"),
    ("V142", "cul_wk1_a", "cul_wk1_b"),
):
    weeks = samples_by_week[flowcell]
    weeks[kept_key].extend(weeks[merged_key])
    del weeks[merged_key]

In [13]:
# Inspect the sample ids grouped under "cul_wk4_a" for flowcell "V107".
samples_by_week["V107"]["cul_wk4_a"]

['179supA00041a',
 '179supA00042a',
 '179supA00043a',
 '179supA00044a',
 '179supA00045a',
 '179supA00046a',
 '179supB00047a',
 '179supB00048a',
 '179supB00049a',
 '179supB00410a',
 '179supB00411a',
 '179supD00419a',
 '179supD00420a',
 '179supD00421a',
 '179supD00422a']

In [14]:
# For every flowcell and every week, slice the level-filtered table down to
# the sample columns that belong to that week:
# main_dataset[flowcell][week] -> DataFrame with only that week's samples.
main_dataset = {
    flowcell: {
        week: level_filtered_df[flowcell][sample_ids]
        for week, sample_ids in weeks.items()
    }
    for flowcell, weeks in samples_by_week.items()
}

In [15]:
# Resorting duplicates dataset: for the two merged weeks, put the sample
# columns back into sorted order.
for flowcell, week in (("V107", "cul_wk6_a"), ("V142", "cul_wk1_a")):
    merged_week = main_dataset[flowcell][week]
    main_dataset[flowcell][week] = merged_week.reindex(sorted(merged_week.columns), axis=1)

In [16]:
# Reordering them

# For V107: move the last (key, table) pair to the front, keeping the rest in
# their existing order.
# NOTE: this is a rotation, so re-running the cell shifts the order again.
v107_items = list(main_dataset["V107"].items())
main_dataset["V107"] = dict([v107_items[-1], *v107_items[:-1]])

# For E975: rebuild the dict in an explicit key order.
E975_order = ['fecal_slurry', 'cul_Day4', 'cul_Day8', 'cul_Day12']
e975_tables = main_dataset["E975"]
main_dataset["E975"] = dict((key, e975_tables[key]) for key in E975_order)