In [35]:
import pandas as pd
import numpy as np
import os.path

wd = "$HOME/Bioinformatics_data/lferriphilum/analysis/RNA/00_reads_QC/01_Kraken2/single_sample_raw_reads/"
ext = ".report"

file_names = ["ERR2117291",
              "ERR2117290",
              "ERR2036632",
              "ERR2036631"]

In [40]:
data = {}
header = ["Percentage", "# of reads", "# of reads assigned directly", "Rank code", "Taxa-ID", "Scientific Name"]
dtypes = {"Percentage": np.float, "# of reads": np.int, "# of reads assigned directly": np.int, "Rank code": str, "Taxa-ID": np.int, "Scientific Name": str}

for file in file_names:
    print(f"Reading {file}... ", end="")
    data[file] = pd.read_csv(wd + file + ext, sep="\t", names=header, usecols=["Percentage", "Taxa-ID", "Scientific Name"], dtype=dtypes)
    print("Stripping whitespaces... ", end="")
    data[file]["Scientific Name"] = data[file]["Scientific Name"].str.strip()
    print("Dropping taxa with Percentage < 1%... ", end="")
    data[file] = data[file][data[file]['Percentage'] > 1]
    
    
    #result[result['Value'] > 10]  
    print("DONE!")


Reading ERR2117291... Stripping whitespaces... Dropping taxa with Percentage < 1%... DONE!
Reading ERR2117290... Stripping whitespaces... Dropping taxa with Percentage < 1%... DONE!
Reading ERR2036632... Stripping whitespaces... Dropping taxa with Percentage < 1%... DONE!
Reading ERR2036631... Stripping whitespaces... Dropping taxa with Percentage < 1%... DONE!


In [41]:
data["ERR2117291"]

Unnamed: 0,Percentage,Taxa-ID,Scientific Name
0,13.66,0,unclassified
1,86.34,1,root
2,69.25,131567,cellular organisms
3,68.8,2,Bacteria
4,51.25,40117,Nitrospirae
5,51.25,203693,Nitrospira
6,51.25,189778,Nitrospirales
7,51.25,189779,Nitrospiraceae
8,51.25,179,Leptospirillum
9,12.35,261385,Leptospirillum sp. Group II


In [5]:
data["ERR2117290"]

Unnamed: 0,Percentage,Taxa-ID,Scientific Name
0,19.15,0,unclassified
1,80.85,1,root
2,74.13,131567,cellular organisms
3,73.79,2,Bacteria
4,56.51,40117,Nitrospirae
5,56.51,203693,Nitrospira
6,56.51,189778,Nitrospirales
7,56.51,189779,Nitrospiraceae
8,56.51,179,Leptospirillum
9,13.64,261385,Leptospirillum sp. Group II


In [48]:
# merged = data["ERR2117291"]

merged = None

for label in data:
    print(label)
    if merged is None:
        merged = data[label]
        print(type(merged))
    else:
        merged = pd.merge(merged, data[label], how="outer", on="Taxa-ID")

merged

ERR2117291
<class 'pandas.core.frame.DataFrame'>
ERR2117290
ERR2036632
ERR2036631


Unnamed: 0,Percentage_x,Taxa-ID,Scientific Name_x,Percentage_y,Scientific Name_y,Percentage_x.1,Scientific Name_x.1,Percentage_y.1,Scientific Name_y.1
0,13.66,0,unclassified,19.15,unclassified,13.66,unclassified,19.15,unclassified
1,86.34,1,root,80.85,root,86.34,root,80.85,root
2,69.25,131567,cellular organisms,74.13,cellular organisms,69.25,cellular organisms,74.13,cellular organisms
3,68.8,2,Bacteria,73.79,Bacteria,68.8,Bacteria,73.79,Bacteria
4,51.25,40117,Nitrospirae,56.51,Nitrospirae,51.25,Nitrospirae,56.51,Nitrospirae
5,51.25,203693,Nitrospira,56.51,Nitrospira,51.25,Nitrospira,56.51,Nitrospira
6,51.25,189778,Nitrospirales,56.51,Nitrospirales,51.25,Nitrospirales,56.51,Nitrospirales
7,51.25,189779,Nitrospiraceae,56.51,Nitrospiraceae,51.25,Nitrospiraceae,56.51,Nitrospiraceae
8,51.25,179,Leptospirillum,56.51,Leptospirillum,51.25,Leptospirillum,56.51,Leptospirillum
9,12.35,261385,Leptospirillum sp. Group II,13.64,Leptospirillum sp. Group II,12.35,Leptospirillum sp. Group II,13.64,Leptospirillum sp. Group II
