# Make master metabolic tables
This notebook creates the master metabolic table for each gut fungus; the tables are used to assign gene reaction rules.
Each gene is matched to its annotation data, and the output is written to `{fungus short name}.tab`.

| Fungus name | Short name |
| --- | --- |
|Neocallimastix lanati | Neosp3 |
|Neocallimastix californiae | Neosp1 |
|Piromyces finnis | Pirfi3 |
|Anaeromyces robustus | Anasp1 |
|Piromyces sp. E2 | PirE2 |

In [1]:
include(joinpath("..", "src", "GSM.jl"))
import .GSM
using BioSequences

In [2]:
# Base directories of the omics inputs, all located under ../OmicsData
uniprotbase, genomicsbase, transcriptomicsbase = map(
    sub -> joinpath("..", "OmicsData", sub),
    ("Uniprot", "Genomics", "Transcriptomics"),
)

# Short name of the fungus to process -- change this for each fungus
fungus = "Neosp3"

# Import the reviewed UniProt table; the lookup is keyed by UniProt accession
uniprot = GSM.readuniprot(joinpath(uniprotbase, "uniprot-reviewed_yes.tab"))

Dict{Any,Any} with 562755 entries:
  "Q58660" => SubString{String}["Q58660", "6.3.5.3", "SUBUNIT: Monomer. Part of…
  "P67952" => SubString{String}["P67952", "", ""]
  "B4TRK5" => SubString{String}["B4TRK5", "1.7.2.2", ""]
  "P45552" => SubString{String}["P45552", "", ""]
  "Q6GIH2" => SubString{String}["Q6GIH2", "2.8.1.7", ""]
  "Q72HP1" => SubString{String}["Q72HP1", "2.8.1.4", ""]
  "P30085" => SubString{String}["P30085", "2.7.4.14; 2.7.4.6", "SUBUNIT: Monome…
  "Q28UW7" => SubString{String}["Q28UW7", "", "SUBUNIT: Monomer. {ECO:0000255|H…
  "Q6CG11" => SubString{String}["Q6CG11", "2.7.1.26", ""]
  "Q0KD78" => SubString{String}["Q0KD78", "", ""]
  "P17121" => SubString{String}["P17121", "", ""]
  "Q0CIL2" => SubString{String}["Q0CIL2", "2.7.1.17", ""]
  "C5BF12" => SubString{String}["C5BF12", "1.1.1.25", "SUBUNIT: Homodimer. {ECO…
  "Q3KFR5" => SubString{String}["Q3KFR5", "2.4.2.21", ""]
  "Q1LZF3" => SubString{String}["Q1LZF3", "", ""]
  "B1IGZ7" => SubString{String}["B1IGZ7", "3.1

In [54]:
# Pick the first file in the fungus' genomics directory whose name contains ".aa."
aafilename = filter(
    name -> occursin(".aa.", name), readdir(joinpath(genomicsbase, fungus))
)[1]
# NOTE(review): the meaning of the `true` flag is defined by GSM.readall -- confirm there
aaseqs = GSM.readall(joinpath(genomicsbase, fungus, aafilename), true)

# Gene <-> EC (KEGG) file from the JGI annotations: first file containing "_KEGG.tab"
ecfile = filter(
    name -> occursin("_KEGG.tab", name), readdir(joinpath(genomicsbase, fungus))
)[1]
# protein id => single EC number
ecdict = GSM.getECannos(joinpath(genomicsbase, fungus, ecfile))

Dict{Any,Any} with 2761 entries:
  "1047874" => "3.1.3.43"
  "1620923" => "3.6.3.8"
  "1683640" => "6.3.2.19"
  "1434712" => "3.4.24.-"
  "1433424" => "3.5.3.4"
  "1421468" => "2.7.7.7"
  "1305814" => "3.2.1.51"
  "1434448" => "2.3.2.5"
  "1743837" => "3.4.24.56"
  "1689749" => "6.3.2.19"
  "1727594" => "2.3.1.9"
  "1305965" => "2.6.1.2"
  "84182"   => "1.2.1.12"
  "1024431" => "4.6.1.1"
  "1070066" => "6.3.2.19"
  "1706103" => "3.1.4.11"
  "1294756" => "2.7.1.107"
  "1689833" => "6.3.2.19"
  "1634555" => "2.4.1.16"
  "1027070" => "3.5.2.9"
  "1075216" => "3.4.25.1"
  "1300847" => "4.2.2.2"
  "1279593" => "2.4.2.14"
  "1634035" => "3.4.21.61"
  "1636030" => "6.3.2.19"
  ⋮         => ⋮

In [55]:
# Nucleotide BLAST output (query = transcript, db = predicted genes) tells us whether a
# transcript was detected for each predicted gene; e-value and coverage criteria apply!
# The result maps protein id => matching transcripts.
# Hits are one-directional, i.e. "is there some transcriptomic evidence?", and are only
# loaded for the fungi listed here.
if fungus in ("Anasp1", "Neosp1", "Pirfi3", "Caecom1", "Neosp3")
    transcriptomic_evidence = GSM.getTranscriptomicEvidence(
        joinpath(transcriptomicsbase, "Results", "$(fungus).out")
    )
end

Dict{Any,Any} with 14361 entries:
  "1296960" => SubString{String}["TRINITY_DN121_c1_g1_i2.p1", "TRINITY_DN121_c1…
  "1420380" => SubString{String}["TRINITY_DN11528_c0_g2_i1.p1", "TRINITY_DN1152…
  "1659200" => SubString{String}["TRINITY_DN978_c3_g1_i4.p1"]
  "1297476" => SubString{String}["TRINITY_DN1534_c2_g1_i1.p1", "TRINITY_DN4216_…
  "1727543" => SubString{String}["TRINITY_DN1578_c0_g2_i1.p1"]
  "1716928" => SubString{String}["TRINITY_DN3881_c0_g1_i1.p1", "TRINITY_DN3881_…
  "1676042" => SubString{String}["TRINITY_DN1891_c0_g2_i1.p1"]
  "1693085" => SubString{String}["TRINITY_DN350_c17_g1_i1.p1"]
  "958981"  => SubString{String}["TRINITY_DN1623_c0_g1_i1.p1"]
  "206077"  => SubString{String}["TRINITY_DN5183_c1_g1_i1.p1"]
  "1024431" => SubString{String}["TRINITY_DN1478_c1_g1_i2.p1", "TRINITY_DN3881_…
  "1070066" => SubString{String}["TRINITY_DN1393_c0_g1_i1.p1", "TRINITY_DN1825_…
  "1305839" => SubString{String}["TRINITY_DN20649_c0_g1_i1.p1"]
  "1689833" => SubString{String}["TRINI

In [59]:
# Get expression levels - only available for N. lanati (expression on cellobiose).
# The path is built from `transcriptomicsbase` so that this cell follows the same base
# directory as the other transcriptomics inputs instead of hard-coding it a second time.
# NOTE(review): "M2" is passed straight to GSM.readexpressiondataS3; presumably it selects
# the condition/column to read from s3tpms.csv -- confirm against the GSM module.
if fungus == "Neosp3"
    expdata_s3 = GSM.readexpressiondataS3(
        joinpath(transcriptomicsbase, "S3-de-novo-diffexp", "s3tpms.csv"), "M2"
    )
end

Dict{Any,Any} with 23508 entries:
  "TRINITY_DN3153_c4_g1"  => 0.0
  "TRINITY_DN3309_c1_g1"  => 4.42438
  "TRINITY_DN15616_c1_g2" => 0.169714
  "TRINITY_DN2367_c5_g1"  => 0.103633
  "TRINITY_DN1167_c0_g1"  => 3.26975
  "TRINITY_DN3802_c1_g1"  => 4.52767
  "TRINITY_DN23810_c0_g1" => 10.2228
  "TRINITY_DN1041_c8_g1"  => 0.415954
  "TRINITY_DN10849_c3_g1" => 0.0634547
  "TRINITY_DN112_c0_g1"   => 284.323
  "TRINITY_DN28886_c4_g2" => 0.0841612
  "TRINITY_DN8875_c1_g1"  => 0.672092
  "TRINITY_DN3091_c1_g1"  => 0.970452
  "TRINITY_DN11853_c0_g2" => 0.0977764
  "TRINITY_DN1840_c0_g1"  => 6.56524
  "TRINITY_DN2489_c0_g2"  => 1.36729
  "TRINITY_DN334_c1_g2"   => 1.53306
  "TRINITY_DN58_c0_g2"    => 7.36782
  "TRINITY_DN814_c3_g1"   => 6.66907
  "TRINITY_DN26524_c1_g1" => 1.14712
  "TRINITY_DN34547_c0_g1" => 0.0
  "TRINITY_DN11790_c0_g1" => 0.0998832
  "TRINITY_DN854_c2_g2"   => 0.137639
  "TRINITY_DN3823_c1_g1"  => 0.134434
  "TRINITY_DN15448_c0_g2" => 0.0252932
  ⋮                       => ⋮

In [57]:
# Read the bidirectional BLAST output (predicted proteins <=> UniProt reviewed proteins);
# an e-value cutoff applies.
fungus_to_uniprot_loc = joinpath(uniprotbase, "Results", "$(fungus)_to_uni.out")
uniprot_to_fungus_loc = joinpath(uniprotbase, "Results", "uni_to_$(fungus).out")

# protein id => uniprot entry id
# NB: change the last argument 11 => 3 for genome
bidir = GSM.matchbidir(fungus_to_uniprot_loc, uniprot_to_fungus_loc, 1e-20, 11)

# protein id => UniProt record of its bidirectional hit
bidirdesc = Dict{Any,Any}()
for (proteinid, hit) in bidir
    # The lookup key is the second "|"-separated field of the hit id.
    # For genome: drop the split and index `uniprot` with `hit` directly.
    bidirdesc[proteinid] = uniprot[split(hit, "|")[2]]
end

In [62]:
# Write the Master Metabolic Annotation table as tab-separated text to "<fungus>.tab".
# One row is written per protein id in `bidirdesc`. The "Transcribed" column is only
# present for fungi with transcriptomic evidence, and "Expression [TPM]" only for Neosp3.
open(string(fungus, ".tab"), "w") do f
    # Decide once which optional columns apply to this fungus
    has_expression = fungus == "Neosp3"
    has_transcripts = has_expression || fungus in ("Anasp1", "Neosp1", "Pirfi3", "Caecom1")

    headings = ["Protein ID"]
    has_transcripts && push!(headings, "Transcribed")
    has_expression && push!(headings, "Expression [TPM]")
    append!(headings, ["JGI EC", "Bidir EC", "Bidir Subunit", "Uniprot hit"])
    write(f, join(headings, "\t"), "\n")

    for (proteinid, hit) in bidirdesc
        row = Any[proteinid]

        if has_transcripts
            # "Y" when at least one transcript matched this protein, otherwise "N"
            matched = get(transcriptomic_evidence, proteinid, [])
            push!(row, isempty(matched) ? "N" : "Y")
        end

        if has_expression
            if haskey(transcriptomic_evidence, proteinid)
                # Strip the last "_"-separated field of each transcript id to obtain the
                # key used in `expdata_s3`, then count every distinct key once.
                geneids = unique(
                    join(split(t, "_")[1:end-1], "_") for
                    t in transcriptomic_evidence[proteinid]
                )
                # Sum all isozymes; ids missing from the expression data contribute 0
                total = foldl(+, (get(expdata_s3, g, 0) for g in geneids); init=0.0)
                push!(row, string(total))
            else
                push!(row, "")
            end
        end

        # `hit` holds the UniProt record: [1] entry, [2] EC numbers, [3] subunit text
        push!(row, get(ecdict, proteinid, ""), hit[2], hit[3], hit[1])
        write(f, join(row, "\t"), "\n")
    end
end