# Goal

* Create a phyloseq R object for downstream analyses

# Setting variables

In [7]:
import os
# Root directory of the final dataset; every other path below is derived from it.
baseDir = '/home/seq_data/fullCyc2/amplicon/515F-806R/final_dataset'

# Input files: OTU table (BIOM), sample metadata, and taxonomy assignments.
OTUDir = os.path.join(baseDir, 'OTU_binning')
biomFile = os.path.join(OTUDir, 'otu_table.biom')
metadataFile = os.path.join(baseDir, 'FullCyc2_master_metadata.txt')
taxonomyDir = os.path.join(OTUDir, 'otusn_tax')
taxonomyFile = os.path.join(taxonomyDir, 'otusn_tax_assignments.txt')

# Output location: directory and file name for the saved phyloseq object.
workDir = os.path.join(baseDir, 'phyloseq')
outphyseq = 'fullcyc2physeq.RDS'

# Init

In [2]:
import json
import pandas as pd


In [3]:
%load_ext rpy2.ipython
#%load_ext pushnote

In [4]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)
library(phyloseq)

Attaching package: ‘dplyr’



    filter, lag



    intersect, setdiff, setequal, union




In [5]:
if not os.path.isdir(workDir):
    print("Working directory does not exist. Making it now:")
    os.makedirs(workDir)
%cd $workDir

/home/seq_data/fullCyc2/amplicon/515F-806R/final_dataset/phyloseq


# Loading files

In [8]:
%%R -i biomFile -i metadataFile

# Read the OTU table from the BIOM file into a phyloseq object.
physeq <- import_biom(biomFile)

# Read the sample metadata and merge it into the phyloseq object.
sample.data <- import_qiime_sample_data(metadataFile)
physeq <- merge_phyloseq(physeq, sample.data)

# Keep the sample metadata as it exists on the merged object.
physeq.m <- sample_data(physeq)

# Summarise the merged object, then preview the first metadata rows.
print(physeq)
head(sample.data)

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 30808 taxa and 2295 samples ]
sample_data() Sample Data:       [ 2295 samples by 38 sample variables ]
Sample Data:        [6 samples by 38 sample variables]:
                                             X.Sample primer_number   exp_type
EL.A.151026                               EL.A.151026           129       bulk
CC.M.151027.12C-Van.D02       CC.M.151027.12C-Van.D02           129 Enrichment
MF.M.151026.12C-Ami.D02       MF.M.151026.12C-Ami.D02           129 Enrichment
MR.M.13C-Xyl.D3.R1_Frac21   MR.M.13C-Xyl.D3.R1_Frac21           129        SIP
MR.F.13C-Cel.D30.R3_Frac11 MR.F.13C-Cel.D30.R3_Frac11           129        SIP
MR.M.13C-Ami.D6.R1_Frac26   MR.M.13C-Ami.D6.R1_Frac26           129        SIP
                           extraction_plate_ID       library_ID
EL.A.151026                           14012016    Chantal_Pool9
CC.M.151027.12C-Van.D02            Enr2_160509   fullCyc2_lib14
MF.M.151026.12C-Ami.D0

In [9]:
%%R -i taxonomyFile
# Build the taxonomy table and attach it to the phyloseq object.
#
# The assignments file is read as headerless, tab-separated text. Only the
# first two columns are used: V1 (renamed to OTU) and V2 (the taxonomy string).
# Rank prefixes matching "D_.__" are stripped from the string before it is
# split on ";" into the seven named ranks.
tax.df <- read.csv(taxonomyFile, sep = "\t", header = FALSE) %>%
    select(OTU = V1, V2) %>%
    mutate(V2 = gsub("D_.__", "", V2)) %>%
    separate(V2,
             into = c("Domain", "Phylum", "Class", "Order",
                      "Family", "Genus", "Species"),
             sep = ";")

# Use the OTU identifiers as row names, then drop the OTU column so that only
# the rank columns go into the character matrix handed to tax_table().
rownames(tax.df) <- tax.df$OTU
tax.df <- tax_table(as.matrix(tax.df[, names(tax.df) != "OTU"]))

tax_table(physeq) <- tax.df
physeq

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 30808 taxa and 2295 samples ]
sample_data() Sample Data:       [ 2295 samples by 38 sample variables ]
tax_table()   Taxonomy Table:    [ 30808 taxa by 7 taxonomic ranks ]


In [56]:
%%R -i workDir -i outphyseq
# Serialise the phyloseq object to an RDS file inside the working directory.
outFile <- file.path(workDir, outphyseq)
saveRDS(object = physeq, file = outFile)