In [56]:
using CSV
using DataFrames
using XLSX
using Statistics

In [57]:
# Read the five per-rank OTU tables (class, family, order, phylum, genus).
otu_paths = (
    "./raw-data/SCMP_Y1_class.csv",
    "./raw-data/SCMP_Y1_family.csv",
    "./raw-data/SCMP_Y1_order.csv",
    "./raw-data/SCMP_Y1_phylum.csv",
    "./raw-data/SCMP_Y1_genus.csv",
)

# Columns 2 through 6 of every table hold diversity values rather than OTU
# data, so each table is trimmed right after it is read.
otu_tables = map(otu_paths) do path
    table = CSV.read(path, DataFrame)
    return table[:, Not(2:6)]
end

# Keep the individual per-rank tables available under their usual names.
class, family, order, phylum, genus = otu_tables

# Merge every rank into one wide table keyed by Link_ID; only samples present
# in all five tables survive the inner join.
otu = innerjoin(otu_tables..., on = :Link_ID)
@info("Finish loading OTU data...")

┌ Info: Finish loading OTU data...
└ @ Main In[57]:17


In [58]:
# Read the "SCMP_Y1" sheet of the outcome workbook into a DataFrame.
outcome = DataFrame(XLSX.readtable("./raw-data/SCMP_Y1.xlsx", "SCMP_Y1")...)
# Keep only the ID, variety2 and Yield_per_plant columns. In the sheet these
# sit at positions 1, 11 and 30, so they are selected directly instead of
# dropping the surrounding column ranges step by step.
outcome = outcome[:, [1, 11, 30]]
@info("Finish loading outcome data for yield_per_plant")

┌ Info: Finish loading outcome data for yield_per_plant
└ @ Main In[58]:7


In [59]:
# Join the outcomes onto the OTU table by Link_ID; samples that appear in only
# one of the two tables are discarded by the inner join.
yield_per_plant = innerjoin(otu, outcome, on = :Link_ID)
# Remove all rows whose OTU data is "NA". OTU rows are never partially N/A, so
# checking a single OTU column (Alphaproteobacteria) is enough.
# `isequal` is used instead of `==` because `missing == "NA"` evaluates to
# `missing`, which `filter` rejects as a non-Boolean predicate result; rows
# holding `missing` are kept here and removed by `dropmissing` just below.
yield_per_plant = filter(:Alphaproteobacteria => x -> !isequal(x, "NA"), yield_per_plant)
# Remove every row that contains a `missing` value in any column.
yield_per_plant = dropmissing(yield_per_plant)
# Remove all rows whose Yield_per_plant is the literal string "NA".
yield_per_plant = filter(:Yield_per_plant => x -> !isequal(x, "NA"), yield_per_plant)
@info("Finish cleansing the missing data...")

┌ Info: Finish cleansing the missing data...
└ @ Main In[59]:8


In [60]:
# Turn the numeric yield into a binary label (1 = at or above the variety's
# threshold, 0 = below it), overwriting the yield column in place.
#
# Thresholds by variety:
#   Red, White, Yellow -> 600
#   Russet             -> 900
#   RedLittle          -> median yield of the RedLittle rows
#   anything else      -> median yield of the YellowLittle rows
#
# NOTE(review): every variety not listed above falls into the YellowLittle
# rule; confirm that YellowLittle is the only remaining variety in the data.

sample_size = size(yield_per_plant, 1)

# The join appended the two outcome columns last, so the variety is the
# second-to-last column and the yield is the last one. Deriving the positions
# from the frame width keeps this correct when the number of OTU columns
# changes.
yield_col = size(yield_per_plant, 2)
variety_col = yield_col - 1

# `df[!, col]` returns the stored column itself (no copy), so writing into
# `yields` below updates `yield_per_plant`.
varieties = yield_per_plant[!, variety_col]
yields = yield_per_plant[!, yield_col]

# Collect the yields of the two "little" varieties before any value is
# overwritten, because their medians serve as thresholds.
red_little = Float64[y for (v, y) in zip(varieties, yields) if v == "RedLittle"]
yellow_little = Float64[y for (v, y) in zip(varieties, yields) if v == "YellowLittle"]

# Compute each median once instead of once per row. An empty group has no
# median; that only matters if a row actually needs it, so it is deferred to
# the check inside the loop.
red_little_median = isempty(red_little) ? nothing : median(red_little)
yellow_little_median = isempty(yellow_little) ? nothing : median(yellow_little)

# Set the label with respect to the variety.
for i in 1:sample_size
    variety = varieties[i]
    threshold = if variety in ("Red", "White", "Yellow")
        600
    elseif variety == "Russet"
        900
    elseif variety == "RedLittle"
        red_little_median
    else
        yellow_little_median
    end
    threshold === nothing && throw(ArgumentError(
        "no yields available to compute a median threshold for variety \"$variety\""
    ))
    yields[i] = yields[i] >= threshold ? 1 : 0
end

In [61]:
# Remove the variety2 column. It is the second-to-last column (the label is
# the last one), so its position is derived from the frame width rather than
# hard-coded; this stays correct when the number of OTU columns changes.
# NOTE: run this cell only once per session - each run removes whatever
# column currently sits second-to-last.
variety_col = size(yield_per_plant, 2) - 1
yield_per_plant = yield_per_plant[:, Not(variety_col)]
# Write the OTU features plus the binary yield label to CSV.
CSV.write("./processed-data/otu-yield-per-plant.csv", yield_per_plant)

"./processed-data/otu-yield-per-plant.csv"