In [37]:
using CSV
using DataFrames
using XLSX
using Statistics
using Glob

In [38]:
"""
    process(otu, res, otu_idx, res_idx, level, dis_feature, yie_feature, score)

Reduce `otu` to the features whose importance score for response `res_idx` equals
`score`, join them with `res`, and write the result as a header-less CSV file to
`../processed-data/otu_data/non_augumented/<score>/<level>/full-data/<otu_idx>_<res_idx>.csv`.

# Arguments
- `otu`: table whose first column is the sample ID; its remaining columns are selected
  by the feature names read from the feature tables.
- `res`: response table; it is joined on its `:Column1` column.
- `otu_idx`, `res_idx`: indices that form the output file name. `res_idx` also picks
  the feature table and the score column that are consulted.
- `level`: only used as a directory name in the output path.
- `dis_feature`, `yie_feature`: feature tables whose first column holds feature names
  and whose score columns are named `B`, `C`, ... .
- `score`: score a feature must have to be kept.

When `score == 0`, at most as many features are kept as there are features with
score 3 for the same response.

Returns whatever `CSV.write` returns.
"""
function process(otu, res, otu_idx, res_idx, level, dis_feature, yie_feature, score)
    # Response index -> (feature table, score column). Every index not listed here
    # falls back to column F of the yield table.
    # NOTE(review): index 5 reads column B of the yield table while 4 and the fallback
    # read E and F - presumably this mirrors the order of the response files; confirm.
    column_for = Dict(
        1 => (dis_feature, :B),
        2 => (dis_feature, :C),
        3 => (dis_feature, :D),
        4 => (yie_feature, :E),
        5 => (yie_feature, :B),
    )
    table, col = get(column_for, res_idx, (yie_feature, :F))
    rows_scored(value) = filter(row -> row[col] == value, table)

    matching = rows_scored(score)
    keep = nrow(matching)
    if score == 0
        # Cap the zero-score features at the number of score-3 features.
        keep = min(keep, nrow(rows_scored(3)))
    end
    feature_name = matching[1:keep, 1]

    # Put the sample ID back in front of the selected feature columns; the ID column
    # produced by hcat is named :x1 and is renamed so it can serve as the join key.
    sample_id = otu[:, 1]
    features = hcat(sample_id, otu[:, feature_name])
    rename!(features, :x1 => :Column1)

    # Join features and responses by sample ID, then drop the ID column.
    joined = innerjoin(features, res, on = :Column1)
    mat = Matrix(joined[:, Not(1)])

    filename = string(otu_idx, "_", res_idx)
    return CSV.write("../processed-data/otu_data/non_augumented/$score/$level/full-data/$filename.csv", Tables.table(mat), header=false)
end

process (generic function with 1 method)

In [39]:
"""
    load_otu(level, score)

Load the important-feature tables, the OTU tables and the response tables for `level`,
then call `process` once for every (OTU table, response table) pair.

The feature tables are read from the sheet named `level` of the two `.xlsx` workbooks
under `../processed-data`. OTU tables come from every `*.csv` file in
`../processed-data/otu_data/original/<level>` and responses from every `*.csv` file in
`../processed-data/response`. Tables are numbered in the order `glob` returns the
files, and those numbers are passed to `process` together with `level` and `score`.
"""
function load_otu(level, score)
    read_features(workbook, columns) =
        DataFrame(XLSX.readtable(workbook, "$level", columns, header=false))
    read_tables(dir) = [DataFrame(CSV.File(path)) for path in glob("*.csv", dir)]

    # Important-feature tables (disease: columns A-D, yield: columns A-F).
    dis_feature = read_features("../processed-data/disease_response_important_features.xlsx", "A:D")
    yie_feature = read_features("../processed-data/yield_response_important_features.xlsx", "A:F")

    # All OTU tables of this level and all response tables.
    otu_tables = read_tables("../processed-data/otu_data/original/$level")
    responses = read_tables("../processed-data/response")

    # Process every combination; each call writes one CSV file.
    for (i, otu_table) in enumerate(otu_tables), (j, response) in enumerate(responses)
        process(otu_table, response, i, j, level, dis_feature, yie_feature, score)
    end
    return nothing
end

load_otu (generic function with 1 method)

In [86]:
"""
    alpha_process(alpha, res, alpha_idx, res_idx, level)

Join the table `alpha` with the response table `res` and write the result as a
header-less CSV file to
`../processed-data/alpha_index_data/non_augumented/<level>/full-data/<alpha_idx>_<res_idx>.csv`.

The first column of `alpha` is dropped and its `:Link_ID` column is renamed to
`:Column1`, which is the join key. The first two columns of the joined table (the join
key and the column following it) are removed before writing. `alpha` itself is not
modified because the work is done on a copy.

Returns whatever `CSV.write` returns.
"""
function alpha_process(alpha, res, alpha_idx, res_idx, level)
    # Work on a copy without the first column, keyed by :Column1 like the responses.
    predictors = rename!(alpha[:, Not(1)], :Link_ID => :Column1)

    joined = innerjoin(predictors, res, on = :Column1)
    mat = Matrix(joined[:, Not(1:2)])

    filename = string(alpha_idx, "_", res_idx)
    return CSV.write("../processed-data/alpha_index_data/non_augumented/$level/full-data/$filename.csv",
        Tables.table(mat), header=false)
end

alpha_process (generic function with 1 method)

In [87]:
"""
    load_alpha(level)

Read every `*.csv` file in `../processed-data/alpha_index_data/original/<level>` and
every `*.csv` file in `../processed-data/response`, then call `alpha_process` for each
(table, response) pair. Tables are numbered in the order `glob` returns the files.
"""
function load_alpha(level)
    read_tables(dir) = [DataFrame(CSV.File(path)) for path in glob("*.csv", dir)]

    alpha_tables = read_tables("../processed-data/alpha_index_data/original/$level")
    responses = read_tables("../processed-data/response")

    # Each call writes one CSV file.
    for (i, alpha_table) in enumerate(alpha_tables), (j, response) in enumerate(responses)
        alpha_process(alpha_table, response, i, j, level)
    end
    return nothing
end

load_alpha (generic function with 1 method)

In [105]:
"""
    other_process(file, res, pred_idx, res_idx, pred)

Join the predictor table `file` with the response table `res` and write the result as
a header-less CSV file named `<pred_idx>_<res_idx>.csv` below
`../processed-data/<pred>/non_augumented`.

Rows of `file` that contain a missing value are discarded first. The first column is
then dropped and `:Link_ID` is renamed to `:Column1`, which is the join key. The first
two columns of the joined table (the join key and the column following it) are removed
before writing.

Returns whatever `CSV.write` returns.
"""
function other_process(file, res, pred_idx, res_idx, pred)
    # Keep only the rows without missing values.
    complete_rows = file[completecases(file), :]
    predictors = rename!(complete_rows[:, Not(1)], :Link_ID => :Column1)

    # Join predictors and responses by sample ID.
    joined = innerjoin(predictors, res, on = :Column1)
    mat = Matrix(joined[:, Not(1:2)])

    filename = string(pred_idx, "_", res_idx)
    return CSV.write("../processed-data/$pred/non_augumented//full-data/$filename.csv",
        Tables.table(mat), header=false)
end

other_process (generic function with 1 method)

In [106]:
"""
    load_other(pred)

Read every `*.csv` file in `../processed-data/<pred>/original` and every `*.csv` file
in `../processed-data/response`, then call `other_process` for each
(table, response) pair. Tables are numbered in the order `glob` returns the files.
"""
function load_other(pred)
    read_tables(dir) = [DataFrame(CSV.File(path)) for path in glob("*.csv", dir)]

    predictor_tables = read_tables("../processed-data/$pred/original")
    responses = read_tables("../processed-data/response")

    # Each call writes one CSV file.
    for (i, table) in enumerate(predictor_tables), (j, response) in enumerate(responses)
        other_process(table, response, i, j, pred)
    end
    return nothing
end

load_other (generic function with 1 method)

In [40]:
all_level = ["Phylum", "Class", "Order", "Family", "Genus"]

# Write the OTU data sets for every level in `all_level` and every score from 0 to 3.
for level in all_level, score in 0:3
    load_otu(level, score)
end

In [88]:
# Write the alpha diversity index data sets for every level in `all_level`.
foreach(load_alpha, all_level)

In [113]:
# Write the data sets for the remaining predictor folders.
for pred in ("soil_chemistry_data", "disease_suppression_data")
    load_other(pred)
end

In [114]:
#a = CSV.read("../processed-data/disease_suppression_data/non_augumented/full-data/1_1.csv", DataFrame, header=true)

#a[completecases(a), :]

Unnamed: 0_level_0,0.20243566481418684,-0.05166937418938376,2.0805408871782096,1.0
Unnamed: 0_level_1,Float64,Float64,Float64,Float64
1,-0.133313,-0.276311,2.20249,1.0
2,0.31251,0.0744208,2.09905,1.0
3,-0.0680612,-0.469369,2.03198,1.0
4,-0.119547,0.322787,1.89412,1.0
5,-0.1481,0.105263,2.00765,1.0
6,0.802748,0.412761,2.3174,1.0
7,0.0841525,-0.0724293,2.59411,1.0
8,0.82726,0.0228143,2.29037,1.0
9,0.396765,0.0054676,1.92636,1.0
10,0.459371,0.133622,1.91127,1.0
