In [55]:
using CSV
using DataFrames
using XLSX
using Statistics
using Glob

# Generic Helper Function

In [56]:
"""
    load_response(aug)

Read every `*.csv` file found in the response directory for `aug`
(`../processed-data/response/<aug>/`) and return the tables as a vector of
`DataFrame`s, in the order `glob` lists the files.
"""
function load_response(aug)
    csv_paths = glob("*.csv", "../processed-data/response/$aug/")
    # One DataFrame per response file.
    return map(p -> DataFrame(CSV.File(p)), csv_paths)
end

load_response (generic function with 1 method)

In [57]:
"""
    load_generic_data(data, level)

Load every `*.csv` file of the predictor set `data` into a vector of `DataFrame`s.

The files are read from `../processed-data/<data>/original/<level>`, or from
`../processed-data/<data>/original` when `level` is the string `"null"`.
"""
function load_generic_data(data, level)
    # "null" is the sentinel for data sets that have no per-level sub-directory.
    path = level != "null" ? "../processed-data/$data/original/$level" :
           "../processed-data/$data/original"
    csv_paths = glob("*.csv", path)
    return map(p -> DataFrame(CSV.File(p)), csv_paths)
end

load_generic_data (generic function with 1 method)

In [58]:
"""
    drop_miss(file)

Return a copy of `file` without the rows that contain missing values and without
its first column, with the `:Link_ID` column renamed to `:Column1`.
"""
function drop_miss(file)
    complete_rows = completecases(file)
    # Keep the complete rows only and discard the first column in one indexing step.
    trimmed = file[complete_rows, Not(1)]
    # `rename!` returns the table it modified.
    return rename!(trimmed, :Link_ID => :Column1)
end

drop_miss (generic function with 1 method)

In [59]:
"""
    select_otu(otu, res, res_idx, level, feature_sel, score)

Pick the columns of `otu` whose entry in the score table `feature_sel` equals `score`,
row-normalize them with `normalize`, and return them with the identifiers of `otu`
prepended as `:Column1`.

# Arguments
- `otu`: table whose first column holds the identifiers and whose other columns are
  addressed by feature name.
- `res`, `level`: accepted so the signature stays compatible; they are not read here.
- `res_idx`: chooses the score column of `feature_sel`; 1 to 5 map to `B` to `F`, any
  other value maps to `G`.
- `feature_sel`: score table; its first column holds the feature names.
- `score`: score a feature must have to be selected. When `score == 0`, at most as many
  features are kept as there are features with score 3 (the first rows are taken).
"""
function select_otu(otu, res, res_idx, level, feature_sel, score)
    # Map the response index onto its score column; unknown indices fall back to :G.
    score_columns = (:B, :C, :D, :E, :F)
    slot = findfirst(==(res_idx), 1:5)
    score_col = slot === nothing ? :G : score_columns[slot]

    selected_feature = filter(row -> row[score_col] == score, feature_sel)

    if score == 0
        # Cap the score-0 selection at the number of score-3 features.
        three_num = size(filter(row -> row[score_col] == 3, feature_sel), 1)
        if three_num <= size(selected_feature, 1)
            feature_name = selected_feature[1:three_num, 1]
        else
            feature_name = selected_feature[:, 1]
        end
    else
        feature_name = selected_feature[:, 1]
    end

    ###################################################################################
    # WARNING: VERY WEIRD FEATURE NAME CHANGE, WILL REMOVE AFTER ADDRESSING THE ISSUE #
    ###################################################################################
    # Drop a leading "X" when the next character has code point 46-57
    # ('.', '/', or a digit '0'-'9'). Only the first "X" is removed so that any
    # later "X" in the name is preserved, and names shorter than two characters
    # are skipped instead of raising a BoundsError.
    for i in eachindex(feature_name)
        name = feature_name[i]
        if name isa AbstractString && ncodeunits(name) >= 2 &&
           name[1] == 'X' && (Int(name[2]) in 46:57)
            feature_name[i] = replace(name, "X" => ""; count=1)
        end
    end
    ####################################################################################

    id = otu[:, 1]
    otu = otu[:, feature_name]
    otu = convert.(Float64, otu)
    otu = normalize(otu)
    # hcat with a plain vector names the new column :x1; rename it to the join key.
    otu = hcat(id, otu)
    rename!(otu, :x1 => :Column1)
    return otu
end

select_otu (generic function with 1 method)

In [60]:
"""
    normalize(otu)

Divide every entry of `otu` by the sum of its row, in place, and return `otu`.

Rows whose sum is zero are filled with zeros instead of being divided. The row sum is
computed once, before any entry of that row is overwritten.
"""
function normalize(otu)
    n_rows, n_cols = size(otu, 1), size(otu, 2)
    for i in 1:n_rows
        # Must be taken before the row is modified below.
        row_sum = sum(otu[i, :])
        for j in 1:n_cols
            otu[i, j] = row_sum == 0 ? 0 : otu[i, j] / row_sum
        end
    end
    return otu
end

normalize (generic function with 1 method)

In [61]:
"""
    restruct_data(level)

Read `../raw-data/Y1_F_<level>.csv` and reshape it so that the values of its second
column become column names and its columns from the third onwards become rows,
identified by `:Column1`.

Rows whose second column equals `"NA"` are dropped first, and rows of the reshaped
table that contain missing values are dropped at the end.
"""
function restruct_data(level)
    raw = CSV.read("../raw-data/Y1_F_$level.csv", DataFrame)
    raw = raw[raw[:, 2] .!= "NA", :]

    ###################################################################################
    # WARNING: VERY WEIRD FEATURE NAME CHANGE, WILL REMOVE AFTER ADDRESSING THE ISSUE #
    ###################################################################################
    # Each of these substrings in the second column is replaced by ".".
    punctuation = ("-", " ", "(", ")", "/", "[", "]")
    for i in 1:size(raw, 1), mark in punctuation
        raw[i, 2] = replace(raw[i, 2], mark => ".")
    end
    ####################################################################################

    # Long format: one row per (original column, id) pair ...
    wide = raw[:, 3:end]
    value_columns = names(wide)
    wide[!, :id] = raw[:, 2]
    long = stack(wide, value_columns)

    # ... then spread the ids back out as columns.
    reshaped = unstack(long, :variable, :id, :value, allowduplicates=true)
    rename!(reshaped, :variable => :Column1)
    return reshaped[completecases(reshaped), :]
end

restruct_data (generic function with 1 method)

In [62]:
"""
    aug_norm(otu)

Row-normalize every column of `otu` except the first one and return the result with
the first column put back in front under the name `:Column1`.
"""
function aug_norm(otu)
    sample_ids = otu[:, 1]
    counts = convert.(Float64, otu[:, Not(1)])
    counts = normalize(counts)
    # hcat with a plain vector names the new column :x1.
    combined = hcat(sample_ids, counts)
    rename!(combined, :x1 => :Column1)
    return combined
end

aug_norm (generic function with 1 method)

# Augmented OTU

In [63]:
"""
    load_augment_otu(level, response, res_idx)

Load every `*.csv` OTU table under `../processed-data/all_otu_augmented/full-data/<level>`
and the augmented response file named `response`, then hand each OTU table to
`process_augment_otu` together with its position in the file list and `res_idx`.
"""
function load_augment_otu(level, response, res_idx)
    otu_tables = map(
        p -> DataFrame(CSV.File(p)),
        glob("*.csv", "../processed-data/all_otu_augmented/full-data/$level"),
    )

    response_table = CSV.read("../processed-data/response/augmented/$response.csv", DataFrame)
    for (norm_idx, table) in enumerate(otu_tables)
        process_augment_otu(level, table, response_table, norm_idx, res_idx)
    end
end

load_augment_otu (generic function with 1 method)

In [64]:
"""
    process_augment_otu(level, otu, res, norm_idx, res_idx)

Split the response table `res` into a training part (rows 1 to 800) and a test part
(rows 801 onwards), join each part with `otu` on `:Column1`, drop the first column, and
write both results as header-less CSV files named `<norm_idx>_<res_idx>_train.csv` and
`<norm_idx>_<res_idx>_test.csv` under
`../processed-data/all_otu_augmented/train-test-split/<level>/`.

The two parts are disjoint: the test part starts at the row after the last training row.
"""
function process_augment_otu(level, otu, res, norm_idx, res_idx)
    train_rows = 800
    train_res = res[1:train_rows, :]
    # Start one past the last training row so no row is shared between the splits.
    test_res = res[(train_rows + 1):end, :]

    train_data = innerjoin(otu, train_res, on = :Column1)
    test_data = innerjoin(otu, test_res, on = :Column1)

    # remove the sample ID
    train_mat = Matrix(train_data[:, Not(1)])
    test_mat = Matrix(test_data[:, Not(1)])

    # write the data to CSV files with their specified names
    filename1 = string(norm_idx, "_", res_idx, "_", "train")
    filename2 = string(norm_idx, "_", res_idx, "_", "test")
    CSV.write("../processed-data/all_otu_augmented/train-test-split/$level/$filename1.csv", Tables.table(train_mat), header=false)
    return CSV.write("../processed-data/all_otu_augmented/train-test-split/$level/$filename2.csv", Tables.table(test_mat), header=false)
end

process_augment_otu (generic function with 1 method)

# selected OTU
Read all raw OTU counts -> restructure them -> select features by Rosa's selection table -> normalize by row sum.

In [65]:
"""
    process(otu, res, res_idx, level, feature_sel, score)

Select and normalize the OTU columns with `select_otu`, join them with the response
table `res` on `:Column1`, drop the first column, and write the result as a header-less
CSV file `1_<res_idx>.csv` under
`../processed-data/otu_selection/<score>/<level>/full-data/`.
"""
function process(otu, res, res_idx, level, feature_sel, score)
    selected = select_otu(otu, res, res_idx, level, feature_sel, score)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(selected, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(1, "_", res_idx)
    return CSV.write(
        "../processed-data/otu_selection/$score/$level/full-data/$filename.csv",
        Tables.table(payload),
        header=false,
    )
end

process (generic function with 1 method)

In [66]:
"""
    load_otu(level, score)

Load the feature score sheet named `level` (columns A to G, no header row), the
restructured raw OTU counts for `level`, and all non-augmented responses, then call
`process` once per response with the given `score`.
"""
function load_otu(level, score)
    # Important-feature table: one sheet per level.
    score_sheet = XLSX.readtable(
        "../processed-data/otu_selection/important_features_score.xlsx",
        "$level", "A:G", header=false,
    )
    feature_selection = DataFrame(score_sheet)

    # Raw OTU counts, reshaped.
    otu = restruct_data(level)

    # Every response table; its position doubles as the response index.
    responses = load_response("non_augmented")
    for (res_idx, res) in enumerate(responses)
        process(otu, res, res_idx, level, feature_selection, score)
    end
end

load_otu (generic function with 1 method)

# All OTU -> non-augmented
Load all filtered, normalized OTU tables prepared by Rosa -> join with response -> save.

In [67]:
"""
    load_all_OTU(level, aug)

Load every `*.csv` OTU table for `level` from the `all_otu_augmented` directory when
`aug == "augmented"` and from `all_otu_non_augmented` otherwise, load the responses for
`aug`, and call `process_all_otu` for every (OTU table, response) combination.
"""
function load_all_OTU(level, aug)
    otu_path = aug == "augmented" ?
               "../processed-data/all_otu_augmented/original/$level" :
               "../processed-data/all_otu_non_augmented/original/$level"

    otu_tables = map(p -> DataFrame(CSV.File(p)), glob("*.csv", otu_path))
    responses = load_response(aug)

    for i in eachindex(otu_tables), j in eachindex(responses)
        process_all_otu(otu_tables[i], responses[j], i, j, level, aug)
    end
end

load_all_OTU (generic function with 1 method)

In [68]:
"""
    process_all_otu(otu, res, otu_idx, res_idx, level, aug)

Join `otu` with the response table `res` on `:Column1`, drop the first column, and write
the result as a header-less CSV file `<otu_idx>_<res_idx>.csv`. The file goes under
`all_otu_augmented/<level>/full-data/` when `aug == "augmented"` and under
`all_otu_non_augmented/<level>/full-data/` otherwise.
"""
function process_all_otu(otu, res, otu_idx, res_idx, level, aug)
    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(otu, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(otu_idx, "_", res_idx)
    target = aug == "augmented" ?
             "../processed-data/all_otu_augmented/$level/full-data/$filename.csv" :
             "../processed-data/all_otu_non_augmented/$level/full-data/$filename.csv"
    return CSV.write(target, Tables.table(payload), header=false)
end

process_all_otu (generic function with 1 method)

# All alpha diversity indices
load all 5 levels of alpha diversity indices with 7 scaling methods -> join with response -> write to CSV

In [69]:
"""
    alpha_process(alpha, res, alpha_idx, res_idx, level)

Clean `alpha` with `drop_miss`, join it with the response table `res` on `:Column1`,
drop the first column, and write the result as a header-less CSV file
`<alpha_idx>_<res_idx>.csv` under `../processed-data/alpha_index_data/<level>/full-data/`.
"""
function alpha_process(alpha, res, alpha_idx, res_idx, level)
    cleaned = drop_miss(alpha)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(cleaned, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(alpha_idx, "_", res_idx)
    return CSV.write(
        "../processed-data/alpha_index_data/$level/full-data/$filename.csv",
        Tables.table(payload),
        header=false,
    )
end

alpha_process (generic function with 1 method)

In [70]:
"""
    load_alpha(level)

Load all alpha diversity tables for `level` and all non-augmented responses, then call
`alpha_process` for every (alpha table, response) combination.
"""
function load_alpha(level)
    alpha_tables = load_generic_data("alpha_index_data", level)
    responses = load_response("non_augmented")

    # The positions in both lists become part of the output file name.
    for i in eachindex(alpha_tables), j in eachindex(responses)
        alpha_process(alpha_tables[i], responses[j], i, j, level)
    end
end

load_alpha (generic function with 1 method)

# Soil Chemistry & disease suppression
load soil chemistry and disease suppression with 6 scaling methods -> join with responses -> write to CSV

In [71]:
"""
    other_process(file, res, pred_idx, res_idx, pred)

Clean `file` with `drop_miss`, join it with the response table `res` on `:Column1`, drop
the first column, and write the result as a header-less CSV file
`<pred_idx>_<res_idx>.csv` under `../processed-data/<pred>/full-data/`.
"""
function other_process(file, res, pred_idx, res_idx, pred)
    cleaned = drop_miss(file)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(cleaned, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(pred_idx, "_", res_idx)
    return CSV.write(
        "../processed-data/$pred/full-data/$filename.csv",
        Tables.table(payload),
        header=false,
    )
end

other_process (generic function with 1 method)

In [72]:
"""
    load_other(pred)

Load all tables of the predictor set `pred` (no level sub-directory) and all
non-augmented responses, then call `other_process` for every (table, response)
combination.
"""
function load_other(pred)
    tables = load_generic_data(pred, "null")
    responses = load_response("non_augmented")

    # The positions in both lists become part of the output file name.
    for i in eachindex(tables), j in eachindex(responses)
        other_process(tables[i], responses[j], i, j, pred)
    end
end

load_other (generic function with 1 method)

# Soil Chemistry + Disease Suppression

In [73]:
"""
    load_soil_disease()

Load the soil chemistry tables, the disease suppression tables, and all non-augmented
responses, then call `process_soil_disease` for every response with the soil and disease
tables that share the same position in their lists.
"""
function load_soil_disease()
    soil_tables = load_generic_data("soil_chemistry_data", "null")
    disease_tables = load_generic_data("disease_suppression_data", "null")
    responses = load_response("non_augmented")

    # Tables are paired by position; the soil list drives the iteration.
    for i in eachindex(soil_tables), j in eachindex(responses)
        process_soil_disease(soil_tables[i], disease_tables[i], responses[j], i, j)
    end
end

load_soil_disease (generic function with 1 method)

In [74]:
"""
    process_soil_disease(file1, file2, res, pred_idx, res_idx)

Clean `file1` and `file2` with `drop_miss`, join both with the response table `res` on
`:Column1`, drop the first column, and write the result as a header-less CSV file
`<pred_idx>_<res_idx>.csv` under `../processed-data/soil_disease/full-data/`.
"""
function process_soil_disease(file1, file2, res, pred_idx, res_idx)
    first_clean = drop_miss(file1)
    second_clean = drop_miss(file2)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(first_clean, second_clean, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(pred_idx, "_", res_idx)
    return CSV.write(
        "../processed-data/soil_disease/full-data/$filename.csv",
        Tables.table(payload),
        header=false,
    )
end

process_soil_disease (generic function with 1 method)

# Alpha Diversity Indices + Soil Chemistry

In [75]:
"""
    load_alpha_soil(level)

Load the alpha diversity tables for `level`, the soil chemistry tables, and all
non-augmented responses, then call `process_alpha_soil` for every response with the
alpha and soil tables that share the same position in their lists.
"""
function load_alpha_soil(level)
    alpha_tables = load_generic_data("alpha_index_data", level)
    soil_tables = load_generic_data("soil_chemistry_data", "null")
    responses = load_response("non_augmented")

    # Tables are paired by position; the soil list drives the iteration.
    for i in eachindex(soil_tables), j in eachindex(responses)
        process_alpha_soil(alpha_tables[i], soil_tables[i], responses[j], i, j, level)
    end
end

load_alpha_soil (generic function with 1 method)

In [76]:
"""
    process_alpha_soil(alpha, file, res, pred_idx, res_idx, level)

Clean `alpha` and `file` with `drop_miss`, join both with the response table `res` on
`:Column1`, drop the first column, and write the result as a header-less CSV file
`<pred_idx>_<res_idx>.csv` under `../processed-data/alpha_soil/<level>/full-data/`.
"""
function process_alpha_soil(alpha, file, res, pred_idx, res_idx, level)
    alpha_clean = drop_miss(alpha)
    file_clean = drop_miss(file)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(alpha_clean, file_clean, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(pred_idx, "_", res_idx)
    return CSV.write(
        "../processed-data/alpha_soil/$level/full-data/$filename.csv",
        Tables.table(payload),
        header=false,
    )
end

process_alpha_soil (generic function with 1 method)

# Alpha Diversity Indices + Soil Chemistry + Disease Suppression

In [77]:
"""
    load_alpha_soil_disease(level)

Load the alpha diversity tables for `level`, the soil chemistry tables, the disease
suppression tables, and all non-augmented responses, then call
`process_alpha_soil_disease` for every response with the three tables that share the
same position in their lists.
"""
function load_alpha_soil_disease(level)
    alpha_tables = load_generic_data("alpha_index_data", level)
    soil_tables = load_generic_data("soil_chemistry_data", "null")
    disease_tables = load_generic_data("disease_suppression_data", "null")
    responses = load_response("non_augmented")

    # Tables are paired by position; the soil list drives the iteration.
    for i in eachindex(soil_tables), j in eachindex(responses)
        process_alpha_soil_disease(
            alpha_tables[i], soil_tables[i], disease_tables[i], responses[j], i, j, level,
        )
    end
end

load_alpha_soil_disease (generic function with 1 method)

In [78]:
"""
    process_alpha_soil_disease(alpha, file1, file2, res, pred_idx, res_idx, level)

Clean `alpha`, `file1` and `file2` with `drop_miss`, join all three with the response
table `res` on `:Column1`, drop the first column, and write the result as a header-less
CSV file `<pred_idx>_<res_idx>.csv` under
`../processed-data/alpha_soil_disease/<level>/full-data/`.
"""
function process_alpha_soil_disease(alpha, file1, file2, res, pred_idx, res_idx, level)
    alpha_clean = drop_miss(alpha)
    first_clean = drop_miss(file1)
    second_clean = drop_miss(file2)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(alpha_clean, first_clean, second_clean, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(pred_idx, "_", res_idx)
    return CSV.write(
        "../processed-data/alpha_soil_disease/$level/full-data/$filename.csv",
        Tables.table(payload),
        header=false,
    )
end

process_alpha_soil_disease (generic function with 1 method)

# OTU-Score=3 + Soil & OTU-Score=3 + disease

In [79]:
"""
    load_otu_other(data_name, level)

Load the feature score sheet named `level` (columns A to G, no header row), the
restructured raw OTU counts for `level`, the tables of the predictor set `data_name`,
and all non-augmented responses, then call `otu_other_process` for every
(predictor table, response) combination.
"""
function load_otu_other(data_name, level)
    # Important-feature table: one sheet per level.
    score_sheet = XLSX.readtable(
        "../processed-data/otu_selection/important_features_score.xlsx",
        "$level", "A:G", header=false,
    )
    feature_selection = DataFrame(score_sheet)

    # Raw OTU counts (reshaped) and the extra predictor tables.
    otu = restruct_data(level)
    tables = load_generic_data(data_name, "null")

    responses = load_response("non_augmented")

    for i in eachindex(tables), j in eachindex(responses)
        otu_other_process(
            otu, tables[i], responses[j], i, j, level, feature_selection, data_name,
        )
    end
end

load_otu_other (generic function with 1 method)

In [80]:
"""
    otu_other_process(otu, file, res, pred_idx, res_idx, level, feature_sel, dataName)

Select the OTU features with score 3 via `select_otu`, clean `file` with `drop_miss`,
join both with the response table `res` on `:Column1`, drop the first column, and write
the result as a header-less CSV file `<pred_idx>_<res_idx>.csv`. The file goes under
`otu_soil/<level>/full-data/` when `dataName == "soil_chemistry_data"` and under
`otu_disease/<level>/full-data/` for any other `dataName`.
"""
function otu_other_process(otu, file, res, pred_idx, res_idx, level, feature_sel, dataName)
    selected = select_otu(otu, res, res_idx, level, feature_sel, 3)
    cleaned = drop_miss(file)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(selected, cleaned, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(pred_idx, "_", res_idx)
    target = dataName == "soil_chemistry_data" ?
             "../processed-data/otu_soil/$level/full-data/$filename.csv" :
             "../processed-data/otu_disease/$level/full-data/$filename.csv"
    return CSV.write(target, Tables.table(payload), header=false)
end

otu_other_process (generic function with 1 method)

# OTU-Score=3 + Soil + Disease

In [81]:
"""
    load_otu_soil_disease(level)

Load the feature score sheet named `level` (columns A to G, no header row), the
restructured raw OTU counts for `level`, the soil chemistry tables, the disease
suppression tables, and all non-augmented responses, then call `combined_process` for
every response with the soil and disease tables that share the same position.
"""
function load_otu_soil_disease(level)
    # Important-feature table: one sheet per level.
    score_sheet = XLSX.readtable(
        "../processed-data/otu_selection/important_features_score.xlsx",
        "$level", "A:G", header=false,
    )
    feature_selection = DataFrame(score_sheet)

    # Raw OTU counts (reshaped) and the two extra predictor sets.
    otu = restruct_data(level)
    soil_tables = load_generic_data("soil_chemistry_data", "null")
    disease_tables = load_generic_data("disease_suppression_data", "null")

    responses = load_response("non_augmented")

    # Tables are paired by position; the soil list drives the iteration.
    for i in eachindex(soil_tables), j in eachindex(responses)
        combined_process(
            otu, soil_tables[i], disease_tables[i], responses[j], i, j, level,
            feature_selection,
        )
    end
end

load_otu_soil_disease (generic function with 1 method)

In [82]:
"""
    combined_process(otu, file1, file2, res, pred_idx, res_idx, level, feature_sel)

Select the OTU features with score 3 via `select_otu`, clean `file1` and `file2` with
`drop_miss`, join all of them with the response table `res` on `:Column1`, drop the
first column, and write the result as a header-less CSV file `<pred_idx>_<res_idx>.csv`
under `../processed-data/otu_soil_disease/<level>/full-data/`.
"""
function combined_process(otu, file1, file2, res, pred_idx, res_idx, level, feature_sel)
    selected = select_otu(otu, res, res_idx, level, feature_sel, 3)
    first_clean = drop_miss(file1)
    second_clean = drop_miss(file2)

    # Join on the sample ID, then discard it (first column) before export.
    joined = innerjoin(selected, first_clean, second_clean, res, on = :Column1)
    payload = Matrix(joined[:, Not(1)])

    filename = string(pred_idx, "_", res_idx)
    return CSV.write(
        "../processed-data/otu_soil_disease/$level/full-data/$filename.csv",
        Tables.table(payload),
        header=false,
    )
end

combined_process (generic function with 1 method)

In [83]:
# Level names passed to the loaders below.
all_level = [
    "Phylum",
    "Class",
    "Order",
    "Family",
    "Genus",
];

# All loading function calls:

In [84]:
########################################################################
#                         Load OTU selection                           #
########################################################################

# Build the selected-OTU files for every level and every score from 0 to 3.
for level in all_level, score in 0:3
    load_otu(level, score)
end

In [35]:
########################################################################
#                            Load all OTU                              #
########################################################################
# Join the non-augmented OTU tables with the responses, one level at a time.
for level in all_level
    load_all_OTU(level, "non_augmented")
end

In [51]:
########################################################################
#                      Load Alpha Diversity Indices                    #
########################################################################
# Join the alpha diversity tables with the responses, one level at a time.
for level in all_level
    load_alpha(level)
end

In [52]:
########################################################################
#             Load Soil chemistry and disease suppression              #
########################################################################
# Same pipeline for both predictor sets, soil chemistry first.
for pred in ("soil_chemistry_data", "disease_suppression_data")
    load_other(pred)
end

In [19]:
########################################################################
#                    Load Full OTU for augmented                       #
########################################################################
# This cell rebinds `all_level` to four levels ("Genus" is not in the list).
all_level = ["Phylum", "Class", "Order", "Family"]
all_res = [
    "no_tuber_scab",
    "no_tuber_scabpit",
    "no_tuber_scabsuper",
    "yield_per_meter",
    "yield_per_plant",
    "pctg_black_scurf",
]

# Build the train/test files for every level and every response; the position of a
# response in `all_res` is passed along as its index.
for level in all_level, (res_idx, res_name) in enumerate(all_res)
    load_augment_otu(level, res_name, res_idx)
end

In [54]:
########################################################################
#                         Load combinations                            #
########################################################################
# This cell rebinds `all_level` to the full five-level list.
all_level = ["Phylum", "Class", "Order", "Family", "Genus"]

# soil + disease (no level)
load_soil_disease()

# per-level combinations: alpha + soil, alpha + soil + disease,
# OTU + soil + disease, OTU + soil, OTU + disease
for level in all_level
    load_alpha_soil(level)
    load_alpha_soil_disease(level)
    load_otu_soil_disease(level)
    load_otu_other("soil_chemistry_data", level)
    load_otu_other("disease_suppression_data", level)
end

# Testing Zone