In [60]:
using CSV
using DataFrames
using XLSX
using Statistics
using Glob

In [68]:
"""
    process(otu, res, res_idx, level, feature_sel, score)

Reduce `otu` to the features selected for one response, normalize each sample,
join with the response table `res`, and write the result to a header-less CSV
under `../processed-data/otu_data/non_augumented/<score>/<level>/full-data/`.

# Arguments
- `otu`: table whose first column holds the sample ID and whose remaining
  columns are indexed by feature name.
- `res`: response table; it is joined on a column named `Column1`.
- `res_idx`: index of the response. Values 1..5 select score columns `B`..`F`
  of `feature_sel`; any other value selects column `G`.
- `level`: used only to build the output path.
- `feature_sel`: feature-score table; column 1 holds the feature names and
  columns `B`..`G` hold the per-response scores.
- `score`: rows of `feature_sel` whose score column equals this value are kept.

# Returns
The vector of selected feature names (after the leading-`X` clean-up below).
"""
function process(otu, res, res_idx, level, feature_sel, score)
    # Responses 1..5 map to score columns B..F; everything else falls back to G.
    score_col = res_idx in 1:5 ? (:B, :C, :D, :E, :F)[Int(res_idx)] : :G
    selected_feature = filter(row -> row[score_col] == score, feature_sel)

    feature_name = selected_feature[:, 1]
    ###################################################################################
    # WARNING: VERY WEIRD FEATURE NAME CHANGE, WILL REMOVE AFTER ADDRESSING THE ISSUE #
    ###################################################################################
    # Drop a leading 'X' when the following character has code 46..57
    # (i.e. '.', '/', or a digit '0'..'9'). Only the prefix is removed: the
    # previous `replace(name, "X" => "")` also deleted every other "X" in the
    # name. Names shorter than two characters are left untouched instead of
    # raising a BoundsError.
    for i in eachindex(feature_name)
        name = feature_name[i]
        if length(name) >= 2 && name[1] == 'X' && (Int(name[2]) in 46:57)
            # 'X' is a single-byte character, so byte index 2 starts the next one.
            feature_name[i] = name[2:end]
        end
    end
    ####################################################################################

    id = otu[:, 1]
    counts = convert.(Float64, otu[:, feature_name])
    counts = normalize(counts)
    # Put the sample ID back as the first column under the join-key name,
    # without relying on an auto-generated `:x1` column name.
    insertcols!(counts, 1, :Column1 => id)

    # join the otus and responses by sample ID
    data = innerjoin(counts, res, on = :Column1)

    # remove the sample ID
    data = data[:, Not(1)]
    # write the data to a CSV file with its specified name
    mat = Matrix(data)
    filename = string(1, "_", res_idx)
    CSV.write("../processed-data/otu_data/non_augumented/$score/$level/full-data/$filename.csv", Tables.table(mat), header=false)

    return feature_name
end

process (generic function with 1 method)

In [66]:
"""
    normalize(otu)

Divide every entry of `otu` by the sum of its row, in place, and return `otu`.

`otu` must support `size`, two-index `getindex`/`setindex!`, and `sum(otu[i, :])`,
and must be able to store the floating-point quotients. Rows whose sum is zero
are left unchanged so they do not turn into `NaN` (0/0).
"""
function normalize(otu)
    nrows, ncols = size(otu)
    for i in 1:nrows
        # Computed once per row, before any entry of that row is overwritten.
        row_sum = sum(otu[i, :])
        # An all-zero row has no defined proportions; keep it as zeros.
        iszero(row_sum) && continue
        for j in 1:ncols
            otu[i, j] = otu[i, j] / row_sum
        end
    end
    return otu
end

normalize (generic function with 1 method)

In [24]:
"""
    restruct_data(level)

Read the raw CSV for `level` from `../raw-data/` and return it transposed: the
values of the file's second column become column names, and each original
column from the third onward becomes one row whose name is stored in `Column1`.

Rows whose second column equals `"NA"` are dropped before reshaping, and rows
of the reshaped table that contain missing values are dropped at the end.
"""
function restruct_data(level)
    raw = CSV.read("../raw-data/Y1_F_$level.csv", DataFrame)
    raw = raw[raw[:, 2] .!= "NA", :]

    ###################################################################################
    # WARNING: VERY WEIRD FEATURE NAME CHANGE, WILL REMOVE AFTER ADDRESSING THE ISSUE #
    ###################################################################################
    # Replace each of - space ( ) / [ ] in the name column with a dot.
    for i in 1:size(raw, 1)
        raw[i, 2] = replace(raw[i, 2], r"[- ()/\[\]]" => ".")
    end
    ####################################################################################

    # Transpose via long format: stack the value columns, then spread by name.
    values_df = raw[:, 3:size(raw, 2)]
    value_cols = names(values_df)
    values_df[!, :id] = raw[:, 2]
    long = stack(values_df, value_cols)
    wide = unstack(long, :variable, :id, :value)
    rename!(wide, :variable => :Column1)

    return wide[completecases(wide), :]
end

restruct_data (generic function with 1 method)

In [25]:
"""
    load_otu(level, score)

Load the feature-score sheet for `level`, the restructured OTU table, and every
response CSV in `../processed-data/response`, then call `process` once per
response. The response index passed to `process` is the position of the file
in the list returned by `glob`.
"""
function load_otu(level, score)
    # feature-score table: sheet named after the level, columns A..G, no header row
    score_table = XLSX.readtable(
        "../processed-data/important_features_score.xlsx", "$level", "A:G"; header=false
    )
    feature_selection = DataFrame(score_table)

    # raw OTU counts, one row per sample
    otu = restruct_data(level)

    # every response table found on disk
    response_files = glob("*.csv", "../processed-data/response")
    response = [DataFrame(CSV.File(path)) for path in response_files]

    # one output CSV per response
    for (j, res) in enumerate(response)
        process(otu, res, j, level, feature_selection, score)
    end
end

load_otu (generic function with 1 method)

In [26]:
"""
    alpha_process(alpha, res, alpha_idx, res_idx, level)

Join one alpha-index table with one response table and write the result as a
header-less CSV named `<alpha_idx>_<res_idx>.csv` under the `level` output
folder. Returns whatever `CSV.write` returns.

The first column of `alpha` is discarded, `Link_ID` is renamed to `Column1` to
serve as the join key, and the first two columns of the joined table are
dropped before writing.
"""
function alpha_process(alpha, res, alpha_idx, res_idx, level)
    # work on a copy without the first column; the caller's table is not modified
    predictors = rename!(alpha[:, Not(1)], :Link_ID => :Column1)

    # match predictors and responses by sample ID
    joined = innerjoin(predictors, res, on = :Column1)

    # drop the first two columns (column 1 is the join key), keep the rest
    mat = Matrix(joined[:, Not(1:2)])

    filename = string(alpha_idx, "_", res_idx)
    return CSV.write("../processed-data/alpha_index_data/non_augumented/$level/full-data/$filename.csv",
        Tables.table(mat), header=false)
end

alpha_process (generic function with 1 method)

In [27]:
"""
    load_alpha(level)

Read every alpha-index CSV for `level` and every response CSV, then call
`alpha_process` for each (alpha, response) pair. Indices passed on are the
positions of the files in the lists returned by `glob`.
"""
function load_alpha(level)
    alpha_files = glob("*.csv", "../processed-data/alpha_index_data/original/$level")
    alpha = [DataFrame(CSV.File(path)) for path in alpha_files]

    # load all responses
    response_files = glob("*.csv", "../processed-data/response")
    response = [DataFrame(CSV.File(path)) for path in response_files]

    # alpha tables in the outer loop, responses in the inner loop
    for (i, alpha_table) in enumerate(alpha), (j, res) in enumerate(response)
        alpha_process(alpha_table, res, i, j, level)
    end
end

load_alpha (generic function with 1 method)

In [28]:
"""
    other_process(file, res, pred_idx, res_idx, pred)

Join one predictor table with one response table and write the result as a
header-less CSV named `<pred_idx>_<res_idx>.csv` under the output folder of
predictor family `pred`. Returns whatever `CSV.write` returns.

Rows of `file` containing missing values are removed first. Then the first
column is discarded, `Link_ID` is renamed to `Column1` to serve as the join
key, and the first two columns of the joined table are dropped before writing.
"""
function other_process(file, res, pred_idx, res_idx, pred)
    # keep complete rows only, then drop the first column (both steps copy)
    complete = file[completecases(file), :]
    predictors = rename!(complete[:, Not(1)], :Link_ID => :Column1)

    # match predictors and responses by sample ID
    joined = innerjoin(predictors, res, on = :Column1)

    # drop the first two columns (column 1 is the join key), keep the rest
    mat = Matrix(joined[:, Not(1:2)])

    filename = string(pred_idx, "_", res_idx)
    return CSV.write("../processed-data/$pred/non_augumented//full-data/$filename.csv",
        Tables.table(mat), header=false)
end

other_process (generic function with 1 method)

In [29]:
"""
    load_other(pred)

Read every CSV of predictor family `pred` and every response CSV, then call
`other_process` for each (predictor, response) pair. Indices passed on are the
positions of the files in the lists returned by `glob`.
"""
function load_other(pred)
    files = glob("*.csv", "../processed-data/$pred/original")
    data = [DataFrame(CSV.File(path)) for path in files]

    # load all responses
    response_files = glob("*.csv", "../processed-data/response")
    response = [DataFrame(CSV.File(path)) for path in response_files]

    # predictor tables in the outer loop, responses in the inner loop
    for (i, predictors) in enumerate(data), (j, res) in enumerate(response)
        other_process(predictors, res, i, j, pred)
    end
end

load_other (generic function with 1 method)

In [69]:
all_level = ["Phylum", "Class", "Order", "Family", "Genus"]

# Write the OTU files for every level, for each score value 1, 2 and 3.
for level in all_level, score in 1:3
    load_otu(level, score)
end

└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208
└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208
└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208
└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208
└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208
└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208


In [118]:
# Read one processed OTU file back in for inspection.
# NOTE(review): `process` in this file names its outputs "1_<res_idx>.csv", so
# "Order_2.csv" presumably comes from an earlier version of the pipeline —
# confirm the path before re-running this cell.
a = CSV.read("../processed-data/otu_data/non_augumented/1/Order/full-data/Order_2.csv", DataFrame);

In [88]:
# Write the alpha diversity index files for every taxonomic level.
foreach(load_alpha, all_level)

In [113]:
# Write the files for the remaining predictor families.
for pred in ("soil_chemistry_data", "disease_suppression_data")
    load_other(pred)
end

In [41]:
# Scratch check: dividing an Int by an Int with `/` yields a floating-point
# value (the cell output is 0.3333333333333333).
# Note: this rebinds the global `a` that an earlier cell assigned.
a = 1
typeof(a)
a = a/3

0.3333333333333333