In [1]:
using CSV
using DataFrames
using XLSX
using Statistics
using Distributions
using Random
using Tables
using Glob

In [2]:
"""
    restruct_data(level)

Load `../raw-data/Y1_F_<level>.csv` and pivot it so that the values of its second
column become column names, while every original column from the third onward
becomes one row identified by a `Column1` column holding that column's name.

Rows whose second column equals `"NA"` are dropped before pivoting, and pivoted
rows that contain any `missing` value are dropped afterwards.
"""
function restruct_data(level)
    raw = CSV.read("../raw-data/Y1_F_$level.csv", DataFrame)
    kept = raw[raw[:, 2] .!= "NA", :]
    ids = kept[:, 2]

    # Everything from the third column onward is carried into the pivot.
    measurements = kept[:, 3:size(kept, 2)]
    measure_names = names(measurements)
    measurements[!, :id] = ids

    # Wide -> long -> wide again: rows are the former columns, columns are the ids.
    long = stack(measurements, measure_names)
    pivoted = unstack(long, :variable, :id, :value)
    rename!(pivoted, :variable => :Column1)

    return pivoted[completecases(pivoted), :]
end

restruct_data (generic function with 1 method)

In [3]:
"""
    split_variety(full_label, unique_name, var_idx)

Partition the rows of `full_label` by the value stored in column `var_idx`.

Returns a `Vector{Any}` with one entry per element of `unique_name`, in the same
order; entry `k` holds the rows whose `var_idx` column equals `unique_name[k]`.
"""
function split_variety(full_label, unique_name, var_idx)
    # `Any[...]` keeps the element type of the result untyped, whatever the
    # type of the row subsets turns out to be.
    return Any[
        full_label[full_label[:, var_idx] .== name, :] for name in unique_name
    ]
end

split_variety (generic function with 1 method)

In [9]:
"""
    write(otu, res, var, level, resid; target_per_class = 500)

Join `otu`, `res` and `var` on `:Column1`, pad both label classes with synthetic
rows, and save the result to
`../processed-data/augumented_otu_count/<level>-<resid>.csv`.

The last column of the joined table is treated as the variety and the column
before it as the label (`0` = low, `1` = high). Each class is split by variety
with `split_variety`, and `add_gen` appends synthetic rows until the class
reaches `target_per_class` rows. A class that already has at least that many
rows receives no synthetic rows. The variety column is removed before writing.

# Arguments
- `otu`, `res`, `var`: tables sharing a `:Column1` key column.
- `level`, `resid`: interpolated into the output file name.

# Keywords
- `target_per_class`: number of rows each label class should reach after
  augmentation (defaults to `500`).

Returns whatever `CSV.write` returns.
"""
function write(otu, res, var, level, resid; target_per_class = 500)
    data = innerjoin(otu, res, var; on = :Column1)
    var_idx = size(data, 2)
    label_idx = var_idx - 1

    # Row subsets are copies, so rows appended to `data` below do not change
    # the class sizes measured here.
    low = data[data[:, label_idx] .== 0, :]
    high = data[data[:, label_idx] .== 1, :]

    low_sub = split_variety(low, unique(low[:, var_idx]), var_idx)
    high_sub = split_variety(high, unique(high[:, var_idx]), var_idx)

    # A non-positive count yields an empty loop inside `add_gen`.
    num_low_gen = target_per_class - size(low, 1)
    num_high_gen = target_per_class - size(high, 1)

    add_gen(low_sub, num_low_gen, data)
    add_gen(high_sub, num_high_gen, data)

    data = data[:, Not(var_idx)]
    return CSV.write("../processed-data/augumented_otu_count/$level-$resid.csv", data)
end

write (generic function with 1 method)

In [5]:
"""
    generate(data)

Build one synthetic row from `data` by perturbing a randomly chosen row.

For every column from the second up to the third-from-last, a noise value is
drawn from a `Normal` distribution parameterised by that column's mean and
variance (each divided by 1000) and added to the chosen row's value. Values that
are zero in the chosen row stay zero, negative results are clamped to zero, and
everything else is rounded.

Returns a `Vector{Any}` with one entry per column of `data`: a fixed placeholder
id, the perturbed values, and finally the last two values of the first row of
`data`, copied unchanged.
"""
function generate(data)
    nrows, ncols = size(data)
    template = data[rand(1:nrows), :]

    synthetic = Any["BULI_BULI_114514"]
    for col in 2:(ncols - 2)
        noise_mean = mean(data[:, col]) / 1000
        noise_spread = var(data[:, col]) / 1000

        # Fall back to a unit spread when the column has no usable variance
        # (constant column, or a single row).
        if noise_spread <= 0 || nrows == 1
            noise_spread = 1
        end
        # NOTE(review): `Normal` takes a standard deviation as its second
        # argument, while a scaled variance is passed here; confirm intent.
        noise = rand(Normal(noise_mean, noise_spread))

        base = template[col]
        push!(synthetic, (base == 0 || base + noise < 0) ? 0 : round(base + noise))
    end

    # The trailing two columns are taken from the first row as-is.
    append!(synthetic, data[1, ncols - 1])
    push!(synthetic, data[1, ncols])
    return synthetic
end

generate (generic function with 1 method)

In [6]:
"""
    add_gen(var_sub, num_gen, data)

Append `num_gen` synthetic rows to `data` in place.

Each row is produced by `generate` from an element of `var_sub` picked uniformly
at random. Nothing is appended when `num_gen` is zero or negative. Returns
`nothing`.
"""
function add_gen(var_sub, num_gen, data)
    for _ in 1:num_gen
        pick = rand(1:length(var_sub))
        push!(data, generate(var_sub[pick]))
    end
    return nothing
end

add_gen (generic function with 1 method)

In [10]:
"""
    main()

Run the augmentation pipeline end to end.

Restructures the raw table of every taxonomic level with `restruct_data`, loads
every CSV file under `../processed-data/response`, reads the variety metadata
from `../raw-data/metadata.csv`, and calls `write` once for each
(level, response file) pair. Response files are numbered from 1 in the order
returned by `glob`. Returns `nothing`.
"""
function main()
    all_level = ["Phylum", "Class", "Order","Family", "Genus"]
    full_data = Any[restruct_data(level) for level in all_level]

    response_files = glob("*.csv", "../processed-data/response")
    response = map(path -> DataFrame(CSV.File(path)), response_files)

    # Drop columns 2 through 11 of the metadata, keep the first two of what is
    # left, then discard rows that are incomplete or whose variety is "NA".
    variety = CSV.read("../raw-data/metadata.csv", DataFrame)
    variety = variety[:, Not(2:11)][:, 1:2]
    variety = variety[completecases(variety), :]
    variety = filter(row -> row.Variety2 != "NA", variety)

    for (level, otu) in zip(all_level, full_data)
        for (resid, res) in enumerate(response)
            write(otu, res, variety, level, resid)
        end
    end
    return nothing
end

main (generic function with 1 method)

In [11]:
# Entry point: run the pipeline defined by `main`.
main()

└ @ DataFrames /home/send_fuze/.julia/packages/DataFrames/GtZ1l/src/abstractdataframe/reshape.jl:208
└ @ DataFrames /home/send_fuze/.julia/packages/DataFrames/GtZ1l/src/abstractdataframe/reshape.jl:208
