In [335]:
using CSV
using DataFrames
using XLSX
using Statistics
using Distributions
using Random
using Tables
using Glob

In [336]:
"""
    restruct_data(level)

Read `../raw-data/Y1_F_<level>.csv` and pivot it.

Rows whose second column equals "NA" are discarded. The columns from the third
one onward are then stacked and unstacked so that each of those original columns
becomes a row (its name stored in `Column1`) and each value of the second column
becomes a column. Rows containing any missing value are dropped from the result.
"""
function restruct_data(level)
    raw = CSV.read("../raw-data/Y1_F_$level.csv", DataFrame)
    raw = raw[raw[:, 2] .!= "NA", :]

    # Everything from the third column on is measurement data; the second
    # column supplies the identifiers that become the new column headers.
    measurements = raw[:, 3:size(raw, 2)]
    measure_cols = names(measurements)
    measurements[!, :id] = raw[:, 2]

    long = stack(measurements, measure_cols)
    wide = unstack(long, :variable, :id, :value)
    rename!(wide, :variable => :Column1)
    return wide[completecases(wide), :]
end

restruct_data (generic function with 1 method)

In [337]:
"""
    split_variety(full_label, unique_name, var_idx)

Return a `Vector{Any}` with one entry per element of `unique_name`: the rows of
`full_label` whose column `var_idx` equals that element, in the order given by
`unique_name`.
"""
function split_variety(full_label, unique_name, var_idx)
    return Any[
        full_label[full_label[:, var_idx] .== name, :] for name in unique_name
    ]
end

split_variety (generic function with 1 method)

In [338]:
"""
    push_otu(otu_data, select, id, rd)

Generate one synthetic row from the inner join of `otu_data` and `select` (on
`Column1`) and append it to `otu_data`.

`generate` is called with the joined table, the new `id`, and the row index
`rd`; its two trailing entries are dropped before the row is pushed.
"""
function push_otu(otu_data, select, id, rd)
    joined = disallowmissing!(innerjoin(otu_data, select, on = :Column1))
    synthetic = generate(joined, id, rd)
    # `generate` ends its output with the values of the joined table's last two
    # columns; those are not part of an `otu_data` row, so strip them.
    return push!(otu_data, synthetic[1:(end - 2)])
end

push_otu (generic function with 2 methods)

In [339]:
"""
    push_all_otu(otu, select, id)

Pick one random row index from the join of `otu[1]` with `select`, append a
synthetic row built from that index to every table in `otu`, and return the
`Column1` value of the picked row.
"""
function push_all_otu(otu, select, id)
    reference = disallowmissing!(innerjoin(otu[1], select, on = :Column1))
    rd = rand(1:size(reference, 1))
    orig_id = reference[rd, 1]
    # The same row index is reused for every table.
    # NOTE(review): this presumes each table joins to `select` in the same row
    # order as `otu[1]` does - confirm.
    foreach(table -> push_otu(table, select, id, rd), otu)
    return orig_id
end

push_all_otu (generic function with 1 method)

In [340]:
"""
    generate(data, id, rd)

Build one synthetic record from row `rd` of `data` and return it as a
`Vector{Any}`.

The result is laid out as:

1. `string(id)`;
2. one value for each column `2:(ncols - 2)`: `0` if the source value is zero or
   if adding the noise would make it negative, otherwise the rounded
   `source + noise`;
3. the values of the last two columns taken from the *first* row of `data`.

The noise for a column is one draw from `Normal(m, v)` with `m = mean / 1000`
and `v = var / 1000` of that column; `v` falls back to 1 when it is not positive
or when `data` has a single row.
"""
function generate(data, id, rd)
    nrows, ncols = size(data)
    original = data[rd, :]
    new_gen = Any[string(id)]
    for i in 2:(ncols - 2)
        column = data[:, i]
        m = mean(column) / 1000
        v = Statistics.var(column) / 1000
        # A single row has no defined variance, and a constant column has none.
        if v <= 0 || nrows == 1
            v = one(v)
        end
        # NOTE(review): `Normal(mu, sigma)` takes a standard deviation, whereas
        # `v` is a scaled variance - confirm this scale is intended.
        noise = rand(Normal(m, v))
        # `push!` (not `append!`) is used throughout: each value is one element,
        # and `append!` would iterate over it.
        if original[i] == 0
            push!(new_gen, 0)
        elseif original[i] + noise < 0
            push!(new_gen, 0)
        else
            push!(new_gen, round(original[i] + noise))
        end
    end
    push!(new_gen, data[1, ncols - 1])
    push!(new_gen, data[1, ncols])
    return new_gen
end

generate (generic function with 2 methods)

In [341]:
"""
    push_res(res, gen_res, orig_id, id)

Look up the rows of `res` whose `Column1` equals `orig_id`. If there are none,
return `nothing` and leave `gen_res` untouched. Otherwise append the record
`[string(id), label]` to `gen_res`, where `label` is the second field of the
first matching row.
"""
function push_res(res, gen_res, orig_id, id)
    matches = filter(row -> row.Column1 == orig_id, res)
    size(matches, 1) == 0 && return
    return push!(gen_res, Any[string(id), matches[1, 2]])
end

push_res (generic function with 2 methods)

In [342]:
"""
    push_all_res(all_res, all_gen_res, orig_id, id)

Call `push_res` for every position `i` of `all_res`, pairing `all_res[i]` with
`all_gen_res[i]`. Returns `nothing`.
"""
function push_all_res(all_res, all_gen_res, orig_id, id)
    foreach(eachindex(all_res)) do i
        push_res(all_res[i], all_gen_res[i], orig_id, id)
    end
    return nothing
end

push_all_res (generic function with 1 method)

In [343]:
"""
    update(otu, all_res, all_gen_res, var_sub, id)

Create one synthetic sample with identifier `id`.

A group is drawn uniformly at random from `var_sub`; `push_all_otu` appends a
synthetic row based on it to every table in `otu`, and `push_all_res` then
records the new `id` in `all_gen_res` using the source row's identifier.

Throws an `ArgumentError` when `var_sub` is empty.
"""
function update(otu, all_res, all_gen_res, var_sub, id)
    # Fail with a descriptive message instead of the generic one from `rand`.
    isempty(var_sub) &&
        throw(ArgumentError("var_sub must contain at least one group to sample from"))
    select = var_sub[rand(1:length(var_sub))]
    orig_id = push_all_otu(otu, select, id)
    return push_all_res(all_res, all_gen_res, orig_id, id)
end

update (generic function with 3 methods)

In [344]:
"""
    add_this_label(num_label, all_res, all_gen_res, id, otu, var_sub)

Call `update` `num_label` times, using consecutive identifiers starting at `id`,
and return the first identifier that was not used.
"""
function add_this_label(num_label, all_res, all_gen_res, id, otu, var_sub)
    next_id = id
    for _ in 1:num_label
        update(otu, all_res, all_gen_res, var_sub, next_id)
        next_id += 1
    end
    return next_id
end

add_this_label (generic function with 2 methods)

In [345]:
"""
    do_one_response(response, gen_response, var, id, idx; target = 500)

Top up response number `idx` with synthetic samples until each of the labels 0
and 1 has at least `target` rows, and return the next unused identifier.

`response[idx]` is joined with `var` on `Column1`; in the joined table the last
column is the grouping variable and the one before it is the label. The rows are
split by label (0 / 1) and then by grouping value, and those groups are what
synthetic samples are drawn from. The number of samples to add per label is
based on the label counts in the second column of `gen_response[idx]` at the
time of the call. Label 0 is filled first, then label 1.

Reads the global `otu` (it is not a parameter of this function).

Throws an `ArgumentError` if samples are needed for a label that has no rows in
the joined table.
"""
function do_one_response(response, gen_response, var, id, idx; target = 500)
    res = response[idx]
    new_res = gen_response[idx]
    res_var = innerjoin(res, var, on = :Column1)

    # split into label and var
    var_idx = size(res_var, 2)
    label_idx = var_idx - 1
    low = res_var[res_var[:, label_idx] .== 0, :]
    high = res_var[res_var[:, label_idx] .== 1, :]
    low_sub = split_variety(low, unique(low[:, var_idx]), var_idx)
    high_sub = split_variety(high, unique(high[:, var_idx]), var_idx)

    # Snapshot the labels before anything is added, and count each label
    # directly so the result does not depend on which label appears first or
    # on both labels being present.
    labels = new_res[:, 2]
    for (label, groups) in ((0, low_sub), (1, high_sub))
        deficit = target - count(==(label), labels)
        deficit > 0 || continue
        if isempty(groups)
            throw(ArgumentError(
                "response " * string(idx) * " needs " * string(deficit) *
                " more rows with label " * string(label) *
                " but has no rows with that label to sample from",
            ))
        end
        id = add_this_label(deficit, response, gen_response, id, otu, groups)
    end
    return id
end

do_one_response (generic function with 2 methods)

In [346]:
"""
    remove_label(gen_res; target = 500)

Trim `gen_res` so that neither label 0 nor label 1 (second column) occurs more
than `target` times, and return the trimmed table.

Surplus rows are dropped with `remove`. Tables that are empty or contain only
one of the two labels are handled; a label that is absent simply has no surplus.
"""
function remove_label(gen_res; target = 500)
    # Counts are taken once, up front; dropping rows of one label does not
    # change the count of the other.
    labels = gen_res[:, 2]
    for label in (0, 1)
        surplus = count(==(label), labels) - target
        if surplus > 0
            gen_res = remove(gen_res, label, surplus)
        end
    end
    return gen_res
end

remove_label (generic function with 1 method)

In [347]:
"""
    remove(gen_res, label, num)

Return `gen_res` without the last `num` rows whose second column equals `label`.

Rows are examined from the bottom up. If fewer than `num` matching rows exist,
all of them are dropped; if `num` is not positive or nothing matches, `gen_res`
is returned unchanged.
"""
function remove(gen_res, label, num)
    num > 0 || return gen_res

    # Mark rows to drop in a single bounded pass, then index once, instead of
    # rebuilding the table after every removed row.
    keep = trues(size(gen_res, 1))
    removed = 0
    for idx in size(gen_res, 1):-1:1
        removed == num && break
        if gen_res[idx, 2] == label
            keep[idx] = false
            removed += 1
        end
    end

    removed == 0 && return gen_res
    return gen_res[keep, :]
end

remove (generic function with 1 method)

In [348]:
"""
    write_otu(otu_data, level)

Write `otu_data` as CSV to the augmented OTU-count directory, using `level` as
the file name, and return whatever `CSV.write` returns.
"""
function write_otu(otu_data, level)
    destination = "../processed-data/otu_data/augumented_otu_count/$level.csv"
    return CSV.write(destination, otu_data)
end

write_otu (generic function with 1 method)

In [349]:
"""
    write_response(res_data, response)

Write `res_data` as CSV to the augmented response directory, using `response`
as the file name, and return whatever `CSV.write` returns.
"""
function write_response(res_data, response)
    destination = "../processed-data/response/augumented_response/$response.csv"
    return CSV.write(destination, res_data)
end

write_response (generic function with 1 method)

In [352]:
# reconstruct OTU data first and save them in OTU array
all_level = ["Phylum", "Class", "Order","Family", "Genus"]
all_response = ["no_tuber_scab", "no_tuber_scabpit", "no_tuber_scabsuper", "yield_per_meter", "yield_per_plant", "pctg_black_scurf"]
# `otu` has to stay a global: `do_one_response` reads it directly.
otu = Any[restruct_data(level) for level in all_level]

# save all responses into the response array
response_path = "../processed-data/response"
response_files = glob("*.csv", response_path)
response = DataFrame.(CSV.File.(response_files));

# Name every response after the file it was loaded from. The tables come back in
# directory-listing order, which is not the order of `all_response`, so pairing
# the two lists by position could save a table under another response's name.
response_names = [first(splitext(basename(file))) for file in response_files]
if sort(response_names) != sort(all_response)
    @warn "Response files found on disk differ from `all_response`" response_names all_response
end

# get the variety2 entries
variety = CSV.read("../raw-data/metadata.csv", DataFrame)
variety = variety[:, Not(2:11)]
variety = variety[:, 1:2]
variety = variety[completecases(variety), :]
variety = filter(row -> !(row.Variety2 == "NA"), variety);

gen_response = deepcopy(response);

id = 1
for i in eachindex(response)
    # `global` keeps this loop valid when run as a plain script, not only in a
    # notebook or the REPL.
    global id = do_one_response(response, gen_response, variety, id, i)
end
# now remove excessive response
for i in eachindex(gen_response)
    gen_response[i] = remove_label(gen_response[i])
end

for (level, table) in zip(all_level, otu)
    write_otu(table, level)
end
for (name, table) in zip(response_names, gen_response)
    write_response(table, name)
end


└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208
└ @ DataFrames C:\Users\Administrator\.julia\packages\DataFrames\GtZ1l\src\abstractdataframe\reshape.jl:208
