In [18]:
using CSV
using MLBase
using Distances
using Clustering
using DataFrames
using StatsBase
using Hungarian
using LinearAlgebra
using JLD
using SparseArrays

In [2]:
"""
    ground_true(df_1, df_2)

Build the ground-truth label vector for two stacked datasets: label `1` repeated once
per row of `df_1`, followed by label `2` repeated once per row of `df_2`.
"""
function ground_true(df_1, df_2)
    n_first = size(df_1, 1)
    n_second = size(df_2, 1)
    return vcat(fill(1, n_first), fill(2, n_second))
end

"""
    standardize_tree(tree)

Convert `tree` (a table or matrix with one tree per row) into a matrix in which each
column is a tree, z-score every row of that matrix, and return the result.
"""
function standardize_tree(tree)
    # Transpose so that each column holds one tree.
    columns = Matrix(adjoint(Matrix(tree)))

    # Standardize along dims=2, i.e. each row is centred and scaled across all trees.
    zscore_fit = fit(ZScoreTransform, columns; dims=2)
    scaled = StatsBase.transform(zscore_fit, columns)

    # Entries that came out as NaN are set to 0.
    scaled[isnan.(scaled)] .= 0
    return scaled
end

"""
    accuracy(n, gt, pred)

Score predicted cluster labels `pred` against ground-truth labels `gt` for `n` classes.

Cluster ids are arbitrary, so the columns of the confusion matrix are first permuted
with the Hungarian algorithm to the assignment that maximizes the diagonal.

Returns a tuple `(aligned, score)`: the column-permuted confusion matrix and the
fraction of its total count that lies on the diagonal.
"""
function accuracy(n, gt, pred)
    counts = confusmat(n, gt, pred)

    # The Hungarian algorithm minimizes cost, so flip counts into costs
    # (the largest count becomes cost 0).
    cost = maximum(counts) .- counts
    assignment = first(hungarian(cost))

    aligned = counts[:, assignment]
    score = tr(aligned) / sum(aligned)
    return aligned, score
end

accuracy (generic function with 1 method)

In [15]:
# Read two of the CSV datasets and convert each one directly to a plain matrix.
tree1 = Matrix(CSV.read("data/4_diff_topo_1_100_1.csv", DataFrame));
tree2 = Matrix(CSV.read("data/4_diff_topo_2_100_1.csv", DataFrame));

In [4]:
# Label every row by the dataset it came from, then stack both datasets and
# standardize them (the result has one tree per column).
gt = ground_true(tree1, tree2);
tree = standardize_tree(vcat(tree1, tree2))

7×200 Matrix{Float64}:
 -0.412284   2.23811    0.152577  …  -0.291972  -0.838893   1.12883
  0.436509  -1.00641    0.225555     -0.17223   -0.435042  -0.412536
  0.793536   1.29313   -0.145328     -0.427791  -0.317066  -0.927798
  0.722671   0.725172  -0.485246     -0.36146   -0.598092  -0.577828
  0.361361  -0.623037   1.13121      -0.623037  -0.623037  -0.623037
 -0.598079  -0.196737  -0.598079  …   0.538041   0.848036   0.270425
 -0.201355  -0.201355  -0.201355     -0.201355  -0.201355  -0.201355

In [5]:
# Euclidean distance between every pair of columns (trees) of `tree`.
matrix = pairwise(Euclidean(), tree; dims=2)

200×200 Matrix{Float64}:
 0.0      3.23826  1.8157    2.57955   …  2.30474   2.64192   3.07791
 3.23826  0.0      3.55527   5.10812      3.43211   3.90253   2.90375
 1.8157   3.55527  0.0       1.77073      2.19525   2.57493   2.41096
 2.57955  5.10812  1.77073   0.0          3.16256   3.38353   3.735
 2.32548  1.30908  2.45542   4.08519      2.47856   2.99054   2.07981
 4.19538  1.33259  4.2447    5.76542   …  4.3535    4.916     3.42143
 2.98824  5.52398  2.35226   0.672688     3.58117   3.81592   4.13926
 1.89659  3.42187  0.714748  1.84142      2.57545   3.0441    2.58731
 1.41331  2.55693  1.70892   2.78886      2.68344   3.23002   2.72735
 2.2727   4.87136  1.51609   0.464822     2.80543   3.00091   3.47937
 1.72904  2.76678  1.99719   3.42776   …  1.73851   1.9914    2.2832
 1.23885  4.12904  1.7001    1.73411      2.96424   3.23254   3.68644
 2.32074  3.52365  1.101     2.5933       1.93061   2.18417   2.08522
 ⋮                                     ⋱                      
 2.24

In [6]:
"""
    hc_label(matrix; linkage=:single, k=2)

Run hierarchical clustering on the pairwise distance matrix `matrix` and return the
cluster assignment of every observation.

# Keywords
- `linkage`: linkage criterion forwarded to `hclust` (defaults to `:single`).
- `k`: number of clusters to cut the dendrogram into (defaults to `2`, the value
  that was previously hard-coded).
"""
function hc_label(matrix; linkage=:single, k=2)
    dendrogram = hclust(matrix; linkage=linkage)
    return cutree(dendrogram; k=k)
end

hc_label (generic function with 1 method)

In [7]:
# Cluster the distance matrix (single linkage, stated explicitly here) and score
# the resulting labels against the ground truth.
pred = hc_label(matrix; linkage=:single)
accuracy(2, gt, pred)

([1 99; 0 100], 0.505)

In [8]:
"""
    hc_matrix(trees, path)

For every unordered pair of datasets in `trees`, stack the two datasets, standardize
them, cluster them into two groups with `hc_label`, and record how well the clusters
recover the dataset of origin (see `accuracy`).

The scores are stored in a symmetric `n × n` matrix (`n = length(trees)`, diagonal
left at zero) which is written to `path` as CSV with column headers `"1"` … `"n"`.
Returns whatever `CSV.write` returns.
"""
function hc_matrix(trees, path)
    n = length(trees)
    result = zeros(n, n)

    # Visit the strict lower triangle and mirror each score across the diagonal.
    for i in 2:n, j in 1:(i - 1)
        first_set, second_set = trees[i], trees[j]
        labels = ground_true(first_set, second_set)
        features = standardize_tree(vcat(first_set, second_set))
        distances = pairwise(Euclidean(), features; dims=2)
        _, score = accuracy(2, labels, hc_label(distances))
        result[i, j] = result[j, i] = score
    end

    return CSV.write(path, DataFrame(result, :auto); header=string.(1:n))
end

hc_matrix (generic function with 1 method)

In [9]:
# For each of the 100 replicates, load the 15 "4_diff_topo_*_50_*" CSV datasets and
# write their pairwise clustering-accuracy matrix to data/hc/.
for i in 1:100
    # A comprehension gives a concretely typed vector instead of the `Vector{Any}`
    # produced by `[]` + `push!`, and binds no `tree` variable in the loop body.
    trees = [
        CSV.read(string("data/4_diff_topo_", j, "_50_", i, ".csv"), DataFrame)
        for j in 1:15
    ]

    hc_matrix(trees, string("data/hc/hc_4_diff_topo_50_", i, ".csv"))
end


In [31]:
# For each of the 100 replicates, load the 15 "4_diff_topo_*_100_*" CSV datasets and
# write their pairwise clustering-accuracy matrix to data/hc/.
for i in 1:100
    # A comprehension gives a concretely typed vector instead of the `Vector{Any}`
    # produced by `[]` + `push!`, and binds no `tree` variable in the loop body.
    trees = [
        CSV.read(string("data/4_diff_topo_", j, "_100_", i, ".csv"), DataFrame)
        for j in 1:15
    ]

    hc_matrix(trees, string("data/hc/hc_4_diff_topo_100_", i, ".csv"))
end


In [None]:
# For each of the 100 replicates, load the 15 "4_diff_topo_*_500_*" CSV datasets and
# write their pairwise clustering-accuracy matrix to data/hc/.
for i in 1:100
    # A comprehension gives a concretely typed vector instead of the `Vector{Any}`
    # produced by `[]` + `push!`, and binds no `tree` variable in the loop body.
    trees = [
        CSV.read(string("data/4_diff_topo_", j, "_500_", i, ".csv"), DataFrame)
        for j in 1:15
    ]

    hc_matrix(trees, string("data/hc/hc_4_diff_topo_500_", i, ".csv"))
end

In [10]:
# For each of the 100 replicates, load the 15 "4_diff_topo_*_1000_*" CSV datasets and
# write their pairwise clustering-accuracy matrix to data/hc/.
for i in 1:100
    # A comprehension gives a concretely typed vector instead of the `Vector{Any}`
    # produced by `[]` + `push!`, and binds no `tree` variable in the loop body.
    trees = [
        CSV.read(string("data/4_diff_topo_", j, "_1000_", i, ".csv"), DataFrame)
        for j in 1:15
    ]

    hc_matrix(trees, string("data/hc/hc_4_diff_topo_1000_", i, ".csv"))
end

In [19]:
# For each of the 100 replicates, load the "tree" entry of the 15
# "8_diff_topo_*_50_*" JLD files and write their pairwise clustering-accuracy
# matrix to data/hc/.
for i in 1:100
    # `map` gives a vector typed by what the files actually contain, instead of the
    # `Vector{Any}` produced by `[]` + `push!`.
    trees = map(1:15) do j
        path = string("data/8_diff_topo_", j, "_50_", i, ".jld")
        # The do-block form closes the file once the read finishes.
        jldopen(path, "r") do file
            read(file, "tree")
        end
    end

    hc_matrix(trees, string("data/hc/hc_8_diff_topo_50_", i, ".csv"))
end
