In [1]:
using DataFrames
using Statistics
using Random


In [None]:
"""
    df2fasta(df, id, seq, path)

Write the columns `id` and `seq` of `df` to `path` as FASTA records
(`>id` on one line, the sequence on the next).

The file is opened in append mode, so records are added after any content
already present at `path`. Returns `nothing`.
"""
function df2fasta(df, id, seq, path)
    open(path, "a") do io
        # Walk both columns in lockstep; one header line plus one sequence line each.
        foreach(zip(df[!, id], df[!, seq])) do (header, residues)
            print(io, ">", header, "\n", residues, "\n")
        end
    end
    return nothing
end

"""
    fasta2df(input::String)

Read the FASTA file at `input` into a `DataFrame` with `String` columns
`id` and `seq`.

Each line starting with `>` opens a new record whose id is everything after
the leading `>`. All following non-header lines are concatenated into that
record's sequence. Records with an empty sequence are kept, and an empty
file yields a `DataFrame` with zero rows. Sequence lines that appear before
the first header are collected under the empty id `""`.
"""
function fasta2df(input::String)
    ids = String[]
    seqs = String[]

    id = ""
    buf = IOBuffer()    # accumulates the current record without O(n^2) concatenation
    started = false     # true once a header or sequence line has been seen

    # The do-block form guarantees the file handle is closed, even on error.
    open(input) do io
        for line in eachline(io)
            if startswith(line, ">")    # header line: flush the previous record
                if started
                    push!(ids, id)
                    push!(seqs, String(take!(buf)))
                end
                id = line[2:end]
                started = true
            elseif !isempty(line)       # sequence line
                print(buf, line)
                started = true
            end
        end
    end

    # Flush the final record (nothing to flush for an empty file).
    if started
        push!(ids, id)
        push!(seqs, String(take!(buf)))
    end

    return DataFrame(id = ids, seq = seqs)
end

"""
    structure2fs3di(input::String, output::String = "tmp", keep_3di::Bool=false)

Run `../bin/foldseek structureto3didescriptor` on `input`, writing to `output`,
and read the resulting tab-separated file into a `DataFrame` with columns
`id`, `seqaa`, `seq3di` and `coords`.

Unless `keep_3di` is `true`, `output` and `output.dbtype` are removed
afterwards, including when foldseek or the parser throws.
"""
function structure2fs3di(input::String, output::String = "tmp", keep_3di::Bool=false)
    # NOTE(review): `CSV` is not loaded by the `using` lines visible in this file;
    # confirm that `using CSV` is executed elsewhere before this is called.
    try
        run(`../bin/foldseek structureto3didescriptor -v 0 $input $output`)
        return CSV.read(
            output,
            DataFrame,
            delim="\t",
            header=["id","seqaa", "seq3di", "coords"],
        )
    finally
        # Clean up in `finally` so a failed run or parse does not leave files behind.
        if !keep_3di
            rm.([output, "$output.dbtype"], force=true)
        end
    end
end

"""
    shuffle_string([rng::AbstractRNG,] s)

Return a new `String` containing the characters of `s` in random order.

The shuffle operates on characters (`collect(s)`), not bytes. Pass an explicit
`rng` to make the permutation reproducible; without it the default RNG is used.
"""
function shuffle_string(rng::AbstractRNG, s)
    return String(shuffle(rng, collect(s)))
end

shuffle_string(s) = shuffle_string(Random.default_rng(), s)

"""
    segmasker(input)

Run `../bin/segmasker` on the FASTA file `input` (FASTA in, FASTA out) and
return its output parsed by `fasta2df`.

The tool's output goes to a fresh temporary path rather than a fixed file in
the working directory, and that file is removed even if the run or the
parsing fails.
"""
function segmasker(input)
    # A unique scratch path avoids clashing with other code that writes to "tmp".
    masked_path = tempname()
    try
        run(`../bin/segmasker -in $input -out $masked_path -infmt fasta -outfmt fasta`)
        return fasta2df(masked_path)
    finally
        rm(masked_path, force=true)
    end
end



In [93]:
# Build the descriptor table for ../../scop40PDB, keeping the fs.3di files on disk.
df = structure2fs3di("../../scop40PDB", "fs.3di", true)

# The coordinate column is not used below; drop it in place.
select!(df, Not(:coords))

# Keep only the first seven characters of every id.
transform!(df, :id => ByRow(raw_id -> raw_id[1:7]) => :id)

# End on `nothing` so the cell displays no output.
nothing

In [None]:
# Add a shuffled counterpart (`shufl_<name>`) for each sequence column,
# amino-acid column first, then the 3Di column.
for src in (:seqaa, :seq3di)
    df[!, Symbol("shufl_", src)] = map(shuffle_string, df[!, src])
end
first(df, 3)

In [98]:
# For each sequence column, export it as FASTA, run segmasker on it, and record
# how many characters of the output are lowercase (absolute and relative).
# The table is opened with "w" because the header is written on every run;
# append mode would stack duplicate headers and rows on each rerun.
open("scop40_segmask.tsv", "w") do io
    write(io, "data\tmasked_total\tmasked_relative\n")

    for col in ["seqaa", "seq3di", "shufl_seqaa", "shufl_seq3di"]
        fasta_path = "scop40.$col.fasta"

        # df2fasta appends, so remove any file left by a previous run first;
        # otherwise the records (and the masked totals) would be duplicated.
        rm(fasta_path, force=true)
        df2fasta(df, "id", col, fasta_path)
        tmpdf = segmasker(fasta_path)

        megaseq = join(tmpdf.seq)
        n_masked = count(islowercase, megaseq)

        write(io, "$col\t$n_masked\t$(n_masked / length(megaseq))\n")
    end
end