# Transcriptomics Scratch Pad

In [101]:
using DataFrames, CSV, Statistics, CairoMakie

include("/Users/fomo/Documents/Kaizen/code/bioinformatics/modules/Plotify.jl")
include("/Users/fomo/Documents/Kaizen/code/bioinformatics/modules/GeoPreprocess.jl")

nothing

## Preprocessing

In [100]:
# Location of the dataset: `master` is the data directory (relative to the current
# working directory) and `geoID` is the series subdirectory inside it.
master = "../data/GEO"
geoID = "GSE64375"
# Read <master>/<geoID>/metadata.csv into a DataFrame.
meta = CSV.read(joinpath(master, geoID, "metadata.csv"), DataFrame)

# Preview the first five rows of the raw metadata.
first(meta, 5)

Row,GSM_ID,Title,Source,Characteristics
Unnamed: 0_level_1,String15,String31,String,String
1,GSM1569806,Blood-0.0Gy-1hr-rep1,"blood, isolated leukocytes, 0.0 Gy radiation, 1hr post-exposure","volunteer: 1; radiation dose: 0.0 Gy; time: 1 hr; tissue: blood, isolated leukocytes"
2,GSM1569807,Blood-0.0Gy-1hr-rep2,"blood, isolated leukocytes, 0.0 Gy radiation, 1hr post-exposure","volunteer: 2; radiation dose: 0.0 Gy; time: 1 hr; tissue: blood, isolated leukocytes"
3,GSM1569808,Blood-0.0Gy-1hr-rep3,"blood, isolated leukocytes, 0.0 Gy radiation, 1hr post-exposure","volunteer: 3; radiation dose: 0.0 Gy; time: 1 hr; tissue: blood, isolated leukocytes"
4,GSM1569809,Blood-0.0Gy-1hr-rep4,"blood, isolated leukocytes, 0.0 Gy radiation, 1hr post-exposure","volunteer: 4; radiation dose: 0.0 Gy; time: 1 hr; tissue: blood, isolated leukocytes"
5,GSM1569810,Blood-0.3Gy-1hr-rep1,"blood, isolated leukocytes, 0.3 Gy radiation, 1hr post-exposure","volunteer: 1; radiation dose: 0.3 Gy; time: 1 hr; tissue: blood, isolated leukocytes"


In [107]:
"""
    parse_GSE64375(df::DataFrame)

Expand the `Characteristics` column of `df` into one column per attribute.

Each `Characteristics` entry is expected to look like
`"volunteer: 1; radiation dose: 0.0 Gy; time: 1 hr"`: attributes separated by `;`,
each attribute being `key: value`. Keys are stripped and their internal whitespace
is replaced by underscores (`"radiation dose"` becomes `:radiation_dose`); values are
stripped of surrounding whitespace and kept as text.

# Arguments
- `df`: table with at least the columns `GSM_ID` and `Characteristics`.

# Returns
A `DataFrame` with one row per row of `df`, holding `GSM_ID` plus one column per
attribute key. Attributes that a given row does not have are filled with `missing`.

# Notes
- A `missing` `Characteristics` entry produces a row that carries only its `GSM_ID`.
- Parts without a `:` separator are ignored.
"""
function parse_GSE64375(df::DataFrame)
    # One record (column name => value) per input row
    records = Vector{Dict{Symbol,Any}}(undef, nrow(df))

    for (i, row) in enumerate(eachrow(df))
        # Start each record with the GSM_ID
        record = Dict{Symbol,Any}(:GSM_ID => row.GSM_ID)

        # An empty CSV cell is read as `missing`; treat it as "no attributes"
        # instead of letting `split` throw a MethodError.
        characteristics = coalesce(row.Characteristics, "")

        for part in split(characteristics, ";")
            # Split at the first ':' only, so values may themselves contain ':'
            kv = split(part, ":", limit = 2)
            if length(kv) == 2
                # Normalize the key to a Symbol (spaces -> underscores)
                key = Symbol(replace(strip(kv[1]), r"\s+" => "_"))
                record[key] = strip(kv[2])
            end
        end

        records[i] = record
    end

    # Give every record the same key set, so rows lacking an attribute yield
    # `missing` instead of an error or a dropped column when the table is built.
    # When all rows already share their keys this loop inserts nothing.
    allkeys = Set{Symbol}()
    for record in records
        union!(allkeys, keys(record))
    end
    for record in records, key in allkeys
        haskey(record, key) || (record[key] = missing)
    end

    # Convert the array of Dicts into a DataFrame
    return DataFrame(records)
end

parse_GSE64375 (generic function with 1 method)

In [112]:
# Load the series via `load_GSE`, handing it the data directory and the
# dataset-specific metadata parser `parse_GSE64375`.
# NOTE(review): `load_GSE` is not defined in this notebook; presumably it comes from
# the included GeoPreprocess.jl — confirm.
GSE64375 = load_GSE(geoID, basepath=master, parser=parse_GSE64375)

# Rebind `meta` to the `meta` field of the loaded object; this replaces the raw
# metadata table read from CSV in the earlier cell.
meta = GSE64375.meta

Row,tissue,GSM_ID,volunteer,radiation_dose,time
Unnamed: 0_level_1,String31,String15,Int64,String7,String7
1,"blood, isolated leukocytes",GSM1569806,1,0.0 Gy,1 hr
2,"blood, isolated leukocytes",GSM1569807,2,0.0 Gy,1 hr
3,"blood, isolated leukocytes",GSM1569808,3,0.0 Gy,1 hr
4,"blood, isolated leukocytes",GSM1569809,4,0.0 Gy,1 hr
5,"blood, isolated leukocytes",GSM1569810,1,0.3 Gy,1 hr
6,"blood, isolated leukocytes",GSM1569811,2,0.3 Gy,1 hr
7,"blood, isolated leukocytes",GSM1569812,3,0.3 Gy,1 hr
8,"blood, isolated leukocytes",GSM1569813,4,0.3 Gy,1 hr
9,"blood, isolated leukocytes",GSM1569814,1,1.5 Gy,1 hr
10,"blood, isolated leukocytes",GSM1569815,2,1.5 Gy,1 hr


In [113]:
# Count the rows of `meta` in each `radiation_dose` group; the result has one row
# per distinct dose with the tally in a `count` column.
DataFrames.combine(groupby(meta, :radiation_dose), nrow => :count)

Row,radiation_dose,count
Unnamed: 0_level_1,String7,Int64
1,0.0 Gy,4
2,0.3 Gy,4
3,1.5 Gy,4
4,3.0 Gy,4
