In [1]:
import re 
import json
from functools import reduce
from collections import OrderedDict

import hail
import dask.dataframe as dd
from dask.distributed import Client
import pyarrow as pa
import pandas as pd
from google.cloud import storage

from data_pipeline.datasets.tob import helpers

In [2]:
client = storage.Client()
bucket = client.bucket(helpers.get_gcp_bucket_name())

# List the bucket a single time and reuse the names for both searches below;
# every `list_blobs()` call iterates over the bucket's contents again.
blob_names = [blob.name for blob in bucket.list_blobs()]

# The input path is a literal string, so escape it (and the dots in the file
# names) before embedding it in a regular expression.
input_path = re.escape(helpers.build_analaysis_input_path(absolute_path=False))

pattern = rf"{input_path}/Genotypes/genotype_chr22\.tsv"
chr22_genotype_files = [name for name in blob_names if re.search(pattern, name)]

pattern = rf"{input_path}/Residuals/(.*)_chr22_log_residuals\.tsv"
chr22_expression_files = [name for name in blob_names if re.search(pattern, name)]

# The cell type is taken as the part of the file name before the first "_".
cell_types = [p.split('/')[-1].split('_')[0] for p in chr22_expression_files]

# Dask
Attempt to convert the tables from wide to long format using Dask, saving the result as Parquet.

In [3]:
# Start a Dask distributed client with 2 workers and 2 threads per worker.
# NOTE: this rebinds `client`, which an earlier cell assigned to the
# google.cloud.storage client; the `bucket` handle created from it is a
# separate name and is not rebound here.
client = Client(n_workers=2, threads_per_worker=2)

distributed.diskutils - INFO - Found stale lock file and directory '/home/daniel/git/exome-results-browsers/data_pipeline/dask-worker-space/worker-otm9g5wk', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/daniel/git/exome-results-browsers/data_pipeline/dask-worker-space/worker-i1yn2sg8', purging


In [4]:
# Column types requested for the genotype parquet dataset.
genotype_parquet_schema = {
    "sample_id": pa.string(),
    "snp_id": pa.string(),
    "chrom": pa.string(),
    "bp": pa.int64(),
    "global_bp": pa.int64(),
    "genotype": pa.int8(),
}

# For chromosomes 1..22: read the wide genotype table, melt it to one row per
# (sample, SNP), derive position columns and write it to a shared parquet
# dataset partitioned by chromosome.
for chrom in range(1, 23):
    print(f"Loading wide dataframe for chrom '{chrom}'")
    genotype_path = f"{helpers.build_analaysis_input_path(absolute_path=True)}/Genotypes/genotype_chr{chrom}.tsv"
    wide_df: dd.DataFrame = dd.read_table(
        genotype_path,
        blocksize=250e6,
        sample=6000000,
    )

    print("Melting to long dataframe")
    renamed_df = wide_df.rename(columns={"sampleid": "sample_id"})
    long_df = renamed_df.melt(
        id_vars=["sample_id"],
        var_name="snp_id",
        value_name="genotype",
    )

    print("Applying column methods")
    long_df["chrom"] = str(chrom)
    # The position is the text between the first ":" and the following "_"
    # of the SNP identifier.
    long_df["bp"] = long_df["snp_id"].apply(
        lambda snp: int(snp.split(":")[1].split("_")[0]),
        meta=int,
    )
    long_df["global_bp"] = long_df["bp"].apply(
        lambda pos: helpers.local_to_global_coordinates(pos, chrom, reference=helpers.get_reference_genome()),
        meta=int,
    )

    print("Saving to parquet")
    # The first chromosome replaces any existing dataset; later ones append.
    is_first_chrom = chrom == 1
    long_df.to_parquet(
        path=f"{helpers.build_output_path()}/genotypes/genotypes.parquet",
        engine="fastparquet",
        overwrite=is_first_chrom,
        append=not is_first_chrom,
        partition_on="chrom",
        schema=genotype_parquet_schema,
    )

Loading wide dataframe for chrom '1'
Melting to long dataframe
Applying column methods
Saving to parquet


In [None]:
# Load the gene symbol -> gene id mapping (JSON) from the output bucket.
mapping_file = f"{helpers.build_output_path()}/metadata/gene_symbol_to_id.json".replace(f"gs://{helpers.get_gcp_bucket_name()}/", "")
blob = bucket.get_blob(mapping_file)
if blob is None:
    # Fail here with a clear message instead of later, inside a Dask task,
    # when the mapping would be indexed while still being None.
    raise FileNotFoundError(
        f"Gene symbol mapping not found at 'gs://{helpers.get_gcp_bucket_name()}/{mapping_file}'"
    )
symbol_mapping = json.loads(blob.download_as_string())

# Only the first write of this run may replace the dataset; every later
# (chromosome, cell type) combination must append to it. Keying this on
# `chrom == 1` would never overwrite when chromosome 1 is not in the loop and
# would overwrite once per cell type when it is.
is_first_write = True

# Currently restricted to chromosome 22 (presumably to be widened to
# range(1, 23) later).
for chrom in [22]:
    for cell_type in cell_types:
        print("Loading wide dataframe")
        wide_df: dd.DataFrame = dd.read_table(
            f"{helpers.build_analaysis_input_path(absolute_path=True)}/Residuals/{cell_type}_chr{chrom}_*.tsv",
            blocksize=250e6,
            sample=1000000
        )

        print("Melting to long dataframe")
        long_df = wide_df\
            .rename(columns={"sampleid": "sample_id"})\
            .melt(
                id_vars=["sample_id"],
                var_name="gene_symbol",
                value_name="log_cpm",
            )

        print("Applying column methods")
        # Raises KeyError at compute time for symbols missing from the mapping.
        long_df["gene_id"] = long_df["gene_symbol"].apply(lambda x: symbol_mapping[x], meta=str)
        long_df["chrom"] = str(chrom)
        long_df["cell_type_id"] = cell_type

        print("Saving to parquet")
        long_df.to_parquet(
            path=f"{helpers.build_output_path()}/expression/expression.parquet",
            engine="fastparquet",
            overwrite=is_first_write,
            append=not is_first_write,
            partition_on="cell_type_id",
            schema={
                "sample_id": pa.string(),
                "gene_id": pa.string(),
                "cell_type_id": pa.string(),
                "gene_symbol": pa.string(),
                "chrom": pa.string(),
                "log_cpm": pa.float64(),
            }
        )
        is_first_write = False

In [6]:
def read_table(path, bucket, row_keys, verbose=False, annotations=None):
    """Import a tab-separated file from a bucket as a keyed Hail table.

    The header line is read directly from the blob to build the import
    schema: columns named in ``row_keys`` are typed as strings and every
    other column as a float.

    Args:
        path: Location of the file, either as ``gs://<bucket>/<object>`` or
            as an object name relative to ``bucket``.
        bucket: Bucket object (e.g. a google.cloud.storage bucket) exposing
            ``name`` and ``get_blob``.
        row_keys: Column names that are typed as strings and used as the
            table key.
        verbose: When true, print the full path being loaded.
        annotations: Optional mapping of field name to value; the fields are
            added with ``Table.annotate`` and recorded as string fields.

    Returns:
        A ``(table, row_fields)`` tuple: the table keyed by ``row_keys`` and
        an ``OrderedDict`` mapping each field name to its Hail type.

    Raises:
        FileNotFoundError: If ``bucket`` has no blob at ``path``.
    """
    relative_path = path.replace(f"gs://{bucket.name}/", "")
    blob = bucket.get_blob(relative_path)
    if blob is None:
        # `get_blob` signals a missing object by returning None; report it
        # explicitly rather than failing on `None.open` below.
        raise FileNotFoundError(
            f"No blob found at 'gs://{bucket.name}/{relative_path}'"
        )

    # Only the header line is needed to discover the column names.
    with blob.open("r") as handle:
        header = handle.readline()
    columns = [
        name
        for name in (field.strip() for field in header.split("\t"))
        if name
    ]

    row_fields = OrderedDict(
        (col, hail.tstr if col in row_keys else hail.tfloat)
        for col in columns
    )

    full_path = f"gs://{bucket.name}/{blob.name}"
    if verbose:
        print(f"Loading from path '{full_path}'")

    table = hail.import_table(full_path, types=row_fields, delimiter="\t")

    if annotations:
        table = table.annotate(**annotations)
        row_fields.update({k: hail.tstr for k in annotations.keys()})

    return table.key_by(*row_keys), row_fields


def merge_tables(tables, unify=True):
    """Union a sequence of tables into one, left to right.

    Args:
        tables: Non-empty sequence of objects exposing
            ``union(other, unify=...)``.
        unify: Forwarded to every ``union`` call.

    Returns:
        The first table unioned successively with each remaining table; a
        single-element sequence yields that element unchanged.

    Raises:
        IndexError: If ``tables`` is empty.
    """
    merged = tables[0]
    for other in tables[1:]:
        merged = merged.union(other, unify=unify)
    return merged


def melt(df):
    """Reshape a wide frame to long format.

    ``sampleid`` is kept as the identifier column; every other column name
    goes into ``gene_symbol`` and its value into ``log_residual``.
    """
    melt_options = {
        "id_vars": ["sampleid"],
        "var_name": "gene_symbol",
        "value_name": "log_residual",
    }
    return df.melt(**melt_options)


In [8]:
# expression_table = merge_tables([
#     read_table(
#         path=p, 
#         bucket=bucket, 
#         row_keys=["sampleid", "cell_type_id"], 
#         annotations={"cell_type_id": p.split("/")[-1].split("_")[0]}
#     )[0] 
#     for p in chr22_expression_files[0:2]
# ])

# genotype_table = merge_tables([
#     read_table(path=p, bucket=bucket, row_keys=["sampleid"])[0] 
#     for p in chr22_genotype_files
# ])

# genotype_df = pd.read_table(f"gs://{helpers.get_gcp_bucket_name()}/{chr22_genotype_files[0]}", delimiter="\t")
# expression_df = pd.read_table(f"gs://{helpers.get_gcp_bucket_name()}/{chr22_expression_files[0]}", delimiter="\t")

In [9]:
import os

from pyspark.sql import SparkSession

# Service-account key file handed to the Hadoop configuration. Prefer the
# standard GOOGLE_APPLICATION_CREDENTIALS environment variable; fall back to
# the original developer-local path so existing setups keep working.
SERVICE_ACCOUNT_KEYFILE = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS", "/home/daniel/keys/sa.key"
)

# Single-threaded local Spark session.
spark = (
    SparkSession
    .builder
    .master("local[1]")
    .appName("tob-wgs")
    .getOrCreate()
)

spark\
    ._jsc\
    .hadoopConfiguration()\
    .set("google.cloud.auth.service.account.json.keyfile", SERVICE_ACCOUNT_KEYFILE)

# Read the first chr22 expression file as a tab-separated table. The separator
# is passed once via `sep`; the previous misspelled `delimter` option was not
# a recognised key and had no effect.
table = (
    spark
    .read
    .options(header=True, inferSchema=True)
    .csv(
        f"gs://{helpers.get_gcp_bucket_name()}/{chr22_expression_files[0]}",
        sep="\t",
    )
)

# Every column except the identifier columns is treated as a gene.
genes = [c for c in table.columns if c not in ["sampleid", "cell_type_id"]]



In [None]:
from pyspark.sql import DataFrame

# Distribute the gene names as an RDD.
rdd = spark.sparkContext.parallelize(genes)

def compute_histogram(gene: str, table: DataFrame):
    """Compute a 30-bucket histogram of one column of ``table``.

    Args:
        gene: Name of the column to summarise.
        table: Spark DataFrame containing that column.

    Returns:
        The result of ``RDD.histogram(30)`` over the column's values.
    """
    return (
        table
        .select(gene)
        .rdd
        # Each element is a single-field Row; flatten it to the bare value.
        # `RDD.values()` is meant for (key, value) pairs and reads position 1,
        # which does not exist on a one-column Row.
        .flatMap(lambda row: row)
        .histogram(30)
    )


In [25]:
def compute_histogram(gene):
    """Compute a 30-bucket histogram of ``gene`` in the first chr22 expression file.

    The file is re-read from the bucket on every call using the module-level
    ``spark`` session, so this has to run where that session is available.

    Args:
        gene: Name of the column to summarise.

    Returns:
        The result of ``RDD.histogram(30)`` over the column's values.
    """
    # The separator is supplied via `sep`; the previous misspelled `delimter`
    # option was not a recognised key and had no effect.
    table = (
        spark
        .read
        .options(header=True, inferSchema=True)
        .csv(
            f"gs://{helpers.get_gcp_bucket_name()}/{chr22_expression_files[0]}",
            sep="\t",
        )
    )

    return (
        table
        .select(gene)
        .rdd
        # Flatten each single-field Row to its bare value.
        .flatMap(lambda x: x)
        .histogram(30)
    )

# Map compute_histogram over the gene names. `map` is a lazy transformation,
# so nothing is computed until an action is run on `histograms`.
# NOTE(review): compute_histogram references the driver-side `spark` session,
# and Spark does not support using the session/context inside a transformation
# (SPARK-5063), so this will likely fail once an action is triggered. Consider
# looping over `genes` on the driver instead. TODO confirm.
histograms = rdd.map(compute_histogram)