In [None]:
## Python Package Import
import sys
import os 
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
%env JOB_NAME=calculate_ld_scores

In [None]:
# Sub-directory of the LDSC input area that holds the per-chromosome PLINK files.
input_dir = 'meta_analysis_mixed'

# Task file for parallelization: one row per chromosome.
PARAMETER_FILENAME = 'gs~/data/cov-ldsc/task_files/calculate_LD_scores.tsv'

# Chromosomes 1 through 22.
chromosomes = range(1, 23)

# Common path prefix of the .bed/.bim/.fam trio, one entry per chromosome.
plink_prefixes = ['gs~/data/cov-ldsc/ldsc_input/' + input_dir + '/chr' + str(chrom) + '_map_unrelated'
                  for chrom in chromosomes]

# Column order is CHR, BED, BIM, FAM; the header strings are written verbatim
# as the first line of the TSV.
task_columns = {'--env CHR': chromosomes}
for extension in ('bed', 'bim', 'fam'):
    task_columns['--input ' + extension.upper()] = [prefix + '.' + extension
                                                    for prefix in plink_prefixes]

df = pd.DataFrame(data=task_columns)
df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

In [None]:
# Export the task-file path and the input sub-directory name as environment
# variables so that %%bash cells can read them as ${PARAMETER_FILENAME} and ${BIOBANK}.
%env PARAMETER_FILENAME={PARAMETER_FILENAME}
%env BIOBANK={input_dir}

In [None]:
%%writefile ~/aou_dsub.bash
#!/bin/bash

# This file defines the shell function aou_dsub; load it with `source ~/aou_dsub.bash`.
#
# aou_dsub passes reasonable defaults for several dsub parameters, while allowing the
# caller to override any of them. It creates a nice folder structure within the
# workspace bucket for dsub log files.

# --[ Parameters ]--
# any valid dsub parameter flag; they are appended after the defaults via "$@"

#--[ Returns ]--
# the job id of the job created by dsub

#--[ Details ]--
# The first five parameters below (--provider, --user-project, --project, --network,
# --subnetwork) should always be those values when running on AoU RWB.

# Feel free to change the values for --user, --regions, --logging, and --image if you like.

# Note that we insert some job data into the logging path.
# https://github.com/DataBiosphere/dsub/blob/main/docs/logging.md#inserting-job-data

function aou_dsub () {

  # Get a shorter username to leave more characters for the job name.
  local DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

  # For AoU RWB projects network name is "network".
  local AOU_NETWORK=network
  local AOU_SUBNETWORK=subnetwork

  # Caller-supplied flags ("$@") come last, after every default.
  dsub \
      --provider google-cls-v2 \
      --user-project "${GOOGLE_PROJECT}" \
      --project "${GOOGLE_PROJECT}" \
      --network "${AOU_NETWORK}" \
      --subnetwork "${AOU_SUBNETWORK}" \
      --service-account "$(gcloud config get-value account)" \
      --image 'marketplace.gcr.io/google/ubuntu1804:latest' \
      --user "${DSUB_USER_NAME}" \
      --regions us-central1 \
      --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
      "$@"
}

In [None]:
%%bash

# Make the aou_dsub helper available in every new terminal session by sourcing
# ~/aou_dsub.bash (the file written by the %%writefile cell) from ~/.bashrc.
# The grep guard keeps re-runs of this cell from appending duplicate lines.
SOURCE_LINE='source ~/aou_dsub.bash'
grep -qxF "${SOURCE_LINE}" ~/.bashrc 2>/dev/null || echo "${SOURCE_LINE}" >> ~/.bashrc

In [None]:
%%bash --out JOB_NAME

# Submit the LD-score job through aou_dsub, one task per row of the task file.
# The cell's stdout is captured into the Python variable JOB_NAME (--out); the
# following cell exports that value as JOB_ID.

source ~/aou_dsub.bash

MACHINE_TYPE="n2-standard-4"

# Notes on the flags below:
#  * The task file already declares CHR, BED, BIM and FAM for every task, so no
#    job-level --input flags are passed for the PLINK files. The previous
#    BEDfile/BIMfile/FAMfile flags expanded ${BED}, ${BIM} and ${FAM} in this
#    submitting shell, where the notebook never exports them, and the command
#    never read those names.
#  * The --command string is single-quoted, so ${BED}, ${COV}, ${CHR} and
#    ${output} are expanded when the task runs, not by this cell.
#  * --bfile takes the PLINK prefix, obtained by stripping ".bed" from the BED
#    path. This replaces a path that hard-coded the bucket and the
#    meta_analysis_mixed directory, so the cell now follows BIOBANK/input_dir.
#    NOTE(review): assumes the .bim and .fam files are localized next to the
#    .bed file, as the previous hard-coded path implied - confirm.
aou_dsub \
    --tasks "${PARAMETER_FILENAME}" \
    --machine-type "${MACHINE_TYPE}" \
    --image "gcr.io/escalator-docker-image/cov_ldsc:latest" \
    --name "${JOB_NAME}" \
    --output-recursive output="${WORKSPACE_BUCKET}/data/cov-ldsc/LD_scores/${BIOBANK}" \
    --input COV="gs~/data/cov-ldsc/ldsc_input/${BIOBANK}_plink2.eigenvec" \
    --command 'set -o errexit && \
               set -o xtrace && \
               python /opt/ldsc.py \
                     --bfile "${BED%.bed}" \
                     --l2 --ld-wind-cm 20 \
                     --cov "${COV}" \
                     --out "${output}/${CHR}"'

In [None]:
# The submission cell captured its stdout into the Python variable JOB_NAME
# (%%bash --out JOB_NAME). Export that value as the environment variable JOB_ID
# to make it available from %%bash cells.
%env JOB_ID={JOB_NAME}

In [None]:
%%bash

# Query the status of the submitted job (${JOB_ID}) for this user (${USER_NAME}),
# matching every job status.
# For more detailed output, add a `--full` flag to the command. The previous
# trailing backslash that continued into a commented-out `--full` line has been
# removed, so lines added after the command are no longer absorbed into it.
dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs "${JOB_ID}" \
    --users "${USER_NAME}" \
    --status '*'