In [1]:
import $file.^.init.spark, spark._
import $file.^.init.paths, paths._
import $file.^.init.glow, glow._
import $file.^.init.benchmark, benchmark._
import $file.^.init.functions, functions._
import $file.^.init.{plotly => init_plotly}, init_plotly._
import sys.process._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import io.projectglow.Glow
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond.{init => init_plotly_js, _}
import better.files.File
import org.apache.log4j.{Level, Logger}
// Only let WARN and above through for the "io.projectglow.plink" logger
// (presumably to keep the PLINK reader from flooding the notebook output).
Logger.getLogger("io.projectglow.plink").setLevel(Level.WARN)

// Runs `block` through the `optimer` helper from the benchmark init script,
// passing "glow" as the first argument and `op` as the name of the operation.
// `block` is by-name, so it is handed to `optimer` unevaluated.
def timeop[T](op: String)(block: => T) = {
    optimer("glow", op, block)
}

// Increase broadcast timeout from 300 to avoid "Could not execute broadcast in 300 secs."
// NOTE(review): getLocalSparkSession comes from the imported init scripts; which Spark
// settings these arguments map to is not visible from this notebook -- confirm there.
val ss = getLocalSparkSession(
    broadcastTimeoutSeconds=14400, 
    shufflePartitions=1,
    enableUI=true
) 
// Session implicits; later cells use the $"column" syntax.
import ss.implicits._
// Register Glow with this session. Later cells call SQL functions such as
// call_summary_stats / sample_call_summary_stats, presumably made available by this call.
Glow.register(ss)

// Initialize plotly for the Almond kernel.
// NOTE(review): offline=false presumably means plotly.js is loaded remotely -- confirm.
init_plotly_js(offline=false)

// Directory used for the input PLINK files and the Parquet output of this notebook.
val data_dir = GWAS_TUTORIAL_DATA_DIR / "2_PS_GWAS"
// Base name (without extension) of the 1KG dataset; ".bed" / ".parquet" are appended where used.
//val PS1_1KG_RAW_FILE = "ALL.2of4intersection.20100804.genotypes_no_missing_IDs"
val PS1_1KG_RAW_FILE = "ALL.2of4intersection.20100804.genotypes"

Loading spark-stubs


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/eczech/.cache/coursier/v1/https/repo1.maven.org/maven2/org/slf4j/slf4j-log4j12/1.7.16/slf4j-log4j12-1.7.16.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/eczech/.cache/coursier/v1/https/repo1.maven.org/maven2/org/slf4j/slf4j-log4j12/1.7.25/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]


Creating SparkSession


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
20/02/06 16:26:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[32mimport [39m[36m$file.$           , spark._
[39m
[32mimport [39m[36m$file.$           , paths._
[39m
[32mimport [39m[36m$file.$          , glow._
[39m
[32mimport [39m[36m$file.$               , benchmark._
[39m
[32mimport [39m[36m$file.$               , functions._
[39m
[32mimport [39m[36m$file.$                             , init_plotly._
[39m
[32mimport [39m[36msys.process._
[39m
[32mimport [39m[36morg.apache.spark.sql.DataFrame
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36mio.projectglow.Glow
[39m
[32mimport [39m[36mplotly._
[39m
[32mimport [39m[36mplotly.element._
[39m
[32mimport [39m[36mplotly.layout._
[39m
[32mimport [39m[36mplotly.Almond.{init => init_plotly_js, _}
[39m
[32mimport [39m[36mbetter.files.File
[39m
[32mimport [39m[36morg.apache.log4j.{Level, Logger}
[39m
defined [32mfunction[39m [36mtimeop[39m
[36mss[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql

### Convert 1KG to Parquet

In [6]:
// Load the raw 1KG PLINK dataset (located via its .bed file) with the "plink"
// data source and print the resulting schema.
// Delimiter overrides are intentionally left disabled:
//     .option("famDelimiter", "\t")
//     .option("bimDelimiter", "\t")
val df = ss.read
    .format("plink")
    .load((data_dir / PS1_1KG_RAW_FILE + ".bed").toString)
df.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- position: double (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genotypes: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- sampleId: string (nullable = true)
 |    |    |-- calls: array (nullable = true)
 |    |    |    |-- element: integer (containsNull = true)



[36mdf[39m: [32mDataFrame[39m = [contigName: string, names: array<string> ... 6 more fields]

In [7]:
// Write the PLINK-backed DataFrame out as Parquet, replacing any existing output,
// and time the whole write under the operation name "ps0".
timeop("ps0") {
    df.write
        .mode("overwrite")
        .parquet((data_dir / PS1_1KG_RAW_FILE + ".parquet").toString)
}

Elapsed time: 1099.5 seconds


### 1KG QC 

In [8]:
// Re-read the dataset from the Parquet copy written above and count its rows (variants).
val df = ss.read
    .parquet((data_dir / PS1_1KG_RAW_FILE + ".parquet").toString)
df.count()

[36mdf[39m: [32mDataFrame[39m = [contigName: string, names: array<string> ... 6 more fields]
[36mres7_1[39m: [32mLong[39m = [32m25488488L[39m

In [9]:
// Sanity check: distribution of the genotypes array length across rows.
df.groupBy(size($"genotypes"))
    .count()
    .show()

+---------------+--------+
|size(genotypes)|   count|
+---------------+--------+
|            629|25488488|
+---------------+--------+



In [8]:
// Sanity check: distribution of the genotypes array length across rows.
df.groupBy(size($"genotypes"))
    .count()
    .show()

+---------------+--------+
|size(genotypes)|   count|
+---------------+--------+
|            629|25488488|
+---------------+--------+



In [2]:
// To take a sample efficiently, this appears to be the best way
// (.limit will scan every partition first)
// val df = ss.read.parquet(data_dir / PS1_1KG_RAW_FILE + ".parquet" toString)
//     .transform(d => {ss.createDataFrame(d.takeAsList(100), d.schema)})

In [21]:
// QC functions

// Returns (n_variants, n_samples) for a variant DataFrame.
//   n_variants: number of rows in `df`.
//   n_samples:  length of the `genotypes` array on the first row returned, or 0 when
//               `df` has no rows (e.g. after QC filters removed every variant), instead
//               of failing with NoSuchElementException as a bare `.first` would.
// Note that this runs two Spark actions (a count and a one-row fetch).
def count(df: DataFrame): (Long, Int) = {
    val nVariants = df.count
    val nSamples = df
        .select(size(col("genotypes")))
        .head(1)
        .headOption
        .map(_.getAs[Int](0))
        .getOrElse(0)
    (nVariants, nSamples)
}

// def filterBySampleCallRate(threshold: Double)(df: DataFrame): DataFrame = { 
//     df
//         // Cross join original dataset with single-row data frame containing a map like (sampleId -> QC stats)
//         .crossJoin(
//             df
//             .selectExpr("sample_call_summary_stats(genotypes, referenceAllele, alternateAlleles) as qc")
//             .selectExpr("map_from_arrays(qc.sampleId, qc) as qc")
//         )
//         // For each row, filter the genotypes array (which has one element per sampleId) based on QC map lookup
//         .selectExpr("*", s"filter(genotypes, g -> qc[g.sampleId].callRate >= ${threshold}) as filtered_genotypes")
//         // Remove intermediate fields 
//         .drop("qc", "genotypes").withColumnRenamed("filtered_genotypes", "genotypes")
//         // Ensure that the original dataset schema was preserved
//         .transform(d => {assert(d.schema.equals(df.schema)); d})
// }

// Rewrite based on https://github.com/projectglow/glow/issues/148#issuecomment-582485763
//
// Removes from every row's `genotypes` array the samples whose call rate is below `threshold`.
//
//   threshold: minimum call rate (inclusive) a sample needs in order to be kept
//   df:        variant DataFrame with `genotypes`, `referenceAllele` and `alternateAlleles` columns
//   returns:   DataFrame with the same schema as `df` (enforced by an assert), where
//              `genotypes` only holds the samples that passed
//
// Per-sample stats are computed once over the whole dataset (`sample_call_summary_stats`)
// and attached to every row through a cross join. They are looked up by array position, so
// this relies on `genotypes` listing samples in the same order on every row and on `qc`
// following that order (NOTE(review): taken from the linked issue comment -- confirm).
def filterBySampleCallRate(threshold: Double)(df: DataFrame): DataFrame = {
  val qc = df.selectExpr("sample_call_summary_stats(genotypes, referenceAllele, alternateAlleles) as qc")
  df.crossJoin(qc)
    // Pair each genotype with its 0-based position, keep the pairs whose sample passed QC,
    // then unwrap the genotype again. `sequence` is inclusive on both ends, so the upper
    // bound must be size - 1; using size yields one extra index that zip_with pads with a
    // null genotype and that indexes past the end of `qc`.
    .selectExpr("*", s"""
        transform(
            filter(
                zip_with(sequence(0, size(genotypes) - 1), genotypes, (i, g) -> (i, g)), e -> 
                qc[e.i].callRate >= $threshold
            ), 
            e -> e.g
        ) as filtered_genotypes
    """)
    // Remove intermediate fields 
    .drop("qc", "genotypes")
    .withColumnRenamed("filtered_genotypes", "genotypes")
    // Ensure that the original dataset schema was preserved
    .transform(d => {assert(d.schema.equals(df.schema)); d})
}

// Keeps only the rows (variants) whose call rate, as reported by `call_summary_stats`
// over the row's genotypes, is at least `threshold`. The helper `qc` column is dropped
// again and the output schema is asserted to be equal to the input schema.
def filterByVariantCallRate(threshold: Double)(df: DataFrame): DataFrame = { 
    val withStats = df.selectExpr("*", "call_summary_stats(genotypes) as qc")
    val passing = withStats.where(col("qc.callRate") >= threshold)
    val result = passing.drop("qc")
    assert(result.schema.equals(df.schema))
    result
}

defined [32mfunction[39m [36mcount[39m
defined [32mfunction[39m [36mfilterBySampleCallRate[39m
defined [32mfunction[39m [36mfilterByVariantCallRate[39m

In [22]:
// Two QC rounds (call rate >= 0.8, then >= 0.98), each filtering variants first and samples second.
// Ran for ~3 hours (with both 1 and 200 shuffle partitions) and 
// used 30g of 64g heap before being killed.
val df_qc1 = Seq(.8, .98).foldLeft(df) { (filtered, rate) =>
    filtered
        .transform(filterByVariantCallRate(threshold=rate))
        .transform(filterBySampleCallRate(threshold=rate))
}

// Time the (n_variants, n_samples) count of the filtered dataset as operation "ps1".
val ct = timeop("ps1") { count(df_qc1) }
ct

: 

### Liftover

In [None]:
Get hg18 reference:
- Download ```chromFa.zip``` from http://hgdownload.cse.ucsc.edu/goldenPath/hg18/bigZips/chromFa.zip
- See notes and other downloads at same location: http://hgdownload.cse.ucsc.edu/goldenpath/hg18/chromosomes/
- Unzip and run ```rm *_random*``` to delete ```chrN_random.fa``` files
    - "The chrN_random.fa files contain clones that are not yet finished or cannot be placed with certainty at a specific place on the chromosome"
- Concatenate all the per-chromosome files together: ```