# Non Reference Gene Lookup Preparation

Converts genes_with_non_reference_ensembl_ids.tsv into a form amenable to matching, so that these transformations are done once here rather than repeated in every pipeline.

Generates genes_with_non_reference_ensembl_ids_lkp.tsv

In [1]:
import $file.^.SparkInit, SparkInit._
import ss.implicits._
import org.apache.spark.sql.functions._
import java.nio.file.Paths

Loading spark-stubs
Creating SparkSession


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19/12/09 19:40:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[32mimport [39m[36m$file.$          , SparkInit._
[39m
[32mimport [39m[36mss.implicits._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36mjava.nio.file.Paths[39m

In [2]:
// Location of the raw gene table, resolved relative to the user's home directory.
val path = Paths.get(
    System.getProperty("user.home"),
    "data", "ot", "extract", "resources",
    "genes_with_non_reference_ensembl_ids.tsv")
//val path = Paths.get(System.getProperty("user.home"), "repos", "ot", "data_pipeline", "mrtarget", "resources", "genes_with_non_reference_ensembl_ids.tsv")

// Read the tab-separated file, taking column names from the header row and
// letting Spark infer the column types.
val df = ss.read
    .options(Map(
        "sep" -> "\t",
        "inferSchema" -> "true",
        "header" -> "true"))
    .csv(path.toString)
df

[36mpath[39m: [32mjava[39m.[32mnio[39m.[32mfile[39m.[32mPath[39m = /home/eczech/data/ot/extract/resources/genes_with_non_reference_ensembl_ids.tsv
[36mdf[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mpackage[39m.[32mDataFrame[39m = [gene_symbol: string, ensembl_gene_id: string ... 3 more fields]
[36mres1_2[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mpackage[39m.[32mDataFrame[39m = [gene_symbol: string, ensembl_gene_id: string ... 3 more fields]

In [3]:
// Preview the first few rows to sanity-check the parsed columns.
df.show(numRows = 3)

+-----------+---------------+--------+-------------------+------------+
|gene_symbol|ensembl_gene_id|assembly|         chromosome|is_reference|
+-----------+---------------+--------+-------------------+------------+
|     ABCB11|ENSG00000073734|  GRCh38|                  2|           t|
|     ABCB11|ENSG00000276582|  GRCh38|CHR_HSCHR2_1_CTG7_2|           f|
| AC130343.1|ENSG00000274119|  GRCh38| CHR_HSCHR17_1_CTG2|           f|
+-----------+---------------+--------+-------------------+------------+
only showing top 3 rows



In [4]:
// Row count per assembly value.
df.groupBy($"assembly").count().show()

+--------+-----+
|assembly|count|
+--------+-----+
|  GRCh38|  561|
+--------+-----+



In [5]:
// Port of the transformations found at:
// https://github.com/opentargets/data_pipeline/blob/329ff219f9510d137c7609478b05d358c9195579/mrtarget/common/EvidenceString.py#L517
// Builds the alternate -> reference mapping; gene_symbol is carried along only
// for context and as the grouping key.
val dfp = {
    // Ensembl id for rows whose is_reference flag equals `flag`; null for all other rows.
    def idWhereFlag(flag: String) = when($"is_reference" === flag, $"ensembl_gene_id")

    // One reference id per gene: the last element of the collected set.
    val reference = element_at(collect_set(idWhereFlag("t")), -1).as("reference")
    // Every non-reference id recorded for the gene.
    val alternate = collect_set(idWhereFlag("f")).as("alternate")

    df.groupBy("gene_symbol").agg(reference, alternate)
}
dfp.show()

+-----------+---------------+--------------------+
|gene_symbol|      reference|           alternate|
+-----------+---------------+--------------------+
|     ABCB11|ENSG00000073734|   [ENSG00000276582]|
| AC130343.1|           null|   [ENSG00000274119]|
| AC203639.1|           null|   [ENSG00000276973]|
| AC239612.1|           null|   [ENSG00000276849]|
| AC239618.2|           null|   [ENSG00000276971]|
| AC243807.1|           null|   [ENSG00000274690]|
| AC246787.8|           null|   [ENSG00000277633]|
|       AGER|ENSG00000204305|   [ENSG00000237405]|
|        AGK|ENSG00000006530|   [ENSG00000262327]|
|     AGPAT1|ENSG00000204310|   [ENSG00000206324]|
|       AIF1|           null|[ENSG00000237727,...|
|       AKT3|ENSG00000117020|   [ENSG00000275199]|
| AL928742.2|           null|   [ENSG00000274497]|
| AL928742.3|           null|   [ENSG00000277016]|
|      ALOX5|ENSG00000012779|   [ENSG00000275565]|
|       APOM|ENSG00000204444|[ENSG00000231974,...|
|      ATAT1|ENSG00000137343|  

[36mdfp[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mpackage[39m.[32mDataFrame[39m = [gene_symbol: string, reference: string ... 1 more field]

In [6]:
// Flatten to one row per (reference, alternate) pair.
val dfe = dfp
    // Genes without a reference id cannot be mapped, so discard them up front
    .where($"reference".isNotNull)
    // Explode by alternate id, noting that it is ok if empty
    // arrays cause dropped rows since alternate id is the field to join on
    .select($"gene_symbol", $"reference", explode($"alternate").as("alternate"))
    .distinct()
dfe.show()

+-----------+---------------+---------------+
|gene_symbol|      reference|      alternate|
+-----------+---------------+---------------+
|     ABCB11|ENSG00000073734|ENSG00000276582|
|       AGER|ENSG00000204305|ENSG00000237405|
|        AGK|ENSG00000006530|ENSG00000262327|
|     AGPAT1|ENSG00000204310|ENSG00000206324|
|       AKT3|ENSG00000117020|ENSG00000275199|
|      ALOX5|ENSG00000012779|ENSG00000275565|
|       APOM|ENSG00000204444|ENSG00000231974|
|       APOM|ENSG00000204444|ENSG00000235754|
|       APOM|ENSG00000204444|ENSG00000226215|
|       APOM|ENSG00000204444|ENSG00000206409|
|       APOM|ENSG00000204444|ENSG00000224290|
|       APOM|ENSG00000204444|ENSG00000227567|
|      ATAT1|ENSG00000137343|ENSG00000223752|
|        B2M|ENSG00000166710|ENSG00000273686|
|       BAG6|ENSG00000204463|ENSG00000234651|
|       BRD2|ENSG00000204256|ENSG00000235307|
|      BTNL2|ENSG00000204290|ENSG00000225412|
|      BTNL2|ENSG00000204290|ENSG00000226127|
|      BTNL2|ENSG00000204290|ENSG0

[36mdfe[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [gene_symbol: string, reference: string ... 1 more field]

In [7]:
// Destination for the generated lookup table, next to the input resource.
val path = Paths.get(
    System.getProperty("user.home"), "data", "ot", "extract", 
    "resources", "genes_with_non_reference_ensembl_ids_lkp.tsv")
// Write tab-separated output so the content matches the .tsv extension and the
// delimiter of the input file; without an explicit "sep" the csv writer emits commas.
// coalesce(1) collapses the data to a single partition so only one part file is produced.
// NOTE(review): Spark's save() creates a directory at this path holding the part
// file rather than a single flat file; confirm downstream readers expect that.
dfe.coalesce(1)
    .write
    .format("csv")
    .option("sep", "\t")
    .option("header", "true")
    .save(path.toString)

[36mpath[39m: [32mjava[39m.[32mnio[39m.[32mfile[39m.[32mPath[39m = /home/eczech/data/ot/extract/resources/genes_with_non_reference_ensembl_ids_lkp.tsv