In [1]:
import findspark
findspark.init()
findspark.find()

'/usr/local/spark/python/pyspark'

In [2]:
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .master("yarn")
         .appName("python-testing")
         .config("spark.executor.instances", 1)
         .config("spark.executor.memory", "1g")
         .getOrCreate())
sc = spark.sparkContext
sc

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-04-20 13:18:03,978 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-04-20 13:18:08,562 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [20]:
reads = sc.textFile('hdfs:///files/salmonella/SRR15404285.fasta').filter(lambda x: not x.startswith('>')).zipWithIndex().map(lambda x: (x[1], x[0]))
reads.take(1)

                                                                                

[(0,
  'TGCCGNCCTGAGCGAAAGCCTGCTGGAAGAAGTAGCTTCGCTGGTGGAATGGCCGGTGGTATTGACGGCGAAATT')]

In [21]:
reads.count()

                                                                                

2912309

In [22]:
test_reads = reads.take(100)
test_reads

[(0,
  'TGCCGNCCTGAGCGAAAGCCTGCTGGAAGAAGTAGCTTCGCTGGTGGAATGGCCGGTGGTATTGACGGCGAAATT'),
 (1,
  'GGGTTNATCCAGACTTCATCCGGCACCGCCTCATGCAGCATCAGCACATTGCTGTAGGTCGAGTGGGTATGCCCT'),
 (2,
  'CCCAAAGATACGGGCGCAGAAAAGGCCGTCACGCTCAGGTTTGAACGTACGGTAGTTGATGGTTTCCGGCTTTTT'),
 (3,
  'CTCACGGAGAAAAGCGAAAATAAACGATTGACTCTGAAGCGGGAAAGCGTAATATGCACACCACGCGACGCTGAG'),
 (4,
  'TGACCGTTTACGCGCCTGCCGTACCGCGCAGGAAGTCCTGGATCTCATTGACCGCACCAACGCGGCAGCTTAAGA'),
 (5,
  'AACAATACATTAGTTTCCAGCGAATTGCTGCCATCTGCTGGAAAAAAGGGGCCATGAAGGCCCCCTCTTTCTGAA'),
 (6,
  'ACGGATCAGCCTGAGCGCCAGCGTGCGTATGACGCACACGGCGTCGCCTGTGCCGTTGAGCACAATGTGATATGA'),
 (7,
  'CTCATCGAGCTCACAGCACATGCGCTTTTGTGTACGGGGCTGTCACCCTGTATCGCGCGCCTTTCCAGACGCTTC'),
 (8,
  'CGGTTTCGGTTTATGCCTGATGAACCTCCCGCGCCATTCCCCGGTGCGGAATTCCGTCTTCGTCATAATCATCGG'),
 (9,
  'CGCCTGCAAGGTGCCCATCACGCCAACAACCGGTCCGACGATACCGGCGGTACGGCAGTTGCGTTCAGGCTCGAC'),
 (10,
  'GAATGAAGTGCGCAGCGTGCAGGAAAAGCTGGAAAAAGCGCTCTCGCAGGTGGCAAATGAACCTATTAACGTGTT'),
 (11,
  'TGCAGTGCATCAGGGAACAGAAATCCCCCAGAA

In [6]:
#test_reads = ['TGCCGNCCTGAGCGAAAGCCTGCTGGAAGAAGTAGCTTCGCTGGTGGAATGGCCGGTGGTATTGACGGCGAAATT',
# 'GGGTTNATCCAGACTTCATCCGGCACCGCCTCATGCAGCATCAGCACATTGCTGTAGGTCGAGTGGGTATGCCCT',
# 'CCCAAAGATACGGGCGCAGAAAAGGCCGTCACGCTCAGGTTTGAACGTACGGTAGTTGATGGTTTCCGGCTTTTT',
# 'CTCACGGAGAAAAGCGAAAATAAACGATTGACTCTGAAGCGGGAAAGCGTAATATGCACACCACGCGACGCTGAG',
# 'TGACCGTTTACGCGCCTGCCGTACCGCGCAGGAAGTCCTGGATCTCATTGACCGCACCAACGCGGCAGCTTAAGA']

In [7]:
import sys
sys.path.append('/home/ubuntu/.local/lib/python3.8/site-packages')
sys.path.append('/home/ubuntu/GenASM/build/lib.linux-x86_64-3.8') # For the homebrew GenASM bindings in C. These need to be added to the datanodes too.

import json
import pickle
import base64

import gasm # Homebrew package
import datasketch as ds
import hbase_connector

In [8]:
pool = hbase_connector.HbaseConnection(host="datanode1", port=9090)
# Table is from run_insert.sh
lsh = ds.lsh.MinHashLSH(storage_config={'type': 'hbase', 'basename': b'hbase_salmonella_windowed', 'hbase_pool': pool}, prepickle=True)

In [9]:
def create_hash(string, nlet=3):
    mh2 = ds.MinHash()
    for i in range(nlet, len(string)):
        v = string[i-nlet:i]
        mh2.update(v.encode('utf8'))
    return mh2

In [23]:
%%time
for read in test_reads:
    h = create_hash(read[1], 3)
    lsh.query(h)
    #print(lsh.query(h))

CPU times: user 561 ms, sys: 19.3 ms, total: 580 ms
Wall time: 1.51 s


## GenASM docs:
```python
genasm_aligner(<reference sequence>,
               <query sequence>,
               <edit distance threshold>,
               <match score>,
               <substitution penalty>,
               <gap-opening penalty>,
               <gap-extension penalty>)
```

In [11]:
gasm.__dict__

{'__name__': 'gasmmodule',
 '__doc__': 'gasm Module',
 '__package__': '',
 '__loader__': <_frozen_importlib_external.ExtensionFileLoader at 0x7f7642203b50>,
 '__spec__': ModuleSpec(name='gasm', loader=<_frozen_importlib_external.ExtensionFileLoader object at 0x7f7642203b50>, origin='/home/ubuntu/GenASM/build/lib.linux-x86_64-3.8/gasm.cpython-38-x86_64-linux-gnu.so'),
 'gasmAlignment': <function gasmmodule.gasmAlignment>,
 'version': <function gasmmodule.version>,
 '__file__': '/home/ubuntu/GenASM/build/lib.linux-x86_64-3.8/gasm.cpython-38-x86_64-linux-gnu.so'}

In [12]:
# genasm_aligner(<reference sequence>, <query sequence>, <edit distance threshold>, <match score>, <substitution penalty>, <gap-opening penalty>, <gap-extension penalty>)
gasm.gasmAlignment("AATGTCC", "ATATGTCC", 3, 3, 4, 5, 1)

(1, 15, '1M1I6M', '1M1I6M', '7')

In [27]:
#%%time
debug = []

for read in test_reads:
    h = create_hash(read[1], 3)
    candidates = lsh.query(h)
    #print(candidates)
    if len(candidates) > 0:
        scores = [(cand, read, gasm.gasmAlignment(read[1], cand[1], 30, 30, 4, 5, 1)) for cand in candidates]
        #scores = [(cand[0], cand[1], cand[1]) for cand in candidates]
        scores.sort(key=lambda x: x[2][0])
        scores = list(filter(lambda x: x[2][2] != '', scores))
        #print(scores)
        if len(scores) > 0:
            if scores[0][2][3] != '':
                debug.append(scores[0])

In [28]:
debug #(candidate, read, comparison)

[((4350896,
   'AGACCAGAACCTCACGGAGAAAAGCGAAAATAAACGCTTGACTCTGAAGCGGGAAAGCGTAATATGCACACCCCG'),
  (3,
   'CTCACGGAGAAAAGCGAAAATAAACGATTGACTCTGAAGCGGGAAAGCGTAATATGCACACCACGCGACGCTGAG'),
  (12, 1867, '10I26M1S35M1S2M', '10I65M', '26A35A2')),
 ((4627367,
   'TGACCGTTTACGCGCCTGCCGTACCGCGCAGGAAGTCCTGGATCTCATTGACCGCACCAACGCGGCAGCTTAAGA'),
  (4,
   'TGACCGTTTACGCGCCTGCCGTACCGCGCAGGAAGTCCTGGATCTCATTGACCGCACCAACGCGGCAGCTTAAGA'),
  (0, 2250, '75M', '75M', '75')),
 ((3451544,
   'AACAATACATTAGTTTCCAGCGAATTGCTGCCATCTGCTGGAAAAAAGGGGCCATGAAGGCCCCCTCTTTCTGAA'),
  (5,
   'AACAATACATTAGTTTCCAGCGAATTGCTGCCATCTGCTGGAAAAAAGGGGCCATGAAGGCCCCCTCTTTCTGAA'),
  (0, 2250, '75M', '75M', '75')),
 ((4406970,
   'ACGGATCAGCCTGAGCGCCAGCGTGCGTATGACGCACACGGCGTCGCCTGTGCCGTTGAGCACAATGTGATATGA'),
  (6,
   'ACGGATCAGCCTGAGCGCCAGCGTGCGTATGACGCACACGGCGTCGCCTGTGCCGTTGAGCACAATGTGATATGA'),
  (0, 2250, '75M', '75M', '75')),
 ((2799395,
   'CTCATCGAGCTCACAGCACATGCGCTTTTGTGTACGGGGCTGTCACCCTGTATCGCGCGCCTTTCCAGACGCTTC'),
  (7,
   'CT

In [None]:
table = {}

for v in debug:
    index_read = v[0]
    candidate = v[1]
    comparison = v[2]
    read = v[3]
    
    
    
