In [1]:
# https://spark.apache.org/docs/latest/rdd-programming-guide.html
# https://sib-swiss.github.io/sib-tech-blog/spark/analytics/2017/08/21/bioinformatics-with-spark.html


# Maybe paper: https://www.sciencedirect.com/science/article/pii/S1532046413001007
# Maybe: https://pubmed.ncbi.nlm.nih.gov/21645377/

In [1]:
import findspark
findspark.init()
findspark.find()

'/usr/local/spark/python/pyspark'

In [2]:
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .master("yarn")
         .appName("python-testing")
         .config("spark.executor.instances", 14)
         #.config("spark.dynamicAllocation.enabled", "true")
         .config("spark.executor.cores", 1)
         #.config("spark.dynamicAllocation.minExecutors", 4)
         #.config("spark.dynamicAllocation.maxExecutors", 32)
         #.config("spark.shuffle.service.enabled", "true")
         #.config("spark.shuffle.service.port", 7337)
         .getOrCreate())


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-03-25 17:56:30,695 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-03-25 17:56:35,639 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


KeyboardInterrupt: 

In [4]:
sequence_file = spark.read.text("/files/sequence.fasta")
sequence_file

DataFrame[value: string]

In [5]:
filtered = sequence_file.rdd.filter(lambda x: not x.value.startswith('>'))
filtered = filtered.sample(False, 0.1, 1)
filtered.take(5)

                                                                                

[Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN')]

In [5]:
reduced = filtered.flatMap(lambda x: [c for c in x.value])
reduced.take(5)

                                                                                

['N', 'N', 'N', 'N', 'N']

In [6]:
# from pyspark.streaming import StreamingContext

In [7]:
# stream = StreamingContext(spark.sparkContext, 5)

In [8]:
filtered.zipWithIndex().take(5)

                                                                                

[(Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
  0),
 (Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
  1),
 (Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
  2),
 (Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
  3),
 (Row(value='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
  4)]

In [9]:
filtered.zipWithIndex().map(lambda x: (x[1] // 310, x[0])).reduceByKey(lambda v0, v1: v0 + v1).mapValues(lambda v: str(type(v[1]))).take(1)

                                                                                

[(308, "<class 'str'>")]

In [11]:
filtered.map(lambda x: x.value).take(1)

['NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN']

In [12]:
def zipWithIndex(self):
    """
    Yeeted directly from spark
    """
    starts = [0]
    if self.getNumPartitions() > 1:
        nums = self.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func(k, it):
        for i, v in enumerate(it, starts[k]):
            yield v, i

    return self.mapPartitionsWithIndex(func)

In [13]:
starts = [0]
nums = filtered.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
for i in range(len(nums) - 1):
    starts.append(starts[-1] + nums[i])

def func(partition_index, iterator_v):
    for i, v in enumerate(iterator_v, starts[partition_index]):
        yield i, v
        
values = filtered.mapPartitionsWithIndex(func).mapValues(lambda x: x.value)

                                                                                

In [15]:
def flatMapper(x):
    k, v = x
    for i in range(len(v)):
        yield k * 60 + i, v[i]
        
indexed = values.flatMap(flatMapper)

In [16]:
indexed.map(lambda x: (x[0] // 155, x[1])).reduceByKey(lambda a0, a1: a0 + a1).take(2)

                                                                                

[(368005,
  'GATAGATGATATGTACTGATAGACAATAGATTAATTATAGAAAGAAAGAAAGAAAAAAGAGATAGGGAGAGAGAAGGAAGGAAGAGAGGAACGAAAAAAGGAAGGGAAGGAGGAAGGAAGAGAGGGAGGCAGAAAAGAGAAGGAAAATGTGTGAA'),
 (368016,
  'ACAAATCTGTACATATTGTCATGGGAGCATCTGGCAGTAGTCAGTGCTTTATGGTCATTGGTTATTAAGTTCAATCATGTCTCACATAACGACTGCATATATGACGGTGGTCCCATAAGATTATCATACCACAGATTTACCATACATTTTCTATG')]

In [17]:
indexed.flatMap??

[0;31mSignature:[0m [0mindexed[0m[0;34m.[0m[0mflatMap[0m[0;34m([0m[0mf[0m[0;34m,[0m [0mpreservesPartitioning[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mflatMap[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mf[0m[0;34m,[0m [0mpreservesPartitioning[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""[0m
[0;34m        Return a new RDD by first applying a function to all elements of this[0m
[0;34m        RDD, and then flattening the results.[0m
[0;34m[0m
[0;34m        Examples[0m
[0;34m        --------[0m
[0;34m        >>> rdd = sc.parallelize([2, 3, 4])[0m
[0;34m        >>> sorted(rdd.flatMap(lambda x: range(1, x)).collect())[0m
[0;34m        [1, 1, 1, 2, 2, 3][0m
[0;34m        >>> sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect())[0m
[0;34m        [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)][0m
[0;34m        """[0m[0;34m[0

In [18]:
def testMap(f):
    def func(s, iterator):
            return chain.from_iterable(map(fail_on_stopiteration(f), iterator))
    return self.mapPartitionsWithIndex(func, preservesPartitioning)

In [19]:
#import itertools
#itertools.fail_on_stopiteration

In [20]:
import pyspark

def printTest(self):
    print("hallo")
    
pyspark.rdd.RDD.printTest = printTest

In [21]:
values.printTest()

hallo


In [22]:
def getPartitionItemStartIndex(self):
    starts = [0]
    nums = self.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
    for i in range(len(nums) - 1):
        starts.append(starts[-1] + nums[i])
    return starts

pyspark.rdd.RDD.getPartitionItemStartIndex = getPartitionItemStartIndex

In [23]:
values.getPartitionItemStartIndex()

                                                                                

[0,
 95065,
 190130,
 285196,
 380261,
 475327,
 570392,
 665458,
 760523,
 855589,
 950654]

start_indexes = values.getPartitionItemStartIndex()

def customPartitioningFunction(index, iterator):
    i = 0
    start_index = 0
    start_key = None
    
    
    values = []
    
    
    for item in iterator:
        item_index = start_indexes[index] + i
        
        if i == 0:
            start_key = item_index // 155

        for char_offset in range(len(item)):
            char_index = item_index + char_offset
            if char_index // 155 != start_key:
                
            
        
        i+=1
        
        
values.mapPartitionsWithIndex(customPartitioningFunction).take(5)


ATCGTGCGCTAGCATCATGCTAGCTACGTAGCTAGCTAGCTAGCTAGCATCGTAGCTGC
|.......... cluster 1......||..........cluster2...........|
|..item0.||.item1...||item2||item0.||item1..||item2||item3|

|.. Cluster offser: 0......||.. Cluster offset: 29 .......|


Intermediate:
|........||........||......||||........||........||.......|
We can do this, as we know the cluster offset, item offset, and base offset.
We can then simply compute which index the items should be in, and give it as the key. We then reduce keys by simply appending all items. (We have to make sure it happens in the correct order)

Target size: 10
|.........||........||........||........||........||......|


In [24]:
import logging
logger = logging.getLogger("pyspark")

In [25]:
import io

def createPartitioner(values, item_size, new_item_size):
    start_indexes = values.getPartitionItemStartIndex()
    
    def partitioner(cluster_index, item_iterator):
        cluster_offset = start_indexes[cluster_index]
        print("IT*S DEFINITIVELY RINNNN===============================------------------------")
        print("Got 1 =================================================")
        
        builder = io.StringIO()
        values = []
        
        start_index = cluster_offset
        current_index = start_index // new_item_size
        
        print("Got 2 =================================================")
        
        for i, item in enumerate(item_iterator):
            item_offset = cluster_offset + i * item_size
            
            for j, char in enumerate(item):
                
                print("Got 3 (edited) =================================================")
                print(f"item_offset: {type(item_offset)}, j: {type(j)}, newitemsize: {type(new_item_size)}, ci: {type(current_index)}")
                
                char_offset = item_offset + j
                
                if char_offset // new_item_size != current_index:
                    print("Got 4 =================================================")
                    values.append((current_index, start_index, builder.getvalue()))
                    builder.truncate(0)
                    builder.seek(0)
                    start_index = char_offset
                    current_index = start_index // new_item_size
                    print("Got 5 =================================================")
                    
                builder.write(char)
                    
        print("Got 6 =================================================")
        remains = builder.getvalue()
        if len(remains) > 0:
            values.append((current_index, start_index, builder.getvalue()))
            
        print("Got 7 =================================================")
        print(f"Values: {values}")
        return values
            
    return partitioner

In [26]:
values.take(5)

[(0, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (1, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (2, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (3, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (4, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN')]

In [27]:
partitioner = createPartitioner(values, 60, 1000)
resized = values.mapPartitionsWithIndex(partitioner)
resized.take(5)

2022-03-24 11:08:24,115 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 15.0 (TID 85) (datanode1 executor 5): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1646237260817_0017/container_1646237260817_0017_01_000006/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1646237260817_0017/container_1646237260817_0017_01_000006/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/tmp/ipykernel_196681/2541553547.py", line 38, in partitioner
TypeError: string argument expected, got 'int'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.s

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 4 times, most recent failure: Lost task 0.3 in stage 15.0 (TID 88) (datanode1 executor 5): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1646237260817_0017/container_1646237260817_0017_01_000006/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1646237260817_0017/container_1646237260817_0017_01_000006/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/tmp/ipykernel_196681/2541553547.py", line 38, in partitioner
TypeError: string argument expected, got 'int'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1646237260817_0017/container_1646237260817_0017_01_000006/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/tmp/hadoop-ubuntu/nm-local-dir/usercache/ubuntu/appcache/application_1646237260817_0017/container_1646237260817_0017_01_000006/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
  File "/tmp/ipykernel_196681/2541553547.py", line 38, in partitioner
TypeError: string argument expected, got 'int'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [28]:
b = io.StringIO()
test = "hello"
for i, v in enumerate(test):
    b.write(v)

In [29]:
b.getvalue()

'hello'

In [30]:
start_indexes = values.getPartitionItemStartIndex()
values.take(5)

                                                                                

[(0, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (1, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (2, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (3, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (4, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN')]

In [31]:
def prefixWithIndex(self):
    starts = [0]
    nums = filtered.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
    for i in range(len(nums) - 1):
        starts.append(starts[-1] + nums[i])

    def func(partition_index, iterator_v):
        for i, v in enumerate(iterator_v, starts[partition_index]):
            yield i, v

    return filtered.mapPartitionsWithIndex(func).mapValues(lambda x: x.value)

pyspark.rdd.RDD.prefixWithIndex = prefixWithIndex

In [32]:
prefixed = filtered.prefixWithIndex()
prefixed.take(5)

                                                                                

[(0, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (1, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (2, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (3, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (4, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN')]

In [33]:
atomized = prefixed.flatMap(lambda x: [(k + x[0], v) for k, v in enumerate(x[1])])
atomized.take(5)

[(0, 'N'), (1, 'N'), (2, 'N'), (3, 'N'), (4, 'N')]

In [34]:
discritized = atomized.map(lambda x: (x[0] // 16, x[1]))
discritized.take(3)

[(0, 'N'), (0, 'N'), (0, 'N')]

In [35]:
unified = discritized.reduceByKey(lambda a0, a1: f"{a0}{a1}")
unified.take(1)

                                                                                

[(23771,
  'TTCCATGCCTTGCCTATGCGGGCCCAGTTTTGAAAACTAGAGATGCTGCCTTGGCTATGTAATTCTGCACCAAACACCTGACATATTTCTGTTCTATAGAAAGATGCTTGGACCTTGCCCTGTGACTCCTTTCTGGCCAAAGTGATGAAACTCTTTGCCCAGCTCATATGCTATCTCTCACAGAACAAAGGTTGTAGGTGACAGTGTCATCACAGGGCCAACCATTAGGATTTCTCACACTCAGATGCCACAGTTGAGATGGTTATGTAAACCCAGTTACTACAGTCATTGGTCCAATCATATAAATTCATCAAATTGTGACACACATAATACAGCCCTCAGTTATTGACTCTCATACATCACACCTTAGAGGCTGCCCATGCAAGGGTCCACAGGAGATGTTGATTCTATGGACTTTCTTGCATTACTTTCCTGTATGAACCTAGGTGATGCAATTGTATCACTGAACCCAATGCCATTGTGCTATACTCTGCCTACAGAGGGCTTCTCCTGCATAGTCCTAGCTGATATGATTCTAGGCTCAACAGCAAGGTGATGTATTGCTGAGACAGGAGGCAATGTGATTGCCTGGGCTTTTAGACTGTCCTCTTCTGCCAGCACATAGGTGATGTATCTTTAGGCCCATCAAGGACTGTGATATGTCCCCCTGCCATCTAGGGTCTTTTTTACTTTGTGGACGTACAATATGTAATGCTGAACCCACCATCTGGGACTGTGATGTATCAGGCCCTGCATACATGACTCTTCTGCCTGGATGTGTTATGTGACTCTTCACTGGACCAAGCACTTTAGTATTCTGATACGCCATGCTCACAGTGGCTTTTCTGCCTGCACAGGTGGTGTGTCTCACTCAGCACCCAAACATATCCCTGGAATATTATAATATCCACAGGTAATACCTCAGGTAAGATATGGTATCGTGATATGAACCAATCCGGT')]

In [36]:
atomized.reduceByKey(lambda a0, a1: f"{a0}{a1}").take(2)

                                                                                

[(570394, 'TAAGATAAATGGCACCGTATAACCTGCAGATTGTATTGGTCTTAGTAAACTCCTCGTCCT'),
 (570405, 'TTTCTGTACATTGACCATATCCAGAATGGTTGTTCCATTCCTTGATTTACTCTCACGGCC')]

In [37]:
atomized.filter(lambda x: x[0] == 915008).collect()

                                                                                

[(915008, 'A'),
 (915008, 'T'),
 (915008, 'T'),
 (915008, 'T'),
 (915008, 'T'),
 (915008, 'G'),
 (915008, 'C'),
 (915008, 'C'),
 (915008, 'A'),
 (915008, 'A'),
 (915008, 'G'),
 (915008, 'T'),
 (915008, 'A'),
 (915008, 'T'),
 (915008, 'A'),
 (915008, 'C'),
 (915008, 'A'),
 (915008, 'C'),
 (915008, 'G'),
 (915008, 'G'),
 (915008, 'A'),
 (915008, 'A'),
 (915008, 'G'),
 (915008, 'G'),
 (915008, 'T'),
 (915008, 'C'),
 (915008, 'C'),
 (915008, 'A'),
 (915008, 'G'),
 (915008, 'G'),
 (915008, 'A'),
 (915008, 'G'),
 (915008, 'C'),
 (915008, 'A'),
 (915008, 'A'),
 (915008, 'G'),
 (915008, 'T'),
 (915008, 'T'),
 (915008, 'A'),
 (915008, 'T'),
 (915008, 'G'),
 (915008, 'A'),
 (915008, 'A'),
 (915008, 'A'),
 (915008, 'A'),
 (915008, 'G'),
 (915008, 'T'),
 (915008, 'C'),
 (915008, 'G'),
 (915008, 'T'),
 (915008, 'G'),
 (915008, 'G'),
 (915008, 'C'),
 (915008, 'A'),
 (915008, 'C'),
 (915008, 'A'),
 (915008, 'G'),
 (915008, 'T'),
 (915008, 'G'),
 (915008, 'A')]

In [47]:
values.groupByKey().map(lambda x: (x[0], sum([1 for e in x[1]]))).filter(lambda x: x[1] > 0).take(5)

                                                                                

[(95073, 1), (95084, 1), (95095, 1), (95106, 1), (95117, 1)]

In [54]:
kr = values.flatMap(lambda x: [(x[0]*60 + i, (v, 1)) for i, v in enumerate(x[1])])#.reduceByKey(lambda a0, a1: (a0[0], a0[1] + a1[1]))
kr.take(5)

TypeError: '<' not supported between instances of 'int' and 'JavaList'

In [13]:
keyed = filtered.map(lambda x: x.value).zipWithIndex().map(lambda x: (x[1], x[0]))
keyed.take(5)

                                                                                

[(0, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (1, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (2, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (3, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (4, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN')]

In [18]:
atomized = keyed.flatMap(lambda x: [(i + x[0] * 60, v) for i, v in enumerate(x[1])])
atomized.take(5)

[(0, 'N'), (1, 'N'), (2, 'N'), (3, 'N'), (4, 'N')]

In [24]:
decimated = atomized.map(lambda x: (x[0] // 250, (x[0], x[1])))
decimated.take(2)

[(0, (0, 'N')), (0, (1, 'N'))]

In [27]:
grouped = decimated.groupByKey()
grouped.take(2)

                                                                                

[(627, <pyspark.resultiterable.ResultIterable at 0x7fe249b69d00>),
 (2068, <pyspark.resultiterable.ResultIterable at 0x7fe249b69f10>)]

In [31]:
91630 * 250

22907500

In [32]:
def mapper(x):
    values = list(x[1])
    chars = [v[1] for v in values]
    
    return (x[0], "".join(chars))

mapped = grouped.map(mapper)
mapped.take(2)

                                                                                

[(207955,
  'GCCTCCAGCCTCATTACATTCTCAAACACAGACAAGCCCCGTCTTCCTCTAAATAAAATTGCCTTCAAATTCAATTTAAAACTTACCACAGAGGCACTCCTTTGACACTCCTCCCCCTAGACCTTTCATTCACGCCACTCCTCTTCCTAGCTCGGGCTTCTCTGTATCTGAGCCCCATATTAAATTGCAAATCTGAGCAAAAAACAGCATTCACTGAGACACACTTTCCCCAGTTTTCTATTACTGAGAC'),
 (206547,
  'CTGGGAGCAACTGAGGAGGTGACATTGATCTTGGGAATTCTATTTTTTATTTTCACCCTCTCTTTGTGATCGATCACTAGAGAACAGTTACCACGTGCCTGGCTGCAGTGAGTTTTAGGATTCCAGGATCCCAGGTGTAGGAGAATGTACAAAACATTAGGTCACTTGTTTCCTTGGGTGAGAGACAGAGATACCTGTAGGGCCAGCCCCAAACCCCACCTGGTCCTCTCAAGCATTTGTCCCACTCAAC')]

In [49]:
import pyspark.rdd

def resizeByIndex(self, lambda_selector):
    keyed = self.zipWithIndex().map(lambda x: (x[1], x[0]))
    
    sizes = self.map(lambda x: len(x[0]))
    elementSize = sizes.first()
    
    lastElementSize = keyed.sortByKey(False).map(lambda x: len(x[1])).first()
    elementCount = sizes.filter(lambda x: x != elementSize).count()
    
    print(f"ec: {elementCount}, les: {lastElementSize}, es: {elementSize}")
    
    if  not (elementCount == 0 or (elementCount == 1 and lastElementSize != elementSize)):
        raise ValueError(f"Not all elements had equal length. Elements must have equal length for this method to work.")
    
    atomized = keyed.flatMap(lambda x: [(i + x[0] * elementSize, v) for i, v in enumerate(x[1])])
    decimated = atomized.map(lambda x: (lambda_selector(x[0]), (x[0], x[1])))
    grouped = decimated.groupByKey()
    
    def mapper(x):
        values = list(x[1])
        chars = [v[1] for v in values]

        return (x[0], chars)

    mapped = grouped.map(mapper)
    return mapped

pyspark.rdd.RDD.resizeByIndex = resizeByIndex

In [54]:
filtered.map(lambda x: x.value).resizeByIndex(lambda x: x // 20).take(5)

                                                                                

ec: 0, les: 16, es: 1


                                                                                

[(26477,
  ['A',
   'A',
   'T',
   'C',
   'T',
   'A',
   'G',
   'C',
   'A',
   'T',
   'G',
   'C',
   'T',
   'C',
   'A',
   'C',
   'T',
   'G',
   'G',
   'A',
   'G',
   'T',
   'A',
   'T',
   'G',
   'T',
   'C',
   'A',
   'T',
   'T',
   'G',
   'C',
   'T',
   'A',
   'A',
   'G',
   'T',
   'G',
   'C',
   'T',
   'T',
   'T',
   'C',
   'T',
   'C',
   'C',
   'T',
   'T',
   'C',
   'C',
   'C',
   'T',
   'G',
   'T',
   'G',
   'T',
   'A',
   'C',
   'A',
   'G',
   'G',
   'A',
   'G',
   'C',
   'T',
   'G',
   'A',
   'A',
   'A',
   'T',
   'C',
   'T',
   'C',
   'C',
   'A',
   'A',
   'A',
   'C',
   'T',
   'G',
   'T',
   'G',
   'A',
   'A',
   'A',
   'A',
   'C',
   'C',
   'C',
   'T',
   'G',
   'T',
   'G',
   'G',
   'T',
   'C',
   'T',
   'T',
   'C',
   'C',
   'C',
   'G',
   'G',
   'G',
   'G',
   'T',
   'C',
   'T',
   'G',
   'G',
   'T',
   'T',
   'G',
   'A',
   'T',
   'T',
   'T',
   'C',
   'A',
   'T',
   'T',
   'T',
   'T',
   'T',

## Current SOTA

In [6]:
raw = filtered.map(lambda x: x.value)
raw.take(5)

                                                                                

['NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN',
 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN',
 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN',
 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN',
 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN']

In [7]:
indexed = raw.zipWithIndex().map(lambda x: (x[1], x[0]))
indexed.take(5)

                                                                                

[(0, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (1, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (2, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (3, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'),
 (4, 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN')]

In [8]:
atomized = indexed.flatMap(lambda x: [(x[0] * 60 + i, v) for i, v in enumerate(x[1])])
atomized.take(5)

[(0, 'N'), (1, 'N'), (2, 'N'), (3, 'N'), (4, 'N')]

In [9]:
keyed = atomized.map(lambda x: (x[0] // 10, (x[0], x[1])))
keyed.take(5)

[(0, (0, 'N')), (0, (1, 'N')), (0, (2, 'N')), (0, (3, 'N')), (0, (4, 'N'))]

In [10]:
grouped = keyed.groupByKey()
grouped.take(5)

                                                                                

[(401489, <pyspark.resultiterable.ResultIterable at 0x7f189e1c4820>),
 (401500, <pyspark.resultiterable.ResultIterable at 0x7f189e1c4370>),
 (401511, <pyspark.resultiterable.ResultIterable at 0x7f189e1c4670>),
 (401522, <pyspark.resultiterable.ResultIterable at 0x7f189e1c46d0>),
 (401533, <pyspark.resultiterable.ResultIterable at 0x7f189e1c4490>)]

In [11]:
grouped.mapValues(list).take(1)

                                                                                

[(114994,
  [(1149940, 'A'),
   (1149941, 'G'),
   (1149942, 'A'),
   (1149943, 'C'),
   (1149944, 'A'),
   (1149945, 'T'),
   (1149946, 'T'),
   (1149947, 'T'),
   (1149948, 'G'),
   (1149949, 'T')])]

In [12]:
stringified = grouped.mapValues(lambda x: [i[1] for i in x]).mapValues(lambda x: "".join(x))
stringified.take(5)

                                                                                

[(457908, 'CTCCGCCTCC'),
 (457919, 'ACTCACCCGC'),
 (457930, 'AAATGCCAGC'),
 (457941, 'ATCCAGGCAT'),
 (457952, 'ACCCAACTAA')]

-------------------------------------------
|------------------||---------------------|
|----||------------------||---------------|
++++++++++++++      +++++++++++++++++
      ++++++++++++++      +++++++++++

In [12]:
def window(x):
    s = x[1]
    
    windows = []
    window_size = 4
    for i in range(len(s) - window_size):
        windows.append((x[0] + i, s[i:i+window_size]))
        
    return windows

stringified.flatMap(window).take(5)

[(457908, 'CTCC'),
 (457909, 'TCCG'),
 (457910, 'CCGC'),
 (457911, 'CGCC'),
 (457912, 'GCCT')]

In [13]:
def preprocess(values, segmentLength):
    raw = values.map(lambda x: x.value)
    indexed = raw.zipWithIndex().map(lambda x: (x[1], x[0]))
    atomized = indexed.flatMap(lambda x: [(x[0] * segmentLength + i, v) for i, v in enumerate(x[1])])
    return atomized

def resize(atomized, newSize, offset = 0):
    keyed = atomized.map(lambda x: (((x[0] + offset) // newSize) * newSize - offset, (x[0], x[1])))
    grouped = keyed.groupByKey()
    stringified = grouped.mapValues(lambda x: [i[1] for i in x]).mapValues(lambda x: "".join(x))
    return stringified

def slidingWindow(stringified, window_size):
    def window(x):
        s = x[1]
        windows = []
        for i in range(len(s) - window_size):
            windows.append((x[0] + i, s[i:i+window_size]))
            
        return windows

    return stringified.flatMap(window)

In [14]:
pre = preprocess(filtered, 60)
pre.take(5)

                                                                                

[(0, 'N'), (1, 'N'), (2, 'N'), (3, 'N'), (4, 'N')]

In [29]:
pregrouped = pre.groupByKey()
pregrouped.take(5)

                                                                                

[(4014846, <pyspark.resultiterable.ResultIterable at 0x7f189e0dab50>),
 (4014857, <pyspark.resultiterable.ResultIterable at 0x7f189e0da2b0>),
 (4014868, <pyspark.resultiterable.ResultIterable at 0x7f189e0dabb0>),
 (4014879, <pyspark.resultiterable.ResultIterable at 0x7f189e0dae50>),
 (4014890, <pyspark.resultiterable.ResultIterable at 0x7f189e0dad90>)]

In [30]:
# Validated that pre-filtering (atomizing) is correct
filt = pregrouped.map(lambda x: (x[0], len(list(x[1])))).filter(lambda x: x[1] != 1)
filt.take(5)

                                                                                

[]

In [15]:
window_offset_0 = resize(pre, 1024, 0)
window_0 = slidingWindow(window_offset_0, 155)

window_offset_1 = resize(pre, 1024, 155)
window_1 = slidingWindow(window_offset_1, 155)

windowed = window_0.union(window_1)
windowed.take(1)

                                                                                

[(4021248,
  'GCAGAACTTTGGAAGTAGAAAATTTTAAACTATCGAATCATGCACCTTAAGGAACTACAGAGCAAGAACAAATGAATAACTATACACTAACAAACTGGAAAACCTAGGGGAAATGGATAAATTCCTGGATACTAAGACCAATAATGATTCAAATA')]

In [16]:
windowed.distinct().count()

                                                                                

5888886

In [17]:
pre.lookup(588259)

                                                                                

['G']

In [18]:
windowed.lookup(58860)

                                                                                

['GTCACGGGTTCCGCTGAGCTCAGGGCAGCTGCGGCCGGGCGGGGCTGGGGCGGAAGTGAGGCTGGAGCTGCCGGGGGGCACCGTCCTGGGGACAGCAGGGCCTGGCACCCCCGCCCAGCAACAAAAAATTAAAAAATTAGCAGGGCAGGGTGGTG',
 'GTCACGGGTTCCGCTGAGCTCAGGGCAGCTGCGGCCGGGCGGGGCTGGGGCGGAAGTGAGGCTGGAGCTGCCGGGGGGCACCGTCCTGGGGACAGCAGGGCCTGGCACCCCCGCCCAGCAACAAAAAATTAAAAAATTAGCAGGGCAGGGTGGTG']

In [19]:
wg = windowed.groupByKey()
wg.take(5)

                                                                                

[(3446784, <pyspark.resultiterable.ResultIterable at 0x7f189e0de4f0>),
 (3446806, <pyspark.resultiterable.ResultIterable at 0x7f189e0de5e0>),
 (3446828, <pyspark.resultiterable.ResultIterable at 0x7f189e0de640>),
 (3446850, <pyspark.resultiterable.ResultIterable at 0x7f189e0de700>),
 (3446872, <pyspark.resultiterable.ResultIterable at 0x7f189e0de730>)]

In [27]:
windowed.distinct().lookup(90498)

                                                                                

['CAGGCGCCCACCACCACACCCAGCTTATTTTTGTATTTTTAGGATACACTTTTGGTTCCAGAAGTAACACTTGACACTCTTCTTGCAGAGAGGAAGAAAATGACGTTCCCCTCGTGGCGCTGGACACCCCCACGTCCACACACTTCCCTCGGTCA']

In [24]:
windowed.distinct().lookup(4014284)

                                                                                

['CACCTAGGAATAAACGAAAGAAATGAAAGATCTCTAGCAACCAAAGAAAAATGGACAAATGGGCACTCACTCTGTGCCTGGCCTTGAGCTGGGAGATCCTAGAAGCACAGGCAGAAAAGTCAAGGCCAGGATCAGAAAGGCCCAGGCAGAGGGGT',
 'TGCCCAGCCAATCTCCAGCTCTGCACCTTTGTCAAGTCACTGACCTCTGTGAGGATCTGACGGAAAAAAAAATGAACCCGTGTAGCTGGAATTACAGGCATGCGCCACCACGCCCAGCTAATTTTTGTACTCTTACAGTGGCTGTAGGCGCGCAC']

In [21]:
re_atomized = windowed.distinct().map(lambda x: (x[0], x[1][0]))
re_atomized.take(5)

                                                                                

[(2874373, 'T'),
 (2874382, 'C'),
 (2874415, 'T'),
 (2874428, 'C'),
 (2874444, 'T')]

In [23]:
re_atomized.subtract(pre).take(50)

                                                                                

[(4014284, 'C'),
 (4014284, 'C'),
 (5160782, 'T'),
 (4578789, 'C'),
 (5160151, 'A'),
 (1149271, 'G'),
 (4014729, 'G'),
 (4014729, 'G'),
 (5160208, 'G'),
 (4014469, 'C'),
 (4014469, 'C'),
 (5160676, 'G'),
 (4014799, 'G'),
 (4014807, 'T'),
 (4014885, 'G'),
 (4578449, 'G'),
 (1148992, 'C'),
 (1149102, 'T'),
 (1149253, 'C'),
 (4014253, 'G'),
 (4014607, 'A'),
 (5160040, 'G'),
 (1724356, 'A'),
 (1724838, 'A'),
 (4014202, 'G'),
 (1149145, 'T'),
 (4014301, 'A'),
 (1724380, 'G'),
 (5160531, 'A'),
 (5160261, 'T'),
 (4578297, 'G'),
 (5160266, 'G'),
 (1725048, 'A'),
 (5160672, 'G'),
 (1149074, 'T'),
 (4014709, 'A'),
 (1724432, 'G'),
 (5159815, 'A'),
 (5159942, 'G'),
 (5159942, 'G'),
 (4014214, 'T'),
 (1149538, 'A'),
 (5160369, 'C'),
 (5160369, 'C'),
 (4014710, 'A'),
 (4014710, 'A'),
 (1724454, 'A'),
 (1724578, 'G'),
 (1725110, 'C'),
 (5160188, 'T')]

In [65]:
windowed.groupByKey().mapValues(lambda x: len(list(x))).map(lambda x: (x[1], x[0])).filter(lambda x: x[0] == 1738).sortByKey(True).take(5)

                                                                                

[]

In [34]:
gr = windowed.groupByKey().mapValues(lambda x: len(list(x))).distinct()
gr.take(5)

                                                                                

[(23570, 2), (23658, 2), (23702, 2), (23790, 2), (23856, 2)]

In [23]:
gr.map(lambda x: (x[1], x[0])).sortByKey(True).take(5)

[(1, 0), (1, 6758), (2, 1), (2, 6757), (3, 6756)]

In [30]:
windowed.map(lambda x: (x[0], x[1][0])).take(5)

[(2244, 'A'), (2245, 'C'), (2246, 'C'), (2247, 'T'), (2248, 'A')]

In [39]:
atomized.sortByKey(False).take(20)

                                                                                

[(5874059, 'N'),
 (5874058, 'N'),
 (5874057, 'N'),
 (5874056, 'N'),
 (5874055, 'N'),
 (5874054, 'N'),
 (5874053, 'N'),
 (5874052, 'N'),
 (5874051, 'N'),
 (5874050, 'N'),
 (5874049, 'N'),
 (5874048, 'N'),
 (5874047, 'N'),
 (5874046, 'N'),
 (5874045, 'N'),
 (5874044, 'N'),
 (5874043, 'N'),
 (5874042, 'N'),
 (5874041, 'N'),
 (5874040, 'N')]

In [38]:
testing = windowed.map(lambda x: (x[0], x[1][0]))
testing.sortByKey(False).take(20)

                                                                                

[(6758, 'N'),
 (6757, 'G'),
 (6757, 'N'),
 (6756, 'T'),
 (6756, 'A'),
 (6756, 'N'),
 (6755, 'G'),
 (6755, 'T'),
 (6755, 'C'),
 (6755, 'N'),
 (6754, 'G'),
 (6754, 'A'),
 (6754, 'T'),
 (6754, 'G'),
 (6754, 'N'),
 (6753, 'A'),
 (6753, 'T'),
 (6753, 'C'),
 (6753, 'G'),
 (6753, 'G')]

In [40]:
filtered.count()

                                                                                

97901