In [1]:
# findspark.init() is called before `import pyspark` on purpose: it is what
# makes the Spark installation at the given path importable from this kernel.
# NOTE(review): the Spark home below is a hard-coded absolute path.
import findspark
findspark.init("/opt/spark/spark-2.4.0-bin-hadoop2.7/")

import pyspark


# Connect to a standalone Spark master under the application name "hse".
# NOTE(review): the master address is hard-coded — consider reading it from
# configuration or an environment variable.
sc = pyspark.SparkContext(
    appName="hse",
    master="spark://95.216.186.128:7414"
)


In [2]:
# Smoke test for the cluster connection: 0 + 1 + ... + 99 must equal 4950.
sc.parallelize(range(100)).sum()

4950

In [3]:
# NOTE(review): HiveContext has been deprecated since Spark 2.0 in favour of
# SparkSession.builder.enableHiveSupport().getOrCreate(). `sql` is also not
# used by any cell visible here — confirm it is still needed.
sql = pyspark.sql.HiveContext(sc)

In [4]:
# Read the Alexa top-1M CSV and split every line on commas into a tuple of
# strings; judging by the sample output the rows are (rank, domain).
# NOTE(review): the input location is a hard-coded absolute path.
alexa = sc.textFile("/root/data/top-1m.csv").map(lambda x: tuple(x.split(",")))

In [5]:
# Peek at the first ten parsed rows.
alexa.take(10)

[('1', 'google.com'),
 ('2', 'youtube.com'),
 ('3', 'facebook.com'),
 ('4', 'baidu.com'),
 ('5', 'wikipedia.org'),
 ('6', 'qq.com'),
 ('7', 'taobao.com'),
 ('8', 'yahoo.com'),
 ('9', 'tmall.com'),
 ('10', 'amazon.com')]

In [6]:
# Approach 1 (join-based): key every row by its zero-based position.
# zipWithIndex() yields (row, index); the [::-1] flips that to (index, row).
original = alexa.zipWithIndex().map(lambda x: x[::-1])
# Same rows with every key shifted down by one, so that key k here holds the
# row found at position k + 1 in `original`.
minus_one = original.map(lambda x: (x[0] -1 , x[1]))

In [7]:
# Inspect the (index, row) layout produced above.
original.take(5)

[(0, ('1', 'google.com')),
 (1, ('2', 'youtube.com')),
 (2, ('3', 'facebook.com')),
 (3, ('4', 'baidu.com')),
 (4, ('5', 'wikipedia.org'))]

In [20]:
# NOTE(review): the recorded output of this cell shows bare domain strings as
# values, but `minus_one` as defined above keeps the whole (rank, domain) tuple
# (the join output further down confirms that). The cell also carries an
# out-of-sequence execution count, so the output looks stale — re-run it.
minus_one.take(5)

[(-1, 'google.com'),
 (0, 'youtube.com'),
 (1, 'facebook.com'),
 (2, 'baidu.com'),
 (3, 'wikipedia.org')]

In [8]:
# Joining on the shared key puts each row next to its successor:
# key k -> (row at position k, row at position k + 1).
# As the output shows, the joined rows do not come back in file order.
original.join(minus_one).take(5)

[(524288, (('524289', 'polypet.com.sg'), ('524290', 'praclox.nl'))),
 (0, (('1', 'google.com'), ('2', 'youtube.com'))),
 (524292, (('524293', 'rctankwarfare.co.uk'), ('524294', 'riool.info'))),
 (436452, (('436453', 'cboe.org'), ('436454', 'ccbox24.com'))),
 (524296, (('524297', 'schmiedmann.nl'), ('524298', 'seat.nl')))]

In [9]:
def pairs(part):
    """Yield overlapping (previous, current) pairs of consecutive items.

    Intended for use with ``RDD.mapPartitions``: it sees the rows of a single
    partition, so pairs that straddle two partitions are not produced here.

    Args:
        part: iterator (or any iterable) over the items of one partition.

    Yields:
        Tuples ``(prev, element)`` for every two neighbouring items. Nothing
        is yielded when there are fewer than two items.
    """
    items = iter(part)
    try:
        prev = next(items)
    except StopIteration:
        # Empty partition. Without this guard the StopIteration would escape
        # the generator body, which Python 3.7+ turns into a RuntimeError.
        return
    for element in items:
        yield prev, element
        prev = element

# Local sanity check on a plain iterator: ten items give nine overlapping pairs.
list(pairs(iter(range(10))))

[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9)]

In [10]:
# Number of rows in each partition (one count per partition).
# Note: list(x) materialises the whole partition just to measure its length.
alexa.mapPartitions(lambda x: [len(list(x))]).collect()

[519261, 480739]

In [11]:
# Approach 2: build the neighbour pairs inside each partition, without a join.
alexa.mapPartitions(pairs).take(10)

[(('1', 'google.com'), ('2', 'youtube.com')),
 (('2', 'youtube.com'), ('3', 'facebook.com')),
 (('3', 'facebook.com'), ('4', 'baidu.com')),
 (('4', 'baidu.com'), ('5', 'wikipedia.org')),
 (('5', 'wikipedia.org'), ('6', 'qq.com')),
 (('6', 'qq.com'), ('7', 'taobao.com')),
 (('7', 'taobao.com'), ('8', 'yahoo.com')),
 (('8', 'yahoo.com'), ('9', 'tmall.com')),
 (('9', 'tmall.com'), ('10', 'amazon.com')),
 (('10', 'amazon.com'), ('11', 'twitter.com'))]

In [12]:
# Each partition of n rows yields n - 1 pairs, so with the partition sizes
# shown above the total falls short of the full set of consecutive pairs:
# the pair that straddles the partition boundary is missing.
alexa_pairs = alexa.mapPartitions(pairs)
alexa_pairs.count()

999998

In [13]:
def first_last(part):
    """Yield the first and the last item of a partition.

    Args:
        part: iterator (or any iterable) over the items of one partition.

    Yields:
        The first item, then the last item. A partition holding a single item
        yields that item twice, because it is both the first and the last one;
        this keeps the "last of one partition, first of the next" pairing of
        boundary items aligned. An empty partition yields nothing.
    """
    items = iter(part)
    try:
        first = next(items)
    except StopIteration:
        # Empty partition: nothing to report (and no StopIteration leaking
        # out of the generator, which Python 3.7+ turns into a RuntimeError).
        return
    yield first
    # Start from `first` so a one-item partition still has a defined last item.
    last = first
    for last in items:
        pass
    yield last

def drop_edge_cases(part):
    """Return the items of ``part`` as a list, without the first and last one.

    The input is fully materialised. Inputs with two or fewer items give an
    empty list.
    """
    items = list(part)
    del items[:1]    # drop the leading item, if any
    del items[-1:]   # drop the trailing item, if any
    return items

# Local sanity check: the first (0) and last (4) items are removed.
drop_edge_cases(range(5))

[1, 2, 3]

In [14]:
def slicepart(part):
    """Yield consecutive, non-overlapping pairs: (x0, x1), (x2, x3), ...

    Args:
        part: iterator (or any iterable) over the items to pair up.

    Yields:
        2-tuples of neighbouring items. A trailing item without a partner is
        dropped.
    """
    items = iter(part)
    # Zipping an iterator with itself pulls two items per step. Unlike a
    # `prev is None` sentinel, this also pairs correctly when an item is None.
    for pair in zip(items, items):
        yield pair

# Local sanity check: ten items give five non-overlapping pairs.
list(slicepart(iter(range(10))))

[(0, 1), (2, 3), (4, 5), (6, 7), (8, 9)]

In [15]:
# Recover the pairs that straddle partition boundaries, which the
# per-partition `pairs` pass cannot see:
#   1. first_last      -> keep only the first and last row of every partition
#   2. coalesce(1)     -> gather those boundary rows into a single partition
#   3. drop_edge_cases -> discard the very first and very last row overall,
#                         which have no neighbour on that side
#   4. slicepart       -> pair the remainder two by two, i.e.
#                         (last of partition k, first of partition k + 1)
# NOTE(review): this assumes coalesce(1) keeps the boundary rows in partition
# order — confirm.
missing_pairs = (
    alexa
    .mapPartitions(first_last)
    .coalesce(1)
    .mapPartitions(drop_edge_cases)
    .mapPartitions(slicepart)
)

# Only one row per partition boundary, so collecting to the driver is cheap.
missing_pairs.collect()

[(('519261', 'radiosuperpopayan.com'), ('519262', 'saladeprensainexmoda.com'))]

In [16]:
# All consecutive pairs = pairs found inside partitions + boundary pairs.
final = alexa_pairs.union(missing_pairs)

In [17]:
# With the boundary pair added the total is one higher than alexa_pairs.count().
final.count()

999999

In [18]:
# Same count after de-duplication, so no pair was produced twice.
final.distinct().count()

999999