In [28]:
# Recipe 7-1. Optimize the Page-Rank Algorithm by Using PySpark Code
pageLinks = [['a' ,['b','c','d']],
['b', ['d','c']],
['c', ['b']],
['d', ['a','c']]]
pageRanks = [['a',1],
['b',1],
['c',1],
['d',1]]
numIter = 20

In [29]:
pageRanksRDD = sc.parallelize(pageRanks, 2).partitionBy(2,hash).persist()
pageLinksRDD = sc.parallelize(pageLinks, 2).partitionBy(2,hash).persist()

In [30]:
s = 0.85

In [31]:
def rankContribution(uris, rank):
    numberOfUris = len(uris)
    rankContribution = float(rank) / numberOfUris
    newRank = []
    for uri in uris:
        newRank.append((uri, rankContribution))
    return newRank

In [32]:
linksRank = pageLinksRDD.join(pageRanksRDD)
linksRank.collect()

[('b', (['d', 'c'], 1)),
 ('c', (['b'], 1)),
 ('d', (['a', 'c'], 1)),
 ('a', (['b', 'c', 'd'], 1))]

In [33]:
contributedRDD = linksRank.flatMap(lambda x: rankContribution(x[1][0], x[1][1]))
contributedRDD.collect()

[('d', 0.5),
 ('c', 0.5),
 ('b', 1.0),
 ('a', 0.5),
 ('c', 0.5),
 ('b', 0.3333333333333333),
 ('c', 0.3333333333333333),
 ('d', 0.3333333333333333)]

In [34]:
sumRanks = contributedRDD.reduceByKey(lambda v1, v2: v1+v2)
sumRanks.collect()

[('d', 0.8333333333333333),
 ('c', 1.3333333333333333),
 ('b', 1.3333333333333333),
 ('a', 0.5)]

In [35]:
pageRanksRDD = sumRanks.map(lambda x: (x[0], 1-s+s*x[1]))
pageRanksRDD.collect()

[('d', 0.8583333333333333),
 ('c', 1.2833333333333332),
 ('b', 1.2833333333333332),
 ('a', 0.575)]

In [36]:
for i in range(numIter):
    linksRank = pageLinksRDD.join(pageRanksRDD)
    contributedRDD = linksRank.flatMap(lambda x: rankContribution(x[1][0], x[1][1]))
    sumRanks = contributedRDD.reduceByKey(lambda v1, v2: v1+v2)
    pageRanksRDD = sumRanks.map(lambda x: (x[0], 1-s+s*x[1]))

In [37]:
pageRanksRDD.collect()

[('a', 0.521726802480915),
 ('b', 1.3572439811068668),
 ('c', 1.246378009446567),
 ('d', 0.874651206965652)]

In [38]:
# Recipe 7-2. Implement the k-Nearest Neighbors Algorithm by Using PySpark

In [42]:
def distanceBetweenTuples(data1, data2):
    squaredSum = 0.0
    for i in range(len(data1)):
        squaredSum = squaredSum + (data1[i] - data2[i])**2
    return squaredSum**0.5

In [43]:
t1 = (1.2, 3.4, 3.2)
t2 = (2.4, 2.2, 4.2)

In [44]:
distanceBetweenTuples(t1, t2)

1.9697715603592207

In [45]:
knnDataList = [((3.09,1.97,3.73),'group1'),
((2.96,2.15,4.16),'group1'),
((2.87,1.93,4.39),'group1'),
((3.02,1.55,4.43),'group1'),
((1.80,3.65,2.08),'group2'),
((1.36,4.43,1.95),'group2'),
((1.71,4.35,1.94),'group2'),
((1.03,3.75,2.12),'group2'),
((2.30,3.59,1.99),'group2')]

In [46]:
knnDataRDD = sc.parallelize(knnDataList, 4)

In [47]:
newRecord = [(2.5, 1.7, 4.2)]

In [48]:
newRecordRDD = sc.parallelize(newRecord, 1)

In [49]:
cartesianDataRDD = knnDataRDD.cartesian(newRecordRDD)

In [51]:
cartesianDataRDD.collect()

[(((3.09, 1.97, 3.73), 'group1'), (2.5, 1.7, 4.2)),
 (((2.96, 2.15, 4.16), 'group1'), (2.5, 1.7, 4.2)),
 (((2.87, 1.93, 4.39), 'group1'), (2.5, 1.7, 4.2)),
 (((3.02, 1.55, 4.43), 'group1'), (2.5, 1.7, 4.2)),
 (((1.8, 3.65, 2.08), 'group2'), (2.5, 1.7, 4.2)),
 (((1.36, 4.43, 1.95), 'group2'), (2.5, 1.7, 4.2)),
 (((1.71, 4.35, 1.94), 'group2'), (2.5, 1.7, 4.2)),
 (((1.03, 3.75, 2.12), 'group2'), (2.5, 1.7, 4.2)),
 (((2.3, 3.59, 1.99), 'group2'), (2.5, 1.7, 4.2))]

In [52]:
k = 5

In [57]:
groupAndDistanceRDD = cartesianDataRDD.map(lambda x: (x[0][1], distanceBetweenTuples(x[0][0], x[1])))

In [58]:
groupAndDistanceRDD.collect()

[('group1', 0.8011866199581719),
 ('group1', 0.6447480127925947),
 ('group1', 0.47528938553264566),
 ('group1', 0.5880476171195661),
 ('group2', 2.9642705679475347),
 ('group2', 3.71685350800916),
 ('group2', 3.5713022834814754),
 ('group2', 3.269525959523796),
 ('group2', 2.9148241799463652)]

In [59]:
ourClasses = groupAndDistanceRDD.takeOrdered(k, key=lambda x: x[1])

In [60]:
ourClasses

[('group1', 0.47528938553264566),
 ('group1', 0.5880476171195661),
 ('group1', 0.6447480127925947),
 ('group1', 0.8011866199581719),
 ('group2', 2.9148241799463652)]

In [61]:
group = [x[0] for x in ourClasses]

In [62]:
group

['group1', 'group1', 'group1', 'group1', 'group2']

In [63]:
max(group, key=group.count)

'group1'

In [1]:
def stringToNumberSum(data):
    removedSpaceData = data.strip()
    if removedSpaceData == '':
        return None
    splittedData = removedSpaceData.split(' ')
    numData = [float(x) for x in splittedData]
    sumOfData = sum(numData)
    return sumOfData

In [3]:
from pyspark.streaming import StreamingContext
ncSc = StreamingContext(sc, 10)

In [4]:
consoleStreamingData = ncSc.socketTextStream(hostname='localhost', port=55342)

In [5]:
sumedData = consoleStreamingData.map(stringToNumberSum)

In [6]:
sumedData.pprint()

In [7]:
ncSc.start()

In [8]:
ncSc.awaitTerminationOrTimeout(30)

-------------------------------------------
Time: 2018-11-08 22:31:40
-------------------------------------------

-------------------------------------------
Time: 2018-11-08 22:31:50
-------------------------------------------

-------------------------------------------
Time: 2018-11-08 22:32:00
-------------------------------------------



False

-------------------------------------------
Time: 2018-11-08 22:32:10
-------------------------------------------
110.0

-------------------------------------------
Time: 2018-11-08 22:32:20
-------------------------------------------
114.0

-------------------------------------------
Time: 2018-11-08 22:32:30
-------------------------------------------
158.0

-------------------------------------------
Time: 2018-11-08 22:32:40
-------------------------------------------

-------------------------------------------
Time: 2018-11-08 22:32:50
-------------------------------------------

-------------------------------------------
Time: 2018-11-08 22:33:00
-------------------------------------------

-------------------------------------------
Time: 2018-11-08 22:33:10
-------------------------------------------

-------------------------------------------
Time: 2018-11-08 22:33:20
-------------------------------------------

-------------------------------------------
Time: 2018-11-08 2

In [9]:
ncSc.stop()

In [10]:
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext

In [11]:
booStreamContext = StreamingContext(sc, 10)

Py4JJavaError: An error occurred while calling None.org.apache.spark.streaming.api.java.JavaStreamingContext.
: java.lang.NullPointerException
	at org.apache.spark.streaming.api.java.JavaStreamingContext.<init>(JavaStreamingContext.scala:130)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
