In [1]:
import findspark
findspark.init('/home/seanzhen52/spark-2.2.3-bin-hadoop2.7/')

In [2]:
from pyspark import SparkContext
sc = SparkContext('local')

In [3]:
#列表创建
nums = [1,2,3,4,5,6]
rdd = sc.parallelize(nums)

In [5]:
rdd.count()

6

RDD被创建好后，之后一般会发生两种操作：
- 转换：基于现有的数据集创建一个新的数据集
- 行动：在数据集上进行运算，返回计算值

In [11]:
ls

[0m[01;34manaconda3[0m/                       [01;34mspark-2.2.3-bin-hadoop2.7[0m/
Anaconda3-5.0.0-Linux-x86_64.sh  [01;31mspark-2.2.3-bin-hadoop2.7.tgz[0m
donald.json                      [01;34mspark-warehouse[0m/
first pyspark.ipynb              test.py
hillary.json                     tweets.json
hs_err_pid2511.log               tweets.json.1
hs_err_pid2996.log               [01;34mword2vecM_simple[0m/
hs_err_pid3589.log               [01;31mword2vecM_simple.zip[0m
hs_err_pid3741.log               WordCount.ipynb
hs_err_pid7525.log               word.txt
PysparkPratice-1.ipynb           [01;34mwriteback[0m/


In [7]:
#惰性机制
lines = sc.textFile('word.txt')

In [12]:
lineLengths = lines.map(lambda line:len(line))

In [20]:
#collect():行动，以list的形式返回数据集中的所有元素
lineLengths.collect()

[11, 11, 12, 21, 3, 15]

In [13]:
# “行动”操作才会触发真正的极算
totalLength = lineLengths.reduce(lambda a,b:a+b)

In [14]:
totalLength

73

In [15]:
cat word.txt

how are you
hello world
how dare you
fuck you motherfucker
lol
read me please.


In [16]:
#filter():转换操作，筛选
#count():返回数据集中的元素个数
lines.filter(lambda line:'how' in line).count()

2

In [17]:
#map():转换，将每个元素传递到函数，返回新的数据集
#reduce():(输入两个参数并返回一个值)，聚合数据集中的元素
lines.map(lambda line:len(line.split(' '))).reduce(lambda a,b:(a>b and a or b))

3

### 缓存

In [22]:
list_1 = ['hadoop','Spark','Hive']
rdd = sc.parallelize(list_1)
print(rdd.count())
print(','.join(rdd.collect()))

3
hadoop,Spark,Hive


上面代码执行过程中，前后共触发了两次从头到尾的计算。可以通过持久化（缓存）机制避免这种重复计算的开销。

In [23]:
rdd.cache()#语句执行到这里，并不会缓存rdd，这是rdd还没有被计算生成

ParallelCollectionRDD[14] at parallelize at PythonRDD.scala:540

In [24]:
#第一次行动操作，触发一次真正从头到尾的计算，这时才会执行上面的rdd.cache()，把这个rdd放到缓存中
print(rdd.count())
print(','.join(rdd.collect()))

3
hadoop,Spark,Hive


In [25]:
rdd.collect()

['hadoop', 'Spark', 'Hive']

In [26]:
rdd.unpersist()#移除缓存

ParallelCollectionRDD[14] at parallelize at PythonRDD.scala:540

### 分区
- 本地模式：默认为本地机器的CPU数目，若设置了local[N],则默认为N；

In [29]:
array = [1,2,3,4,5]
rdd = sc.parallelize(array,2)#设置为两个区

In [30]:
rdd.collect()

[1, 2, 3, 4, 5]

In [33]:
rdd.foreach(print)

### 键值(字典)

In [34]:
pairRDD = lines.flatMap(lambda line:line.split(" ")).map(lambda word:(word,1))

In [36]:
pairRDD.collect()

[('how', 1),
 ('are', 1),
 ('you', 1),
 ('hello', 1),
 ('world', 1),
 ('how', 1),
 ('dare', 1),
 ('you', 1),
 ('fuck', 1),
 ('you', 1),
 ('motherfucker', 1),
 ('lol', 1),
 ('read', 1),
 ('me', 1),
 ('please.', 1)]

In [43]:
pairRDD.reduceByKey(lambda a,b:a+b).collect()

[('how', 2),
 ('are', 1),
 ('you', 3),
 ('hello', 1),
 ('world', 1),
 ('dare', 1),
 ('fuck', 1),
 ('motherfucker', 1),
 ('lol', 1),
 ('read', 1),
 ('me', 1),
 ('please.', 1)]

In [44]:
#对具有相同键的值进行分组。比如，对四个键值对(“spark”,1)、(“spark”,2)、(“hadoop”,3)和(“hadoop”,5)，采用groupByKey()后得到的结果是：(“spark”,(1,2))和(“hadoop”,(3,5))
pairRDD.groupByKey().collect()

[('how', <pyspark.resultiterable.ResultIterable at 0x7fd9614a6b38>),
 ('are', <pyspark.resultiterable.ResultIterable at 0x7fd9614a6128>),
 ('you', <pyspark.resultiterable.ResultIterable at 0x7fd9614a66d8>),
 ('hello', <pyspark.resultiterable.ResultIterable at 0x7fd9614a6e80>),
 ('world', <pyspark.resultiterable.ResultIterable at 0x7fd9614a6e10>),
 ('dare', <pyspark.resultiterable.ResultIterable at 0x7fd9614a65f8>),
 ('fuck', <pyspark.resultiterable.ResultIterable at 0x7fd9614a6208>),
 ('motherfucker', <pyspark.resultiterable.ResultIterable at 0x7fd9614a6c18>),
 ('lol', <pyspark.resultiterable.ResultIterable at 0x7fd961471630>),
 ('read', <pyspark.resultiterable.ResultIterable at 0x7fd9614715f8>),
 ('me', <pyspark.resultiterable.ResultIterable at 0x7fd961471320>),
 ('please.', <pyspark.resultiterable.ResultIterable at 0x7fd9614714a8>)]

In [45]:
pairRDD.keys().collect()

['how',
 'are',
 'you',
 'hello',
 'world',
 'how',
 'dare',
 'you',
 'fuck',
 'you',
 'motherfucker',
 'lol',
 'read',
 'me',
 'please.']

In [47]:
pairRDD.values().collect()

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [48]:
pairRDD.sortByKey().collect()

[('are', 1),
 ('dare', 1),
 ('fuck', 1),
 ('hello', 1),
 ('how', 1),
 ('how', 1),
 ('lol', 1),
 ('me', 1),
 ('motherfucker', 1),
 ('please.', 1),
 ('read', 1),
 ('world', 1),
 ('you', 1),
 ('you', 1),
 ('you', 1)]

In [50]:
pairRDD.mapValues(lambda x:x+1).collect()

[('how', 2),
 ('are', 2),
 ('you', 2),
 ('hello', 2),
 ('world', 2),
 ('how', 2),
 ('dare', 2),
 ('you', 2),
 ('fuck', 2),
 ('you', 2),
 ('motherfucker', 2),
 ('lol', 2),
 ('read', 2),
 ('me', 2),
 ('please.', 2)]

In [51]:
p1 = sc.parallelize([('spark',1),('spark',2),('hadoop',3),('hadoop',5)])
p2 = sc.parallelize([('spark','fast')])
p1.join(p2).collect()

[('spark', (1, 'fast')), ('spark', (2, 'fast'))]

In [52]:
rdd = sc.parallelize([("spark",2),("hadoop",6),("hadoop",4),("spark",6)])

In [53]:
rdd.mapValues(lambda x : (x,1)).reduceByKey(lambda x,y : (x[0]+y[0],x[1] + y[1])).mapValues(lambda x : (x[0] / x[1])).collect()

[('spark', 4.0), ('hadoop', 5.0)]

### 广播变量
允许我们在每个机器上缓存一个只读的变量，通过这种方式，就可以高效地给每个节点提供一个大的输入数据集的副本

In [54]:
broadcastVar = sc.broadcast([1,2,3,3,4])

In [55]:
broadcastVar.value

[1, 2, 3, 3, 4]

### 累加器

In [56]:
accum = sc.accumulator(0)

In [57]:
sc.parallelize([1,2,3,4]).foreach(lambda x:accum.add(x))

In [58]:
accum.value

10

In [59]:
ls

[0m[01;34manaconda3[0m/                       [01;34mspark-2.2.3-bin-hadoop2.7[0m/
Anaconda3-5.0.0-Linux-x86_64.sh  [01;31mspark-2.2.3-bin-hadoop2.7.tgz[0m
donald.json                      [01;34mspark-warehouse[0m/
first pyspark.ipynb              test.py
hillary.json                     tweets.json
hs_err_pid2511.log               tweets.json.1
hs_err_pid2996.log               [01;34mword2vecM_simple[0m/
hs_err_pid3589.log               [01;31mword2vecM_simple.zip[0m
hs_err_pid3741.log               WordCount.ipynb
hs_err_pid7525.log               word.txt
PysparkPratice-1.ipynb           [01;34mwriteback[0m/


In [60]:
jsonstr = sc.textFile('donald.json')

In [65]:
jsonstr.take(10)

['{',
 '  "count": 99, ',
 '  "results": [',
 '    {',
 '      "id": 1, ',
 '      "polarity": 1, ',
 '      "text": "Source: Ivanka Trump making calls on child care legislation https://t.co/e7Dk6AhW7f", ',
 '      "user_location": "Washington", ',
 '      "subjectivity": 0.0',
 '    }, ']

In [66]:
import json

In [69]:
res = jsonstr.map(lambda s:json.loads(s))