In [1]:
import os

import findspark

# Prefer an externally configured SPARK_HOME so the notebook is not tied to one
# machine; fall back to the original install path so existing setups still work.
findspark.init(os.environ.get('SPARK_HOME', '/home/seanzhen52/spark-2.2.3-bin-hadoop2.7/'))

In [2]:
from pyspark import SparkConf, SparkContext

# Reuse the already running context when this cell is executed again.
# Calling SparkContext('local') a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

In [3]:
cat people.json

{"name":"Michael"}
{"name":"Andy","age":30}
{"name":"Justin","age":19}


SparkSession 支持从不同的数据源加载数据并将其转换成 DataFrame，还支持把 DataFrame 注册为临时表（视图），然后使用 SQL 语句来操作数据。SparkSession 同时提供了对 HiveQL 以及其他依赖于 Hive 的功能的支持。

In [9]:
from pyspark.sql import SparkSession

In [4]:
# Obtain a SparkSession through the builder (an existing one is reused).
spark = SparkSession.builder.getOrCreate()

In [5]:
# Load people.json (one JSON object per line) into a DataFrame.
df = spark.read.json('people.json')

In [6]:
# Print the schema information
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [8]:
# Select multiple columns; age + 1 is null for rows that have no age.
# Variant that additionally keeps only the rows with age > 20:
# df.select(df.name,df.age+1).filter(df.age>20).show()
df.select(df.name,df.age+1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [9]:
# Keep the projection (name, age + 1) as a DataFrame for later inspection.
test = df.select(df['name'], df['age'] + 1)

In [10]:
# Return the selected rows as a Python list of Row objects.
test.collect()

[Row(name='Michael', (age + 1)=None),
 Row(name='Andy', (age + 1)=31),
 Row(name='Justin', (age + 1)=20)]

In [10]:
# Conditional filtering: keep only the rows whose age is greater than 20.
df.where(df['age'] > 20).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [11]:
# Group by age and count the rows in each group (a missing age forms its own group).
df.groupBy('age').count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [12]:
# Sorting: order the rows by age, largest first.
df.orderBy(df['age'].desc()).show()

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+



In [14]:
# Multi-column sorting: age descending, then name ascending.
df.orderBy(df['age'].desc(), df['name'].asc()).show()

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+



In [16]:
# Rename a column: expose `name` under the alias `username`.
df.select(df['name'].alias('username')).show()

+--------+
|username|
+--------+
| Michael|
|    Andy|
|  Justin|
+--------+



### RDD to DATAFRAME
- 利用反射机制推断包含特定类型对象的RDD的schema，适用于数据结构已知的RDD，转换时会用到toDF()
- 使用编程接口，构造一个schema并将其应用在已知的RDD上

In [3]:
from pyspark.sql.types import Row

In [17]:
cat people.txt

Michael,29
Andy,30
Justin,19


In [4]:
def f(x):
    """Map a parsed record to a dict with the keys 'name' and 'age'.

    x is an indexable record; x[0] becomes the name and x[1] the age.
    Both values are passed through unchanged (no type conversion).
    """
    return {'name': x[0], 'age': x[1]}

toDF方法是在SparkSession的构造函数(Spark 1.x中为SQLContext的构造函数)中通过猴子补丁(monkey patch)添加到RDD上的，因此必须先创建一个SparkSession(或SQLContext)，才能在RDD上调用toDF()。

In [11]:
# A SparkSession must exist before toDF() can be called on an RDD.
# Go through the builder instead of calling the constructor directly: it reuses
# the active session (and its registered temp views) rather than creating an
# additional session on top of the same SparkContext.
spark = SparkSession.builder.getOrCreate()

In [12]:
# Split each line of people.txt on commas, wrap the parsed fields in a Row
# (field names come from f) and convert the RDD to a DataFrame.
peopleDF = (
    sc.textFile('people.txt')
    .map(lambda raw: raw.split(','))
    .map(lambda parts: Row(**f(parts)))
    .toDF()
)

In [13]:
peopleDF.createOrReplaceTempView('people')  # must be registered as a temporary view before it can be queried with SQL

In [14]:
# Query the temporary view 'people' with SQL; the result is a DataFrame.
personDF = spark.sql("select * from people")

In [16]:
# Display the query result as a table.
personDF.show()

+---+-------+
|age|   name|
+---+-------+
| 29|Michael|
| 30|   Andy|
| 19| Justin|
+---+-------+



In [20]:
# Format every row as "age:<age>,name:<name>".
# Fields are read by name instead of by position: the column order of a
# DataFrame built from Row(**kwargs) depends on how the Spark version orders
# keyword fields, so t[0]/t[1] could silently swap age and name.
personDF.rdd.map(lambda row: 'age:' + row['age'] + ',' + 'name:' + row['name']).collect()

['age:29,name:Michael', 'age:30,name:Andy', 'age:19,name:Justin']

- 使用编程方式定义RDD模式

In [28]:
from pyspark.sql.types import Row
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType

In [22]:
peopleRDD = sc.textFile('people.txt')  # create an RDD with one element per line of the file

In [23]:
# Define the schema as a space-separated string of field names.
# NOTE(review): the variable name looks like a typo for "schemaString"; it is
# left unchanged because the cell that builds `fields` reads it by this name.
schmaString = 'name age'

In [25]:
# Build the schema fields from the schema string: one nullable string field
# per space-separated name.
fields = [
    StructField(column_name, StringType(), nullable=True)
    for column_name in schmaString.split(" ")
]

In [29]:
# Wrap the list of fields in a StructType, which is the schema object.
schema = StructType(fields)

In [30]:
schema  # shows the schema description; it contains the two fields name and age

StructType(List(StructField(name,StringType,true),StructField(age,StringType,true)))

In [31]:
# Split each line on commas and keep the first two values as a positional Row.
rowRDD = (
    peopleRDD
    .map(lambda line: line.split(','))
    .map(lambda parts: Row(parts[0], parts[1]))
)

In [32]:
# Reuse the active SparkSession instead of constructing another one directly:
# temporary views are scoped to a session, so a freshly constructed session
# would not see views registered through the previous `spark`.
spark = SparkSession.builder.getOrCreate()

In [34]:
# Apply the explicitly constructed schema to the RDD of rows.
peopleDF = spark.createDataFrame(rowRDD,schema)

In [35]:
# Display the DataFrame; columns follow the schema order (name, age).
peopleDF.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+



In [39]:
# Register the DataFrame as the temporary view 'lalala' so it can be queried with SQL.
peopleDF.createOrReplaceTempView('lalala')

In [40]:
# Select every row of the temporary view 'lalala'.
results = spark.sql('select * from lalala')

In [41]:
# Display the query result as a table.
results.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+



In [47]:
# Convert the result to an RDD and return its rows as a list of Row objects.
# Both fields are strings because the schema declares them as StringType.
results.rdd.collect()

[Row(name='Michael', age='29'),
 Row(name='Andy', age='30'),
 Row(name='Justin', age='19')]

In [48]:
# Format every row as "name: <name>,age:<age>", reading the fields by name.
results.rdd.map(lambda row: "name: " + row['name'] + "," + "age:" + row['age']).collect()

['name: Michael,age:29', 'name: Andy,age:30', 'name: Justin,age:19']

### DataFrame保存成文件

- 第一种

In [49]:
# Load people.json into a DataFrame using the JSON reader shortcut.
peopleDF = spark.read.json("people.json")

In [50]:
# Write the name and age columns as CSV into the directory 'newpeople.csv'.
# The default save mode raises an error when the target already exists, which
# makes this cell fail on every re-run; overwrite the previous output instead.
peopleDF.select('name', 'age').write.format('csv').mode('overwrite').save('newpeople.csv')

注意：只有使用write.format("text")保存成文本文件时，才要求select()中只能有一个列；如果存在两个列，比如select("name", "age")，就不能保存成文本文件。上面使用的是csv格式，因此可以同时保存多个列。

In [51]:
ls

[0m[01;34manaconda3[0m/                       PysparkPratice-1.ipynb
Anaconda3-5.0.0-Linux-x86_64.sh  PysparkPratice-2.ipynb
donald.json                      [01;34mspark-2.2.3-bin-hadoop2.7[0m/
first pyspark.ipynb              [01;31mspark-2.2.3-bin-hadoop2.7.tgz[0m
hillary.json                     [01;34mspark-warehouse[0m/
hs_err_pid2511.log               test.py
hs_err_pid2996.log               tweets.json
hs_err_pid3589.log               tweets.json.1
hs_err_pid3741.log               [01;34mword2vecM_simple[0m/
hs_err_pid7525.log               [01;31mword2vecM_simple.zip[0m
[01;34mnewpeople.csv[0m/                   WordCount.ipynb
people.json                      word.txt
people.txt                       [01;34mwriteback[0m/


In [52]:
cd newpeople.csv/

/home/seanzhen52/newpeople.csv


In [53]:
ls

part-00000-9cdad812-9036-48b9-ab65-513e78071eb3-c000.csv  _SUCCESS


In [54]:
cat part-00000-9cdad812-9036-48b9-ab65-513e78071eb3-c000.csv

Michael,
Andy,30
Justin,19


In [55]:
cd ..

/home/seanzhen52


In [56]:
# Reload the saved output: pointing textFile at the output directory reads its part files.
textFile = sc.textFile('newpeople.csv')

In [57]:
# Return the reloaded lines as a Python list of strings.
textFile.collect()

['Michael,', 'Andy,30', 'Justin,19']

- 第二种

In [58]:
 peopleDF = spark.read.format("json").load("people.json")  # reload people.json; NOTE(review): the stray leading space on this line presumably only works because IPython dedents cell input - confirm and remove

In [59]:
import shutil

# Convert the DataFrame to an RDD, then save it as text with saveAsTextFile().
# saveAsTextFile() fails when the output directory already exists, so remove
# the output of a previous run first to keep this cell re-runnable.
# NOTE(review): assumes 'newpeople.txt' lives on the local filesystem, as the
# directory listing in this notebook suggests - confirm before using on HDFS.
shutil.rmtree('newpeople.txt', ignore_errors=True)
peopleDF.rdd.saveAsTextFile('newpeople.txt')

In [62]:
cd newpeople.txt/

/home/seanzhen52/newpeople.txt


In [63]:
cat part-00000

Row(age=None, name='Michael')
Row(age=30, name='Andy')
Row(age=19, name='Justin')


In [64]:
cd ..

/home/seanzhen52
