In [2]:
sc

In [48]:
# -*- coding: utf-8 -*-

import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
import numpy

class Utils:
    """Base class holding statistical helpers shared by the workers."""

    def __init__(self):
        """Nothing to initialise; present so subclasses can chain to it."""

    # Descriptive statistics: mean and standard deviation
    def getStatValue(self, df, fieldName):
        """Return the row holding the mean and standard deviation of a column.

        Selects ``avg(fieldName)`` and ``stddev(fieldName)`` from ``df``,
        collects the result and returns its first row, so index 0 is the
        mean and index 1 is the standard deviation.
        """
        aggregated = df.select(avg(fieldName), stddev(fieldName))
        rows = aggregated.collect()
        return rows[0]

class LoadSavedData(Utils):
    """Worker that loads a saved Parquet dataset and prints its statistics.

    Inherits ``getStatValue`` from ``Utils``.
    """

    def __init__(self):
        # Run the same initialisation as the base class.
        Utils.__init__(self)

    # Load the dataset file
    def loadData(self, dataFile):
        """Load the Parquet dataset at ``dataFile`` and return it as a DataFrame.

        Reads through the module-level ``sqlContext``, which must be assigned
        before this method is called.

        The path is handed to the DataFrame reader instead of being
        interpolated into a ``SELECT * FROM parquet.`<path>` `` statement, so
        a path containing a backtick cannot break the quoted identifier.
        """
        return sqlContext.read.parquet(dataFile)

    # Print descriptive statistics
    def printStats(self, df, fields=None):
        """Show ``describe()`` output for ``df``.

        With ``fields`` left as ``None`` a single summary of the whole
        DataFrame is shown; otherwise one summary is shown per field name in
        ``fields``.
        """
        if fields is None:
            df.describe().show()
            return

        for field in fields:
            df.describe(field).show()

# Lookup function: convert a column value to its corresponding numeric id
def changeValue(shape):
    """Translate a species name into its numeric id.

    ``'Iris-versicolor'`` maps to 1 and ``'Iris-virginica'`` maps to 2;
    every other value maps to 0.
    """
    if shape == 'Iris-virginica':
        return 2
    if shape == 'Iris-versicolor':
        return 1
    # Anything else falls back to the default id.
    return 0

def mn1(m1, n1):
    """Return the product of ``m1`` and ``n1``."""
    product = m1 * n1
    return product
def mn2(m2, n2):
    """Return ``m2`` divided by ``n2``."""
    quotient = m2 / n2
    return quotient

# Main program
def main(dataDir):
    """Derive extra columns for the iris dataset under ``dataDir`` and save it.

    Loads ``<dataDir>/iris.parquet``, adds the derived columns ``label``,
    ``mn1`` and ``mn2``, prints the distinct species and labels, the top rows
    by ``mn1`` and descriptive statistics, then writes the result to
    ``<dataDir>/iris2.parquet``, overwriting any existing output.
    """
    # Column names of the dataset
    fields = ['m1', 'm2', 'n1', 'n2', 'shape']

    # Instantiate the worker
    worker = LoadSavedData()

    # Load the dataset
    df = worker.loadData(dataFile='%s/iris.parquet' % dataDir)

    # Print the distinct values of the 'shape' column
    # (distinct() drops the duplicates).
    shapes = df.select(fields[4]).distinct().collect()
    for shape in shapes:
        print(shape[0])

    # Wrap the plain Python functions as DataFrame UDFs. The first argument
    # is the function to apply, the second declares the type of the value it
    # returns.
    myUdf = udf(changeValue, IntegerType())
    mymn1 = udf(mn1, FloatType())
    mymn2 = udf(mn2, FloatType())

    # withColumn(name, value) adds a derived column; rebinding df replaces
    # the original dataset with the extended one.
    df = df.withColumn('label', myUdf('shape')) \
           .withColumn('mn1', mymn1("m1", "n1")) \
           .withColumn("mn2", mymn2("m2", "n2"))

    # Rows can also be filtered with a SQL string or with column expressions:
    #   df.where("mn1>3 and mn2 >20")
    #   df.where(col("mn1")>3).where(col("mn2")>20)

    # Mean of mn1, aggregated by Spark so that a single value reaches the
    # driver instead of the whole column.
    mn1_mean = df.select(avg('mn1')).collect()[0][0]

    # Top three rows by mn1 among those above the mean. avg() yields None
    # when there is nothing to average; no row can exceed a missing mean.
    if mn1_mean is None:
        data1 = []
    else:
        data1 = df.where(col("mn1") > mn1_mean) \
                  .sort(desc("mn1")) \
                  .limit(3) \
                  .collect()
    print(data1)

    df.show()

    # Print the distinct values of the derived 'label' column; distinct()
    # works on derived columns just like on the original ones.
    idShapes = df.select('label').distinct().collect()
    for idShape in idShapes:
        print(idShape[0])

    # NOTE(review): fields[1:4] is m2, n1, n2 - m1 is skipped. Confirm that
    # leaving m1 out of the statistics is intended.
    statFields = fields[1:4]

    # Print descriptive statistics: mean and standard deviation
    for field in statFields:
        stat = worker.getStatValue(df, field)
        print('%8s\t%.3f\t%.3f' % (field, stat[0], stat[1]))

    # Print the full describe() summary for the same fields
    worker.printStats(df, statFields)

    # Save the dataset under the given directory
    df.write.mode('overwrite').save('%s/iris2.parquet' % dataDir,
                                    format='parquet')


# Program entry point
if __name__ == '__main__':
    # Run on local resources
    appName = 'Cup-01'
    master = 'local'

    # Reuse the SparkContext that the notebook or shell already created, or
    # create one when this file is run as a plain script where no `sc` has
    # been predefined.
    sc = SparkContext.getOrCreate(
        conf=SparkConf().setAppName(appName).setMaster(master))

    # SQL interface; LoadSavedData.loadData reads this module-level name
    sqlContext = SQLContext(sc)

    # Resolve the data directories and call the main program.
    # expanduser('~') still resolves a home directory when the HOME
    # environment variable is not set.
    homeDir = os.path.expanduser('~')
    dirName = 'Data'
    sampleDir = '%s/Sample' % homeDir
    dataDir = '%s/Data' % homeDir

    main(dataDir)


Iris-virginica
Iris-setosa
Iris-versicolor
Row(m1=7.7, m2=2.6, n1=6.9, n2=2.3, shape='Iris-virginica', label=2, mn1=53.130001068115234, mn2=1.1304347515106201)
+---+---+---+---+-----------+-----+----+---------+
| m1| m2| n1| n2|      shape|label| mn1|      mn2|
+---+---+---+---+-----------+-----+----+---------+
|5.1|3.5|1.4|0.2|Iris-setosa|    0|7.14|     17.5|
|4.9|3.0|1.4|0.2|Iris-setosa|    0|6.86|     15.0|
|4.7|3.2|1.3|0.2|Iris-setosa|    0|6.11|     16.0|
|4.6|3.1|1.5|0.2|Iris-setosa|    0| 6.9|     15.5|
|5.0|3.6|1.4|0.2|Iris-setosa|    0| 7.0|     18.0|
|5.4|3.9|1.7|0.4|Iris-setosa|    0|9.18|     9.75|
|4.6|3.4|1.4|0.3|Iris-setosa|    0|6.44|11.333333|
|5.0|3.4|1.5|0.2|Iris-setosa|    0| 7.5|     17.0|
|4.4|2.9|1.4|0.2|Iris-setosa|    0|6.16|     14.5|
|4.9|3.1|1.5|0.1|Iris-setosa|    0|7.35|     31.0|
|5.4|3.7|1.5|0.2|Iris-setosa|    0| 8.1|     18.5|
|4.8|3.4|1.6|0.2|Iris-setosa|    0|7.68|     17.0|
|4.8|3.0|1.4|0.1|Iris-setosa|    0|6.72|     30.0|
|4.3|3.0|1.1|0.1|Iris-se