In [1]:
import os
import sys
os.environ['SPARK_HOME']='D:/spark330hdp3sc3'
os.environ['PYLIB']=os.environ['SPARK_HOME']+'/python/lib'
sys.path.insert(0,os.environ['PYLIB']+'/py4j-0.10.9.5-src.zip')
sys.path.insert(1,os.environ['PYLIB']+'/pyspark.zip')

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [3]:
spark = SparkSession.builder.appName('TestHive') \
.config('spark.warehouse.dir','/apps/hive/warehouse') \
.enableHiveSupport().getOrCreate()

In [4]:
# Having set the driver and driver options we should have spark representing spark session 
# available straight away
spark.version

'3.3.0'

In [5]:
sc = spark.sparkContext

In [19]:
ardd = sc.parallelize(range(1, 5))
pair_rdd = ardd.map(lambda x: (x, x))
# RDD.reduce(f: Callable[[T, T], T]) → T
print('reduce, ', ardd.reduce(lambda x, y: x + y))
# RDD.collect() → List[T][source]
print('collect, ', ardd.collect())
print('keys, ', pair_rdd.keys().collect())
print('values, ', pair_rdd.values().collect())
# RDD.aggregate(zeroValue: U, seqOp: Callable[[U, T], U], combOp: Callable[[U, U], U]) → U
print('aggregate, ', ardd.aggregate((0, 0), lambda acc, vlu: (acc[0] + 1, acc[1] + vlu),
                                  lambda x, y: (x[0] + y[0], x[1] + y[1])))
# RDD.first() → T
print('first, ' , ardd.first())
# RDD.take(num: int) → List[T]                      
print('take, ', ardd.take(2))
# RDD.top(num: int, key: Optional[Callable[[T], S]] = None) → List[T] 
print('top plain , ', sc.parallelize([10, 17, 21, 200, 2, 14, 16, 5]).top(2))
print('top with function, ', sc.parallelize([10, 17, 21, 200, 2, 14, 16, 5]).top(2, lambda x: -x))
# RDD.sample(withReplacement: bool, fraction: float, seed: Optional[int] = None) → pyspark.rdd.RDD[T]
print('sample, ', ardd.sample(True, 2, 5).collect())

reduce,  10
collect,  [1, 2, 3, 4]
keys,  [1, 2, 3, 4]
values,  [1, 2, 3, 4]
aggregate,  (4, 10)
first,  1
take,  [1, 2]
top plain ,  [200, 21]
top with function,  [2, 5]
sample,  [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4]


In [10]:
# RDD.collectAsMap() → Dict[K, V]
print('collect as map, ', pair_rdd.collectAsMap())

collect as map,  {1: 1, 2: 2, 3: 3, 4: 4}


In [25]:
# RDD.countByKey() → Dict[K, int]
print('count by key ', ardd.flatMap(lambda x: range(x, 5)).map(lambda x: (x, x)).countByKey())
# RDD.countByValue() → Dict[K, int]
print('count by value ', ardd.flatMap(lambda x: range(x, 5)).map(lambda x: (x, x)).countByValue())

count by key  defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 3, 4: 4})
count by value  defaultdict(<class 'int'>, {(1, 1): 1, (2, 2): 2, (3, 3): 3, (4, 4): 4})


In [21]:
# RDD.min(key: Optional[Callable[[T], S]] = None) → T
# RDD.max(key: Optional[Callable[[T], S]] = None) → T
# RDD.mean() → float
print("Min: " , ardd.min() , ", Max: " , ardd.max() , " Sum: " , ardd.sum())
print("Mean: " , ardd.mean() , ", StDev: " , ardd.stdev() , ", Variance: " ,
    ardd.variance())
# RDD.stats() → pyspark.statcounter.StatCounter
print("All together in stats: " , ardd.stats())

Min:  1 , Max:  4  Sum:  10
Mean:  2.5 , StDev:  1.118033988749895 , Variance:  1.25
All together in stats:  (count: 4, mean: 2.5, stdev: 1.118033988749895, max: 4.0, min: 1.0)


In [28]:
randlist = np.random.randint(0, 100, 30)
print(randlist)
randRDD = sc.parallelize(randlist)
# RDD.histogram(buckets: Union[int, List[S], Tuple[S, …]]) → Tuple[Sequence[S], List[int]]
histogram = randRDD.histogram(3)
print('buckets and frequency of the histogram generated: ', [x for x in zip(histogram[0], histogram[1])])

[75 13 90 88 85 83 84 27 48 45 52 65 19 48 21 60 51 86 50 83  2  0  8 97
 72 43 41 81 56 57]
buckets and frequency of the histogram generated:  [(0.0, 7), (32.333333333333336, 11), (64.66666666666667, 12)]


In [32]:
new_list = [1, 3, 10, 11, 12, 2, 9,12, 12, 12, 12, 12, 12, 12,12,12, 14]
print(len(new_list))
sc.parallelize(new_list).histogram([2, 5, 10, 20])

17


([2, 5, 10, 20], [2, 1, 13])

In [33]:
# RDD.countApprox(timeout: int, confidence: float = 0.95) → int
approxRDD = sc.parallelize(range(100000000)).map(lambda x: ('even' if x % 2 == 0 else 'odd', x))
print("countApprox: " , approxRDD.countApprox(100, 0.999))

countApprox:  100000000


In [34]:
print("\nSaving pairRDD to text file - directory will be created with one file for each partition")
save_file_location = 'D:/tmp/prdd_text_py'
pair_rdd.saveAsTextFile(save_file_location)


Saving pairRDD to text file - directory will be created with one file for each partition


In [38]:
# RDD.saveAsTextFile(path: str, compressionCodecClass: Optional[str] = None) → None
print("Loading from the saved text file")
sc.textFile(save_file_location).collect()

Loading from the saved text file


['(1, 1)', '(2, 2)', '(3, 3)', '(4, 4)']

In [73]:
pairDF = pair_rdd.toDF(["k", "v"])
pair
print(pairDF.collect())
pairDF.write.parquet("hdfs://localhost:8020/user/cloudera/prdd_parquet_py")
print('reading from stored parquet file ',
      spark.read.parquet("hdfs://localhost:8020/user/cloudera/prdd_parquet_py").collect())

[Row(k=1, v=1), Row(k=2, v=2), Row(k=3, v=3), Row(k=4, v=4)]
reading from stored parquet file  [Row(k=1, v=1), Row(k=2, v=2), Row(k=3, v=3), Row(k=4, v=4)]
