In [1]:
import os
import sys
# Point this kernel at the local Spark installation and put Spark's bundled
# Python archives on the import path.
# NOTE(review): the py4j archive name is version-specific -- confirm it matches
# the file actually present under $SPARK_HOME/python/lib.
spark_home = '/usr/lib/spark'
py_lib = '{}/python/lib'.format(spark_home)
os.environ.update({'SPARK_HOME': spark_home, 'PYLIB': py_lib})

# py4j takes slot 0 of sys.path and the pyspark archive takes slot 1.
for slot, archive in enumerate(('py4j-0.10.7-src.zip', 'pyspark.zip')):
    sys.path.insert(slot, '{}/{}'.format(py_lib, archive))

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [3]:
# Build (or reuse) a Hive-enabled SparkSession for this notebook.
# The warehouse location has to be set through 'spark.sql.warehouse.dir';
# 'spark.warehouse.dir' is not a key Spark reads, so that setting was silently ignored.
spark = (
    SparkSession.builder
    .appName('TestHive')
    .config('spark.sql.warehouse.dir', '/apps/hive/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Sanity check: with the session created, `spark` should be usable straight away.
# Display the version of Spark it is running against.
spark.version

'2.3.0'

In [5]:
# Keep a handle on the SparkContext behind the session; it is the entry point for creating RDDs.
sc = spark.sparkContext

In [19]:
# Tour of basic RDD actions on a small RDD holding the integers 1..4.
ardd = sc.parallelize(range(1, 5))
# Pair every element with itself so the key/value accessors can be shown.
pair_rdd = ardd.map(lambda x: (x, x))
print('reduce, ', ardd.reduce(lambda x, y: x + y))
print('collect, ', ardd.collect())
print('keys, ', pair_rdd.keys().collect())
print('values, ', pair_rdd.values().collect())
# aggregate carries a (count, sum) accumulator: the first function folds one value
# into an accumulator, the second merges two accumulators.
print('aggregate, ', ardd.aggregate((0, 0), lambda acc, vlu: (acc[0] + 1, acc[1] + vlu),
                                  lambda x, y: (x[0] + y[0], x[1] + y[1])))
print('first, ' , ardd.first())
print('take, ', ardd.take(2))
# top(n) already returns the n largest elements in descending order, so sorting
# the RDD beforehand only added an extra job without changing the result.
print('top, ', ardd.top(2))
# PySpark signature: RDD.sample(withReplacement, fraction, seed=None)
# With replacement, `fraction` is the expected number of times each element is drawn.
print('sample, ', ardd.sample(True, 2, 5).collect())

reduce,  10
collect,  [1, 2, 3, 4]
keys,  [1, 2, 3, 4]
values,  [1, 2, 3, 4]
aggregate,  (4, 10)
first,  1
take,  [1, 2]
top,  [4, 3]
sample,  [1, 1, 2, 2, 3, 3, 3, 4, 4, 4]


In [21]:
# Bring the key/value pairs back to the driver as a plain Python dict.
pairs_as_dict = pair_rdd.collectAsMap()
print('collect as map, ', pairs_as_dict)

collect as map,  {1: 1, 2: 2, 3: 3, 4: 4}


In [25]:
# Expand each x into x..4 and pair every value with itself, so key k occurs k times.
repeated_pairs = ardd.flatMap(lambda x: range(x, 5)).map(lambda x: (x, x))
# countByKey tallies by key only; countByValue tallies whole (key, value) tuples.
print('count by key ', repeated_pairs.countByKey())
print('count by value ', repeated_pairs.countByValue())

count by key  defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 3, 4: 4})
count by value  defaultdict(<class 'int'>, {(1, 1): 1, (2, 2): 2, (3, 3): 3, (4, 4): 4})


In [29]:
# The values 1..4 as floats, distributed as an RDD.
arddDbl = sc.parallelize(list(map(float, range(1, 5))))


In [40]:
# Individual summary actions on the integer RDD.
smallest, largest, total = ardd.min(), ardd.max(), ardd.sum()
print("Min: " , smallest , ", Max: " , largest , " Sum: " , total)

# The same kind of summaries on the float RDD, one action per statistic.
dbl_mean, dbl_stdev, dbl_variance = arddDbl.mean(), arddDbl.stdev(), arddDbl.variance()
print("Mean: " , dbl_mean , ", StDev: " , dbl_stdev , ", Variance: " , dbl_variance)

# stats() reports count, mean, stdev, max and min in a single result.
print("All together in stats: " , arddDbl.stats())

Min:  1 , Max:  4  Sum:  10
Mean:  2.5 , StDev:  1.118033988749895 , Variance:  1.25
All together in stats:  (count: 4, mean: 2.5, stdev: 1.118033988749895, max: 4.0, min: 1.0)


In [44]:
# Histogram of 100 random integers drawn from [0, 100).
# A fixed seed keeps the sample -- and therefore the histogram -- reproducible across re-runs.
rng = np.random.RandomState(42)
randlist = rng.randint(0, 100, 100)
randRDD = sc.parallelize(randlist)
# histogram(10) returns a pair: (bucket boundaries, per-bucket counts) for 10 evenly spaced buckets.
histogram = randRDD.histogram(10)
# zip pairs each bucket's lower boundary with its count; the final upper boundary
# has no count to pair with and is dropped.
print('buckets and frequency of the histogram generated: ', list(zip(histogram[0], histogram[1])))

buckets and frequency of the histogram generated:  [(0.0, 14), (9.9, 8), (19.8, 14), (29.700000000000003, 7), (39.6, 8), (49.5, 5), (59.400000000000006, 14), (69.3, 10), (79.2, 9), (89.10000000000001, 11)]


In [49]:
# Tag each of the 100,000,000 integers with its parity ('even' / 'odd').
approxRDD = sc.parallelize(range(100000000)).map(
    lambda n: ('odd', n) if n % 2 else ('even', n))
# countApprox(timeout, confidence): timeout is given in milliseconds; the count
# reported may be an estimate if the job has not finished by then.
approx_count = approxRDD.countApprox(timeout=100, confidence=0.999)
print("countApprox: " , approx_count)


countApprox:  100000000


In [51]:
# Target HDFS directory for the text dump of the pair RDD.
# NOTE(review): saveAsTextFile is expected to fail if this directory already exists --
# remove it before re-running the cell.
text_dir = "hdfs://localhost:8020/user/cloudera/prdd_text_py"
print("\nSaving pairRDD to text file - directory will be created with one file for each partition")
pair_rdd.saveAsTextFile(text_dir)


Saving pairRDD to text file - directory will be created with one file for each partition


In [52]:
print("Loading from the saved text file")
# Every element read back is one line of text, i.e. the string form of a saved tuple.
saved_lines = sc.textFile("hdfs://localhost:8020/user/cloudera/prdd_text_py")
saved_lines.collect()

Loading from the saved text file


['(1, 1)', '(2, 2)', '(3, 3)', '(4, 4)']

In [55]:
# Read the same text output through the new Hadoop API (mapreduce package),
# naming the input format and the key/value Writable classes explicitly.
hdp_rdd = sc.newAPIHadoopFile(
    "hdfs://localhost:8020/user/cloudera/prdd_text_py",
    inputFormatClass='org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    keyClass='org.apache.hadoop.io.LongWritable',
    valueClass='org.apache.hadoop.io.Text',
)

In [59]:
# Show the original pairs next to the records read back through the Hadoop input format.
print(pair_rdd.collect())
# Each Hadoop record is projected to an explicit (key, value) tuple.
# With TextInputFormat the key is presumably the line's byte offset within its part file,
# and the value is the line of text itself.
hdp_records = hdp_rdd.map(lambda record: (record[0], record[1]))
hdp_records.collect()

[(1, 1), (2, 2), (3, 3), (4, 4)]


[(0, '(1, 1)'), (7, '(2, 2)'), (0, '(3, 3)'), (7, '(4, 4)')]

In [65]:
# Round-trip the pair RDD through a Hadoop SequenceFile directory.
sequence_dir = "hdfs://localhost:8020/user/cloudera/prdd_sequence_py"
# NOTE(review): the message below says "object file", but the call writes a SequenceFile.
print("\nSaving pairRDD to object file - directory will be created with one file for each partition")
pair_rdd.saveAsSequenceFile(sequence_dir)
print("Loading from the saved sequence file")
sc.sequenceFile(sequence_dir).collect()


Saving pairRDD to object file - directory will be created with one file for each partition
Loading from the saved sequence file


[(1, 1), (2, 2), (3, 3), (4, 4)]

In [73]:
# Round-trip the pair RDD through Parquet as a two-column DataFrame (columns "k" and "v").
parquet_dir = "hdfs://localhost:8020/user/cloudera/prdd_parquet_py"
pairDF = pair_rdd.toDF(["k", "v"])
print(pairDF.collect())
# The writer's default save mode raises if the target path already exists, so a second
# run of this cell would fail; overwrite mode keeps the cell re-runnable.
pairDF.write.mode("overwrite").parquet(parquet_dir)
print('reading from stored parquet file ',
      spark.read.parquet(parquet_dir).collect())

[Row(k=1, v=1), Row(k=2, v=2), Row(k=3, v=3), Row(k=4, v=4)]
reading from stored parquet file  [Row(k=1, v=1), Row(k=2, v=2), Row(k=3, v=3), Row(k=4, v=4)]
