## How to create the spark context

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession


In [2]:
Spark = SparkSession.builder.master("local[*]").appName("DemoLab").getOrCreate()


In [3]:
Spark

In [4]:
sc = Spark.sparkContext

In [5]:
sc


In [6]:
print("SparkSession created:", Spark)
print("SparkContext:", sc)


SparkSession created: <pyspark.sql.session.SparkSession object at 0x000002C375CC3E90>
SparkContext: <SparkContext master=local[*] appName=DemoLab>


In [7]:
rdd = sc.parallelize([1, 2, 3, 4, 5])
print(rdd.map(lambda x: x * 2).collect())


[2, 4, 6, 8, 10]


In [8]:
df = Spark.read.csv("Automobile_data.csv")

In [9]:
df.show()

+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+
|      _c0|              _c1|        _c2|      _c3|       _c4|         _c5|        _c6|         _c7|            _c8|       _c9|  _c10| _c11|  _c12|       _c13|       _c14|            _c15|       _c16|       _c17|_c18|  _c19|             _c20|      _c21|    _c22|    _c23|       _c24| _c25|
+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+
|symboling|normalized-losses|       make|fuel-type|aspiration|num-of-doors| body-style|drive-wheels|engine-location|wheel-base|len

In [10]:
#Spark.stop()

## DataSet require compile time type saftey, but Python is dynamically typed
# So Pyspark supports
1. RDD
2. Dataframes

## Operations on RDD (transformation & Actions))

In [13]:
sc

In [14]:
data = [1,2,3,4,5]

In [15]:
rdd = sc.parallelize(data)

In [16]:
rdd1 = rdd.map(lambda x: x*5)

In [18]:
print(type(rdd1))

<class 'pyspark.rdd.PipelinedRDD'>


In [19]:
rdd1.collect()

[5, 10, 15, 20, 25]

In [22]:
rdd2 = rdd.filter(lambda x: x%2==0)

In [23]:
rdd2.collect()

[2, 4]

In [40]:
data1= [("a",1),("b",2),("a",3),("b",4)]

In [41]:
rdd3 = sc.parallelize(data1)

In [42]:
result2 = rdd3.groupByKey()

In [43]:
result3 = result2.mapValues(list).collect()
print(result3)

[('a', [1, 3]), ('b', [2, 4])]


In [31]:
data = [("apple", 1), ("banana", 2), ("apple", 3), ("orange", 4), ("banana", 5)]
rdd = sc.parallelize(data)

In [32]:
grouped_rdd = rdd.groupByKey()

In [33]:
result = grouped_rdd.mapValues(list).collect()
print(result)

[('apple', [1, 3]), ('banana', [2, 5]), ('orange', [4])]
