Topics Covered:
    1. Creating Spark SQL Session
    2. Creating Spark Context
    3. Using RDD, map and take
    4. Reading text file and applying actions
    5. Examples: Transformations
    6. Connecting to MySQL
    

# 00. Getting Started

In [23]:
# Notebook shell command: install the findspark package quietly (-q); it is imported in the next cell.
!pip install -q findspark

In [24]:
import findspark
# Runs before the pyspark imports further down; presumably this makes the
# local Spark installation importable — TODO confirm against findspark docs.
findspark.init()

# 01. Creating a Spark SQL Session

In [5]:
#from pyspark.sql import SparkSession
#spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
#spark.stop()

# 02. Creating a Spark Context

In [3]:
#import pyspark
from pyspark import SparkConf
from pyspark import SparkContext
# Build the Spark configuration: run locally, under the app name 'spark-basic'.
# SparkConf setters return the conf itself, so the first one can be chained.
conf = SparkConf().setMaster('local')
# Kept as the last expression so the notebook still displays the conf object.
conf.setAppName('spark-basic')

<pyspark.conf.SparkConf at 0x219950780c8>

In [4]:
# Create the SparkContext from the `conf` built in the previous cell rather
# than repeating the master / app-name literals here.
# getOrCreate() returns the already-active context when this cell is re-run;
# calling the SparkContext constructor a second time raises instead, because
# only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)

# 03. Using RDD, map and take

In [5]:
# Distribute 0..999, pair every number with its parity, and fetch the first ten pairs.
numbers = sc.parallelize(range(1000))
with_parity = numbers.map(lambda n: (n, n % 2))
# take() is an action: despite its name, `rdd` ends up holding a plain Python list.
rdd = with_parity.take(10)
print(rdd)

[(0, 0), (1, 1), (2, 0), (3, 1), (4, 0), (5, 1), (6, 0), (7, 1), (8, 0), (9, 1)]


# 04. Reading text file and applying actions

Read a text file and apply the built-in RDD actions:
count, collect, first, take, takeSample.

In [6]:
# Reading a text file:
# textFile() returns an RDD (see the type displayed below); judging by the
# later output, each element is one line of 'hello.txt'.
txt = sc.textFile('hello.txt')
type(txt)

pyspark.rdd.RDD

In [7]:
# collect() brings the whole RDD back to the driver as a list (discarded here).
txt.collect()

# Each action is run and printed in turn, next to its label.
# takeSample(withReplacement, num, seed): 5 samples, without replacement, seed 1.
for label, action in (
    ("first(): ", txt.first),
    ("count(): ", txt.count),
    ("take(): ", lambda: txt.take(5)),
    ("takeSample(): ", lambda: txt.takeSample(False, 5, 1)),
):
    print(label, action())


first():  Line 1
count():  1244
take():  ['Line 1', 'Line 2', 'Line 3', 'Line 4', 'Line 5']
takeSample():  ['Last Line it is.', 'This is line number two', 'Welcome to PySpark!!', 'Last Line it is.', 'Line 3']


# 05. Examples: Transformations

Transformations: map, flatMap, filter, mapPartitions, mapPartitionsWithIndex, sample, union, intersection, distinct, groupBy, keyBy, zip, zipWithIndex, coalesce, repartition, sortBy

In [8]:
# map: apply a function to every element; one output per input.
x = sc.parallelize(["Red", "Green", "Blue", "Yellow"])
# Append the character "2" to each colour name.
y = x.map(lambda colour: colour + "2")
y.collect()

['Red2', 'Green2', 'Blue2', 'Yellow2']

In [9]:
# flatMap: every element expands to a sequence, and the sequences are flattened.
x = sc.parallelize([2, 3, 4])
# Each n becomes 1 .. n-1, so 2 -> [1], 3 -> [1, 2], 4 -> [1, 2, 3].
expanded = x.flatMap(lambda upper: range(1, upper))
y = expanded.collect()
print(y)

[1, 1, 2, 1, 2, 3]


In [10]:
# filter: keep only the elements for which the predicate is true.
def is_even(n):
    """Return True when n is divisible by 2."""
    return n % 2 == 0


x = sc.parallelize([2, 5, 6, 8, 1, 5, 8, 9, 6, 3])
y = x.filter(is_even).collect()
y

[2, 6, 8, 8, 6]

In [11]:
# sample(withReplacement, fraction, seed): randomly keep a fraction of the elements.
# `fraction` is not an exact count: 0.8 of 9 elements returned only 4 in the
# run shown below, while a fraction of 1 returned every element.
x = sc.parallelize(range(1,10))
print(x.sample(False,0.8,2).collect()) # (with or without replacement,fraction,seed)
print(x.sample(False,1,2).collect()) 

[2, 5, 6, 8]
[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [12]:
# union / intersection of two overlapping ranges (1..8 and 5..14).
x = sc.parallelize(range(1, 9))
y = sc.parallelize(range(5, 15))

# union keeps the duplicates (5..8 appear twice in the output below);
# intersection returns the common elements, not necessarily in sorted order.
combined = x.union(y)
common = x.intersection(y)

z = combined.collect()
z2 = common.collect()
print("Union: ",z," Intersection: ",z2)

Union:  [1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]  Intersection:  [6, 8, 5, 7]


In [13]:
# sortBy: order the elements by a key function (ascending here).
# Alternative example with plain integers:
#   x = sc.parallelize([2,8,9,5,4,7,1,1,2,3])
#   x.sortBy(lambda x:x,True).collect()
y = sc.parallelize([("F", 222), ("Z", 28), ("I", 15), ("D", 10)])
# The identity key compares whole tuples, so the letter decides the order first.
y.sortBy(keyfunc=lambda pair: pair, ascending=True).collect()

[('D', 10), ('F', 222), ('I', 15), ('Z', 28)]

In [14]:
# mapPartitions: the function is called once per partition with an iterator
# over that partition's elements.
x = sc.parallelize([1, 2, 3, 4, 5, 6], numSlices=2)


def f(cluster):
    """Yield the sum of all elements in one partition."""
    yield sum(cluster)


# Two partitions -> two sums.
x.mapPartitions(f).collect()

[6, 15]

In [15]:
# mapPartitionsWithIndex: like mapPartitions, but the function also receives
# the index of the partition it is processing.
x = sc.parallelize([1, 2, 3, 4, 5, 6], numSlices=5)


def f(index, cluster):
    """Yield only the partition index, ignoring the partition's data."""
    yield index


# Five partitions -> indices 0..4, whose sum is 10.
x.mapPartitionsWithIndex(f).sum()

10

In [16]:
# groupBy:
# Group the numbers by parity: key 0 collects the evens, key 1 the odds.
rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
result = rdd.groupBy(lambda n: n % 2).collect()
# Each group is an iterable; sort the groups and their members for a stable display.
sorted((key, sorted(members)) for key, members in result)

[(0, [2, 8]), (1, [1, 1, 3, 5])]

In [17]:
# keyBy: turn each element n into the pair (n*n, n).
x = sc.parallelize(range(3)).keyBy(lambda n: n * n)
# A second pair RDD: (0, 0) .. (4, 4).
y = sc.parallelize(zip(range(5), range(5)))
# cogroup: for every key, gather the values from x and from y side by side.
cogrouped = sorted(x.cogroup(y).collect())
[(key, [list(side) for side in sides]) for key, sides in cogrouped]

[(0, [[0], [0]]),
 (1, [[1], [1]]),
 (2, [[], [2]]),
 (3, [[], [3]]),
 (4, [[2], [4]])]

In [18]:
# zip: pair up the elements of two RDDs position by position.
x = sc.parallelize(range(5))
y = sc.parallelize(range(1000, 1005))
paired = x.zip(y)
paired.collect()

[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]

In [19]:
#zipWithIndex: pair every element with its position in the RDD (indices start at 0, as the output shows).
sc.parallelize(["a","b","c","d"]).zipWithIndex().collect()

[('a', 0), ('b', 1), ('c', 2), ('d', 3)]

In [20]:
# Partitioning: spread seven elements over four partitions.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7], numSlices=4)
# glom() turns each partition into a list, which makes the layout visible.
per_partition = rdd.glom()
per_partition.collect()

[[1], [2, 3], [4, 5], [6, 7]]

In [21]:
# repartition(2): redistribute the 4-partition `rdd` from the previous cell into 2 partitions.
rdd.repartition(2).glom().collect()

[[1, 4, 5, 6, 7], [2, 3]]

In [22]:
#Coalesce:
# Baseline: five elements spread over 3 partitions; glom() shows one list per partition.
sc.parallelize([1,2,3,4,5],3).glom().collect()

[[1], [2, 3], [4, 5]]

In [23]:
# coalesce(2): the same data, reduced from 3 partitions to 2.
sc.parallelize([1,2,3,4,5],3).coalesce(2).glom().collect()

[[1], [2, 3, 4, 5]]

# 06. Connecting to MySQL

In [36]:
# Connection settings for a local MySQL server.
# NOTE(review): root with an empty password is only acceptable for local testing.
database = "sonuresodb"
table = "lyrics"
user = "root"
password = ""

# The cell that creates `spark` earlier in this notebook is commented out, so
# obtain a session here; getOrCreate() reuses an already-running one.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# MySQL JDBC URLs have the form jdbc:mysql://host:port/database. The previous
# "jdbc:sqlserver://...;databaseName=..." form is SQL Server syntax and does
# not match the MySQL driver class configured below.
# NOTE(review): the MySQL Connector/J jar must be on Spark's classpath; newer
# connector versions name the driver class "com.mysql.cj.jdbc.Driver" — confirm
# against the installed connector.
jdbcDF = (
    spark.read.format("jdbc")
    .option("url", f"jdbc:mysql://localhost:3306/{database}")
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .option("driver", "com.mysql.jdbc.Driver")
    .load()
)

In [None]:
#====END============