In [1]:
# Intialization
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell' 
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().getOrCreate()

sc = spark.sparkContext

In [3]:
!head ./flights.csv


1,3,4,2003,2211,WN,335,N712SW,128,116,-14,8,IAD,TPA,810,4,8,0,,0
1,3,4,926,1054,WN,1746,N612SW,88,78,-6,-4,IND,BWI,515,3,7,0,,0
1,3,4,1940,2121,WN,378,N726SW,101,87,11,25,IND,JAX,688,4,10,0,,0
1,3,4,1937,2037,WN,509,N763SW,240,230,57,67,IND,LAS,1591,3,7,0,,0
1,3,4,754,940,WN,1144,N778SW,226,205,-15,9,IND,PHX,1489,5,16,0,,0
1,3,4,1422,1657,WN,188,N215WN,155,143,47,87,ISP,FLL,1093,6,6,0,,0
1,3,4,1954,2239,WN,1754,N243WN,165,155,4,29,ISP,FLL,1093,3,7,0,,0
1,3,4,636,921,WN,2275,N454WN,165,147,-24,1,ISP,FLL,1093,5,13,0,,0
1,3,4,2107,2334,WN,362,N798SW,147,134,64,82,ISP,MCO,972,6,7,0,,0
1,3,4,1312,1546,WN,1397,N247WN,154,140,-4,12,ISP,MCO,972,7,7,0,,0


In [4]:
!pwd

/home/talentum/test-jupyter/test/PairRdd


In [5]:
file_path = 'file:///home/talentum/test-jupyter/test/PairRdd/flights.csv'
rdd = sc.textFile(file_path)

In [6]:
parsedRdd = rdd.map(lambda line: line.split(","))

In [7]:
flightOrigDestRdd = parsedRdd.map(lambda columns: (columns[12], columns[13]))  # (origin, destination)
print("Flight Origin-Destination RDD:")
print(flightOrigDestRdd.take(5))

Flight Origin-Destination RDD:
[('IAD', 'TPA'), ('IND', 'BWI'), ('IND', 'JAX'), ('IND', 'LAS'), ('IND', 'PHX')]


In [8]:
cityRdd = parsedRdd.flatMap(lambda columns: [(columns[12], columns[12]), (columns[13], columns[13])])  # (code, code)
print("City RDD:")
print(cityRdd.take(5))

City RDD:
[('IAD', 'IAD'), ('TPA', 'TPA'), ('IND', 'IND'), ('BWI', 'BWI'), ('IND', 'IND')]


In [9]:
origJoinRdd = flightOrigDestRdd.join(cityRdd)  # Join based on airport codes
print("Origin Join RDD:")
print(origJoinRdd.take(5))

Origin Join RDD:
[('LAS', ('ABQ', 'LAS')), ('LAS', ('ABQ', 'LAS')), ('LAS', ('ABQ', 'LAS')), ('LAS', ('ABQ', 'LAS')), ('LAS', ('ABQ', 'LAS'))]


In [10]:
citiesCleanedRdd = origJoinRdd.values()  # Extract joined values
print("Cleaned Cities RDD:")
print(citiesCleanedRdd.take(5))

Cleaned Cities RDD:
[('ABQ', 'LAS'), ('ABQ', 'LAS'), ('ABQ', 'LAS'), ('ABQ', 'LAS'), ('ABQ', 'LAS')]


In [11]:
citiesKV = citiesCleanedRdd.map(lambda city: (city, 1))
print("Cities Key-Value RDD:")
print(citiesKV.take(5))

Cities Key-Value RDD:
[(('ABQ', 'LAS'), 1), (('ABQ', 'LAS'), 1), (('ABQ', 'LAS'), 1), (('ABQ', 'LAS'), 1), (('ABQ', 'LAS'), 1)]


In [None]:
citiesReduceSortedRdd = citiesKV.reduceByKey(lambda x, y: x + y).map(lambda x: (x[1], x[0])).sortByKey(ascending=False)
print("Reduced and Sorted Cities RDD:")
print(citiesReduceSortedRdd.take(5))

In [4]:
file_path = 'file:///home/talentum/test-jupyter/test/PairRdd/flights.csv'
delayRDD = sc.textFile(file_path).map(
    lambda val: val.split(',')
).filter(
    lambda delay: int(delay[11])> 15
).map(
    lambda column: (column[5], column[11]) 
)

delayRDD.take(5)

[('WN', '25'), ('WN', '67'), ('WN', '87'), ('WN', '29'), ('WN', '82')]

In [5]:
delayMaxRDD = delayRDD.reduceByKey(
    lambda x,y: max(int(x), int(y))
)

delayMaxRDD.take(5)

[('XE', 781), ('YV', 526), ('OH', 680), ('OO', 767), ('UA', 1268)]

In [6]:
file_path = 'file:///home/talentum/test-jupyter/test/PairRdd/plane-data.csv'
plabeDataRdd = sc.textFile(file_path) 
plabeDataRdd.count()

5030

In [7]:
cleanPlabeDataRdd = plabeDataRdd.map(
    lambda val: val.split(',')
).filter(
    lambda element: len(element)==9
)

cleanPlabeDataRdd.count()

4481