**Docker Toolbox**

**PySpark Initialization**

In [5]:
import pyspark

# Create the SparkContext on all local cores, or reuse the one that is already
# active. Calling the SparkContext constructor directly fails with
# "Cannot run multiple SparkContexts at once" when this cell is re-run, so
# getOrCreate keeps the cell idempotent. Note: if a context already exists,
# the master setting below is ignored and the existing context is returned.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]')
)

**Getting working directory**

In [25]:
import os
# Display the kernel's current working directory; relative paths resolve against it.
os.getcwd()

'/home/jovyan'

**Loading data**

In [24]:
%%time
rdd = sc.textFile('/home/jovyan/work/data/*.csv.bz2')


CPU times: user 1.87 ms, sys: 25 µs, total: 1.89 ms
Wall time: 56.8 ms


In [26]:
# Peek at the first three raw lines; the first one is the CSV column header.
rdd.take(3)

['Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay',
 '2006,1,11,3,743,745,1024,1018,US,343,N657AW,281,273,223,6,-2,ATL,PHX,1587,45,13,0,,0,0,0,0,0,0',
 '2006,1,11,3,1053,1053,1313,1318,US,613,N834AW,260,265,214,-5,0,ATL,PHX,1587,27,19,0,,0,0,0,0,0,0']

**Getting the Total Line Count (header lines included)**

In [27]:
# Count every line in the raw RDD. Header lines have not been filtered out yet,
# so they are included in this total.
rdd.count()

123534991

**Removing Header**

In [28]:
# The first line of the raw data is the CSV column header.
header = rdd.first()

# Keep only the lines that differ from the header text; any line identical to
# it is dropped wherever it occurs in the input.
rdd2 = rdd.filter(lambda row: not (row == header))

# Sanity check: the first rows should now be data records.
rdd2.take(2)

['2006,1,11,3,743,745,1024,1018,US,343,N657AW,281,273,223,6,-2,ATL,PHX,1587,45,13,0,,0,0,0,0,0,0',
 '2006,1,11,3,1053,1053,1313,1318,US,613,N834AW,260,265,214,-5,0,ATL,PHX,1587,27,19,0,,0,0,0,0,0,0']

**Implementing MapReduce**

In [31]:
# Split each record on commas and keep field 8, which is the UniqueCarrier
# column (0-based position in the header row).
rdd3 = rdd2.map(
    lambda record: record.split(',')[8]
)

# Show the first ten extracted carrier codes.
rdd3.take(10)

['US', 'US', 'US', 'US', 'US', 'US', 'US', 'US', 'US', 'US']

In [32]:
# Word-count pattern: emit (carrier, 1) for every record, then add up the ones
# per carrier key to get the number of records for each carrier.
rdd4 = (
    rdd3
    .map(lambda carrier: (carrier, 1))
    .reduceByKey(lambda running_total, increment: running_total + increment)
)

**Number of records per UniqueCarrier**

In [33]:
# Fetch up to 30 (carrier, count) pairs to the driver; the pairs come back in
# no particular order.
rdd4.take(30)

[('MQ', 3954895),
 ('PI', 873957),
 ('XE', 2350309),
 ('AQ', 154381),
 ('AA', 14984647),
 ('PS', 83617),
 ('AS', 2878021),
 ('DL', 16547870),
 ('UA', 13299817),
 ('FL', 1265138),
 ('NW', 10292627),
 ('DH', 693047),
 ('EA', 919785),
 ('HA', 274265),
 ('F9', 336958),
 ('WN', 15976022),
 ('PA (1)', 316167),
 ('US', 14075530),
 ('OO', 3090853),
 ('CO', 8145788),
 ('HP', 3636682),
 ('OH', 1464176),
 ('TZ', 208420),
 ('B6', 811341),
 ('ML (1)', 70622),
 ('YV', 854056),
 ('EV', 1697172),
 ('TW', 3757747),
 ('9E', 521059)]

**DataFrame Implementation**

In [34]:
import pyspark
from pyspark.sql import SparkSession

In [36]:
# Build the SparkSession on all local cores, or reuse the session that is
# already active.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("docker_Spark")
    .getOrCreate()
)

**Reading Data**

In [37]:
# Read every bz2-compressed CSV in the data directory into one DataFrame.
# header="true" takes column names from the first line of each file;
# inferSchema="true" has Spark scan the data to infer column types.
DF = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv('/home/jovyan/work/data/*.csv.bz2')
)

**Record Count per UniqueCarrier**

In [38]:
# Count rows per carrier in Spark, collect the small grouped result to the
# driver, and sort the collected rows there with Python's sorted().
sorted(
    DF.groupBy(['UniqueCarrier'])
      .count()
      .collect()
)

[Row(UniqueCarrier='9E', count=521059),
 Row(UniqueCarrier='AA', count=14984647),
 Row(UniqueCarrier='AQ', count=154381),
 Row(UniqueCarrier='AS', count=2878021),
 Row(UniqueCarrier='B6', count=811341),
 Row(UniqueCarrier='CO', count=8145788),
 Row(UniqueCarrier='DH', count=693047),
 Row(UniqueCarrier='DL', count=16547870),
 Row(UniqueCarrier='EA', count=919785),
 Row(UniqueCarrier='EV', count=1697172),
 Row(UniqueCarrier='F9', count=336958),
 Row(UniqueCarrier='FL', count=1265138),
 Row(UniqueCarrier='HA', count=274265),
 Row(UniqueCarrier='HP', count=3636682),
 Row(UniqueCarrier='ML (1)', count=70622),
 Row(UniqueCarrier='MQ', count=3954895),
 Row(UniqueCarrier='NW', count=10292627),
 Row(UniqueCarrier='OH', count=1464176),
 Row(UniqueCarrier='OO', count=3090853),
 Row(UniqueCarrier='PA (1)', count=316167),
 Row(UniqueCarrier='PI', count=873957),
 Row(UniqueCarrier='PS', count=83617),
 Row(UniqueCarrier='TW', count=3757747),
 Row(UniqueCarrier='TZ', count=208420),
 Row(UniqueCarrier=