### Import PySpark

In [2]:
import pyspark
from pyspark.sql import SparkSession

### Create SparkSession

In [3]:
# Start a Spark session (or reuse one that already exists in this kernel).
# "local[*]" runs Spark in-process, using all logical cores of this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("docker_Spark")
    .getOrCreate()
)

### Import Data

In [4]:
%%time
data = spark.read.csv('*.csv.bz2',inferSchema=True, header=True)

CPU times: user 30 ms, sys: 30 ms, total: 60 ms
Wall time: 5min 46s


In [5]:
# Bring only the first row back to the driver, as a list containing one Row,
# to inspect the raw values.
data.limit(1).collect()

[Row(Year=2007, Month=1, DayofMonth=1, DayOfWeek=1, DepTime='1232', CRSDepTime=1225, ArrTime='1341', CRSArrTime=1340, UniqueCarrier='WN', FlightNum=2891, TailNum='N351', ActualElapsedTime='69', CRSElapsedTime='75', AirTime='54', ArrDelay='1', DepDelay='7', Origin='SMF', Dest='ONT', Distance='389', TaxiIn='4', TaxiOut='11', Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay='0', WeatherDelay='0', NASDelay='0', SecurityDelay='0', LateAircraftDelay='0')]

In [6]:
# Print the first five rows as a plain-text table.
data.show(n=5)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|2007|    1|         1|        1|   1232|      1225|   1341|      1340|           WN|     2891

### Show Schema

In [7]:
# Print every column with its inferred type and nullability in tree form.
data.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Carr

### Carrier Count

In [8]:
%%time
data.groupBy("UniqueCarrier").count().orderBy("count", ascending=False).show()

+-------------+--------+
|UniqueCarrier|   count|
+-------------+--------+
|           DL|16547870|
|           WN|15976022|
|           AA|14984647|
|           US|14075530|
|           UA|13299817|
|           NW|10292627|
|           CO| 8145788|
|           MQ| 3954895|
|           TW| 3757747|
|           HP| 3636682|
|           OO| 3090853|
|           AS| 2878021|
|           XE| 2350309|
|           EV| 1697172|
|           OH| 1464176|
|           FL| 1265138|
|           EA|  919785|
|           PI|  873957|
|           YV|  854056|
|           B6|  811341|
+-------------+--------+
only showing top 20 rows

CPU times: user 50 ms, sys: 10 ms, total: 60 ms
Wall time: 4min 6s
