**INITIALIZING**

In [0]:
# install java libs and spark.
! apt-get install openjdk-8-jdk-headless -qq > /dev/null
! wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
! tar xf spark-2.4.4-bin-hadoop2.7.tgz
! pip install -q findspark
!pip install pyspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"



**MOUNTING DRIVE**

In [0]:
# Attach Google Drive to the Colab filesystem; its contents then appear
# under the /content/gdrive mount point.
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**DOWNLOADING DATASETS**

In [0]:
# Download a dataset archive straight into the Google Drive "Colab_Dataset" folder.

import requests

# Only the 1987 archive is fetched by this cell; point file_url at another
# year's archive and re-run the cell to add more data.

file_url = "http://www.rdatasciencecases.org/Data/Airline/1987.csv.bz2"

# Name the local copy after the remote file so that the file name always
# matches the data it contains (previously 1987 data was saved as 2008.csv.bz2).
dest_path = "/content/gdrive/My Drive/Colab_Dataset/" + file_url.rsplit("/", 1)[-1]

# The context manager releases the streamed connection even if writing fails.
with requests.get(file_url, stream=True) as r:
    # Fail loudly on an HTTP error instead of saving an error page as a .bz2 file.
    r.raise_for_status()
    with open(dest_path, "wb") as file:
        # Stream in 1 MiB pieces so the archive is never held in memory at once.
        for block in r.iter_content(chunk_size=1024 * 1024):
            if block:
                file.write(block)

**IMPORTING PYSPARK SESSION**

In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.functions import isnan, when, count, col
# Settings consumed when the Spark session is built.

SPARK_URL = "local[*]"  # master URL: run Spark locally, one worker thread per core
APP_NAME = "Flight Delays"  # application name given to the session builder

In [0]:
# Build the session from the constants above; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(SPARK_URL)
    .getOrCreate()
)

**READING DATA FILES AND RETURNING RDD**

In [0]:
sc = spark.sparkContext
# Read every bzip2-compressed CSV in the Drive folder as one RDD of text lines.
# (The glob previously contained a stray double slash before the wildcard.)
rdd = sc.textFile('/content/gdrive/My Drive/Colab_Dataset/*.csv.bz2')
# Preview the first 10 lines, header included.
rdd.take(10)

['Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay',
 '1987,10,14,3,741,730,912,849,PS,1451,NA,91,79,NA,23,11,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '1987,10,15,4,729,730,903,849,PS,1451,NA,94,79,NA,14,-1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '1987,10,17,6,741,730,918,849,PS,1451,NA,97,79,NA,29,11,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '1987,10,18,7,729,730,847,849,PS,1451,NA,78,79,NA,-2,-1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '1987,10,19,1,749,730,922,849,PS,1451,NA,93,79,NA,33,19,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '1987,10,21,3,728,730,848,849,PS,1451,NA,80,79,NA,-1,-2,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '1987,10,22,4,728,730,852,849,PS,1451,NA,84,79,NA,3,-2,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '19

**TOTAL NUMBER OF LINES**

In [0]:
# Count every line in the RDD. The header filter is only applied in a later
# cell, so header lines are still included in this total.
rdd.count()

123534991

**REMOVING HEADER**

In [0]:
# The very first line of the RDD is the CSV column header.
header = rdd.first()
# Keep only the lines that differ from it; any repeat of the header line
# elsewhere in the RDD is dropped as well.
rdd2 = rdd.filter(lambda row: row != header)
# Sanity check: the first two lines should now be data rows.
rdd2.take(2)

['1987,10,14,3,741,730,912,849,PS,1451,NA,91,79,NA,23,11,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA',
 '1987,10,15,4,729,730,903,849,PS,1451,NA,94,79,NA,14,-1,SAN,SFO,447,NA,NA,0,NA,0,NA,NA,NA,NA,NA']

**SELECTING THE UNIQUECARRIER COLUMN**

In [0]:
# Field 8 (0-based) of each comma-separated line is the UniqueCarrier code,
# per the header row shown earlier.
rdd3 = rdd2.map(
    lambda row: row.split(',')[8]
)


In [0]:
# Peek at the first 10 extracted carrier codes.
rdd3.take(10)

['PI', 'PI', 'PI', 'PI', 'PI', 'PI', 'PI', 'PI', 'PI', 'PI']

**IMPLEMENTING MAPREDUCE**

In [0]:
# Map step: emit a (carrier, 1) pair for every flight.
# Reduce step: add up the ones per carrier to get its flight count.
rdd4 = (
    rdd3
    .map(lambda carrier: (carrier, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [19]:
# Show up to 30 (carrier, flight count) pairs.
rdd4.take(30)

[('MQ', 3954895),
 ('PI', 873957),
 ('XE', 2350309),
 ('AQ', 154381),
 ('AA', 14984647),
 ('PS', 83617),
 ('AS', 2878021),
 ('DL', 16547870),
 ('UA', 13299817),
 ('FL', 1265138),
 ('NW', 10292627),
 ('DH', 693047),
 ('EA', 919785),
 ('HA', 274265),
 ('F9', 336958),
 ('WN', 15976022),
 ('PA (1)', 316167),
 ('US', 14075530),
 ('HP', 3636682),
 ('CO', 8145788),
 ('OO', 3090853),
 ('TZ', 208420),
 ('B6', 811341),
 ('OH', 1464176),
 ('ML (1)', 70622),
 ('TW', 3757747),
 ('EV', 1697172),
 ('YV', 854056),
 ('9E', 521059)]