## This notebook is part of Hadoop and Spark training delivered by IT-DB group
### SPARK MLlib Hands-On Lab
_ by Prasanth Kothuri _

 ### Import the required libraries

In [1]:
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.tree import DecisionTreeModel
from pyspark.mllib.util import MLUtils
from collections import namedtuple
from pprint import pprint

### create spark context

In [2]:
sc = SparkContext()

### define column mapping function

In [3]:
def map_record(record):
  columns = record.split(",")[:17]
  return flight(*columns)

### schema for named tuple

In [4]:
flight = namedtuple('flight',['dofM', 'dofW', 'carrier', 'tailnum', 'flnum', 'org_id', 'origin', 'dest_id', 'dest', 'crsdeptime', 'deptime', 'depdelaymins', 'crsarrtime', 'arrtime', 'arrdelay', 'crselapsedtime', 'dist'])

### load and map flight data

In [5]:
rdd = sc.textFile("../data/flightdata.csv")
flights_rdd = rdd.map(map_record).cache()
flights_rdd.take(2)

[flight(dofM=u'1', dofW=u'7', carrier=u'"AA"', tailnum=u'"N3CGAA"', flnum=u'"307"', org_id=u'11292', origin=u'"DEN"', dest_id=u'14107', dest=u'"PHX"', crsdeptime=u'"1145"', deptime=u'"1135"', depdelaymins=u'0.00', crsarrtime=u'"1345"', arrtime=u'"1328"', arrdelay=u'-17.00', crselapsedtime=u'120.00', dist=u'602.00'),
 flight(dofM=u'1', dofW=u'7', carrier=u'"AA"', tailnum=u'"N3CGAA"', flnum=u'"307"', org_id=u'14107', origin=u'"PHX"', dest_id=u'14057', dest=u'"PDX"', crsdeptime=u'"1510"', deptime=u'"1502"', depdelaymins=u'0.00', crsarrtime=u'"1701"', arrtime=u'"1653"', arrdelay=u'-8.00', crselapsedtime=u'171.00', dist=u'1009.00')]

### convert string data to numeric
_ define dict mapping for  carrier, origin and destination _

In [6]:
carrierMap = {}
for i in flights_rdd.map(lambda flight : flight.carrier.strip('"')).distinct().collect():
    carrierMap[i] = len(carrierMap)+1

In [7]:
originMap = {}
for i in flights_rdd.map(lambda flight : flight.origin.strip('"')).distinct().collect():
    originMap[i] = len(originMap)+1

In [8]:
destMap = {}
for i in flights_rdd.map(lambda flight : flight.dest.strip('"')).distinct().collect():
    destMap[i] = len(destMap)+1

In [9]:
pprint(destMap)

{u'ABE': 280,
 u'ABI': 281,
 u'ABQ': 283,
 u'ABR': 138,
 u'ABY': 288,
 u'ACT': 234,
 u'ACV': 192,
 u'ACY': 80,
 u'ADK': 252,
 u'ADQ': 275,
 u'AEX': 196,
 u'AGS': 10,
 u'ALB': 99,
 u'AMA': 42,
 u'ANC': 220,
 u'APN': 32,
 u'ASE': 87,
 u'ATL': 77,
 u'ATW': 232,
 u'AUS': 91,
 u'AVL': 67,
 u'AVP': 64,
 u'AZO': 262,
 u'BDL': 210,
 u'BET': 128,
 u'BFL': 289,
 u'BGM': 256,
 u'BHM': 145,
 u'BIL': 119,
 u'BIS': 277,
 u'BJI': 100,
 u'BLI': 71,
 u'BMI': 204,
 u'BNA': 25,
 u'BOI': 161,
 u'BOS': 159,
 u'BPT': 268,
 u'BQK': 177,
 u'BQN': 18,
 u'BRD': 261,
 u'BRO': 101,
 u'BRW': 30,
 u'BTM': 137,
 u'BTR': 255,
 u'BTV': 218,
 u'BUF': 49,
 u'BUR': 60,
 u'BWI': 170,
 u'BZN': 235,
 u'CAE': 117,
 u'CAK': 95,
 u'CDC': 254,
 u'CDV': 102,
 u'CHA': 173,
 u'CHO': 181,
 u'CHS': 180,
 u'CID': 238,
 u'CIU': 116,
 u'CLE': 245,
 u'CLL': 110,
 u'CLT': 5,
 u'CMH': 195,
 u'CMX': 264,
 u'COD': 233,
 u'COS': 75,
 u'CPR': 9,
 u'CRP': 35,
 u'CRW': 194,
 u'CSG': 55,
 u'CVG': 269,
 u'CWA': 140,
 u'DAB': 146,
 u'DAL': 45,
 u'

### create delayed label ###

In [10]:
def delayed(record):
  if record:
        if float(record) > 40:
            return 1.0
        else:
            return 0.0
  else:
    return 0.0

### build the features rdd

In [11]:
features_rdd = flights_rdd.map(lambda flight : [delayed(flight.depdelaymins), float(int(flight.dofM)), float(int(flight.dofW)), float(int(flight.crsdeptime.strip('"'))), float(int(flight.crsarrtime.strip('"'))), float(carrierMap[flight.carrier.strip('"')]), float(flight.crselapsedtime.strip('"')) if flight.crselapsedtime else 0.0, float(originMap[flight.origin.strip('"')]), float(destMap[flight.dest.strip('"')])])

In [12]:
mldata = features_rdd.map(lambda x : LabeledPoint(x[0], Vectors.dense(x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8])))
#mldata.take(5)

### split the data into training and test

In [13]:
mldata0 = mldata.filter(lambda x : x.label == 0).randomSplit([0.85, 0.15])[1]
mldata1 = mldata.filter(lambda x : x.label != 0)
mldata2 = mldata0 + mldata1
splits = mldata2.randomSplit([0.7, 0.3])
(trainingData, testData) = (splits[0], splits[1])

testData.take(1)

[LabeledPoint(0.0, [1.0,7.0,2129.0,2344.0,1.0,195.0,2.0,107.0])]

### build (train) the model

In [14]:
model = DecisionTree.trainClassifier(trainingData, 2, {}, "gini", 9, 7000)

In [15]:
model.toDebugString

<bound method DecisionTreeModel.toDebugString of DecisionTreeModel classifier of depth 9 with 889 nodes>

### test the model

In [16]:
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

### What is the accuracy?

In [17]:
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Test Error = 0.330051555997


### build (train) the model

In [18]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
model = RandomForest.trainClassifier(trainingData, 2, {}, 20, "auto", "gini", 9, 7000)

In [19]:
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Test Error = 0.327194235667
