In [0]:
import time
import random
import numpy as np
import pandas as pd
import airportsdata
from itertools import chain
from datetime import datetime 
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row, Column
from pyspark.ml.feature import Imputer
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import SparseVector, DenseVector
from pyspark.sql.types import BooleanType, StringType, IntegerType, DoubleType, LongType
from pyspark.ml.feature import OneHotEncoder, StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Shortcut to the SparkContext behind the `spark` session. `spark` is not created
# anywhere in this cell — presumably it is injected by the notebook runtime
# (TODO confirm; otherwise build it with SparkSession.builder.getOrCreate()).
sc = spark.sparkContext

In [0]:
## Logistic Regression

In [0]:
def get_logistic_regression(train_df, features='dev'):
    """Tune a logistic regression with TimeSeriesCrossValidator and return the winner.

    Relies on names that are not defined in this cell: ``label_cat`` (label
    column), ``nFolds`` (fold count) and ``TimeSeriesCrossValidator``.

    Args:
        train_df: training data handed to the cross-validator's ``fit``. The
            cross-validator is configured with ``foldCol='foldCol'``, so the
            frame presumably needs that column — verify against
            TimeSeriesCrossValidator.
        features: ``'prod'`` selects the ``pcaFeatures`` column; any other
            value (default ``'dev'``) selects ``features``.

    Returns:
        The ``bestModel`` of the fitted cross-validator.
    """
    # Only the exact string 'prod' switches to the PCA column.
    input_col = 'pcaFeatures' if features == 'prod' else 'features'

    estimator = LogisticRegression(
        featuresCol=input_col,
        labelCol=label_cat,
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
    )

    # 2 regParam values x 3 elasticNetParam values = 6 candidate settings.
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(estimator.regParam, [0.01, 0.1])
    grid_builder = grid_builder.addGrid(estimator.elasticNetParam, [0.0, 0.5, 1.0])
    search_grid = grid_builder.build()

    # Candidates are ranked by area under the ROC curve.
    auc_evaluator = BinaryClassificationEvaluator(labelCol=label_cat,
                                                  metricName='areaUnderROC')

    tuner = TimeSeriesCrossValidator(
        estimator=estimator,
        estimatorParamMaps=search_grid,
        evaluator=auc_evaluator,
        parallelism=3,
        foldCol='foldCol',
        numFolds=nFolds,
    )

    # Fit every candidate and hand back only the best-scoring model.
    return tuner.fit(train_df).bestModel

In [0]:
def cvmodel_perf_summary(log_Model):
    """Print the training-summary metrics of a fitted logistic regression model.

    Reads ``log_Model.summary`` and prints accuracy, weighted false-positive
    rate, weighted true-positive rate, weighted F-measure, weighted precision
    and area under the ROC curve.

    Reference:
    https://spark.apache.org/docs/latest/ml-classification-regression.html#logistic-regression

    Args:
        log_Model: fitted model exposing a ``summary`` attribute. The summary
            must provide ``areaUnderROC``; per the Spark docs that is only
            available on binary logistic regression summaries.

    Returns:
        None. The metrics are written to stdout.
    """
    trainingSummary = log_Model.summary
    accuracy = trainingSummary.accuracy
    falsePositiveRate = trainingSummary.weightedFalsePositiveRate
    truePositiveRate = trainingSummary.weightedTruePositiveRate
    # weightedFMeasure is a method (it accepts an optional beta), unlike the
    # other metrics, which are plain attributes.
    fMeasure = trainingSummary.weightedFMeasure()
    precision = trainingSummary.weightedPrecision
    areaUnderROC = trainingSummary.areaUnderROC
    # The AUC was previously computed and then dropped; report it with the rest.
    print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nAUC: %s"
          % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision,
             areaUnderROC))

In [0]:
## Linear Regression

In [0]:
def get_linear_regression(train_df, features='dev'):
    """Tune a linear regression with TimeSeriesCrossValidator and return the winner.

    Relies on names that are not defined in this cell: ``label_cont`` (label
    column), ``nFolds`` (fold count) and ``TimeSeriesCrossValidator``.

    Args:
        train_df: training data handed to the cross-validator's ``fit``. The
            cross-validator is configured with ``foldCol='foldCol'``, so the
            frame presumably needs that column — verify against
            TimeSeriesCrossValidator.
        features: ``'prod'`` selects the ``pcaFeatures`` column; any other
            value (default ``'dev'``) selects ``features``.

    Returns:
        The ``bestModel`` of the fitted cross-validator.
    """
    # Only the exact string 'prod' switches to the PCA column.
    input_col = 'pcaFeatures' if features == 'prod' else 'features'

    estimator = LinearRegression(
        featuresCol=input_col,
        labelCol=label_cont,
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
    )

    # 2 regParam values x 3 elasticNetParam values = 6 candidate settings.
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(estimator.regParam, [0.01, 0.1])
    grid_builder = grid_builder.addGrid(estimator.elasticNetParam, [0.0, 0.5, 1.0])
    search_grid = grid_builder.build()

    # Candidates are ranked by R^2 on the held-out folds.
    r2_evaluator = RegressionEvaluator(labelCol=label_cont, metricName='r2')

    tuner = TimeSeriesCrossValidator(
        estimator=estimator,
        estimatorParamMaps=search_grid,
        evaluator=r2_evaluator,
        parallelism=3,
        foldCol='foldCol',
        numFolds=nFolds,
    )

    # Fit every candidate and hand back only the best-scoring model.
    return tuner.fit(train_df).bestModel

In [0]:
def decisiontree_perf_summary(predictions, label_col='DEP_DEL15', metrics=None):
    """Evaluate a predictions frame with MulticlassClassificationEvaluator and print the scores.

    Args:
        predictions: frame passed to the evaluator; it is read through the
            ``prediction`` column and the ``label_col`` column.
        label_col: name of the ground-truth label column. Defaults to
            ``'DEP_DEL15'``, the value that used to be hard-coded.
        metrics: iterable of evaluator metric names. Defaults to accuracy,
            weightedPrecision, weightedRecall and weightedFMeasure. Names
            ending in ``ByLabel`` (e.g. ``'recallByLabel'``) are evaluated once
            for label 0.0 and once for label 1.0 and reported as a two-item
            list — this assumes a binary 0/1 label.

    Returns:
        None. The results are written to stdout.
    """
    if metrics is None:
        metrics = ['accuracy', 'weightedPrecision', 'weightedRecall', 'weightedFMeasure']

    eval_ = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_col)

    # Metric name -> score (or [score for label 0.0, score for label 1.0]).
    results = {}
    for m in metrics:
        if m.endswith('ByLabel'):
            # Per-label metrics need metricLabel set; evaluate once per class.
            results[m] = [
                eval_.evaluate(predictions, {eval_.metricName: m, eval_.metricLabel: label})
                for label in (0.0, 1.0)
            ]
        else:
            results[m] = eval_.evaluate(predictions, {eval_.metricName: m})

    # Print the results
    print('Performance metrics')
    print('------------------------------------------------------------------------------------------------')
    for name, value in results.items():
        print(f'{name}: {value}')
    