In [0]:
import numpy as np
import pandas as pd
import plotly as px
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from datetime import datetime
import time

import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.functions import isnan, when, count, col

from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, StandardScaler, OneHotEncoder, SQLTransformer
from pyspark.ml.classification import LogisticRegression 
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.sql import Window

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import mlflow
import mlflow.spark

## Final Model Variants
After various iterations of model tuning, feature engineering/selection, and performance analysis, we have settled on the final set of features we'd like to use for our **Random Forest** model. In this notebook we explore a few different subsets of those features and try some high-level hyperparameter tuning.

### Goal 
Build a final model pipeline and evaluate it on the train and validation sets for final hyperparameter tuning.

In [0]:
# Load the pre-split train / validation parquet parts.
# Parquet files embed their own schema, so the CSV-style "header" reader option is
# not needed, and the paths contain no placeholders, so plain string literals suffice.
train_data = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/project_data/train/part-00*.parquet")
val_data = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/project_data/validation/part-00*.parquet")

# Each count() triggers a Spark job over the corresponding split.
print("Train Data: ", train_data.count())
print("Validation Data: ", val_data.count())

**Note: `MINUTES_AFTER_MIDNIGHT_ORIGIN` and `MINUTES_AFTER_MIDNIGHT_DEST` are calculated in UTC time.**

In [0]:
# Feature columns retained for the final Random Forest variants.
final_features = [
    'DELAYS_SO_FAR', 'MINUTES_AFTER_MIDNIGHT_ORIGIN', 'MINUTES_AFTER_MIDNIGHT_DEST',
    'NETWORK_CONGESTION', 'AVG_VIS_DIS_ORIGIN', 'DEST_PR', 'ORIGIN_PR', 'AVG_DEW_DEG_ORIGIN',
    'CRS_ELAPSED_TIME', 'AVG_WND_SPEED_ORIGIN', 'AVG_WND_SPEED_DEST', 'QUARTER', 'DEP_HOUR_BIN',
    'ARR_HOUR_BIN', 'IS_MORNING_FLIGHT', 'IS_EVENING_FLIGHT',
]

# Target column, kept as a one-element list so it can be concatenated with feature lists.
label_col = ['DEP_DEL15']

# Order-preserving de-duplication. list(set(...)) returns the names in an arbitrary
# order that can change between kernel restarts (string hash randomisation), which
# would reshuffle the columns of every DataFrame selected from this list.
final_cols = list(dict.fromkeys(final_features + label_col))

print("Number of features: ", len(final_features))
print(final_cols)

In [0]:
# Restrict both splits to the modelling columns (features + label) in one pass.
train_final, val_final = (
    split.select(*final_cols) for split in (train_data, val_data)
)

# Sanity check: column count and inferred types of the trimmed training frame.
remaining_cols = len(train_final.columns)
print("Remaining Col Count: ", remaining_cols)
train_final.printSchema()

### Model Variant #1

In [0]:
# Variant 1 -- categorical inputs (string-indexed and one-hot encoded further down).
cat_cols = [
    'QUARTER',
    'IS_MORNING_FLIGHT',
    'IS_EVENING_FLIGHT',
]

# Variant 1 -- numeric inputs (handed to the vector assembler unchanged).
num_cols = [
    'DELAYS_SO_FAR',
    'MINUTES_AFTER_MIDNIGHT_ORIGIN',
    'MINUTES_AFTER_MIDNIGHT_DEST',
    'NETWORK_CONGESTION',
    'AVG_VIS_DIS_ORIGIN',
    'DEST_PR',
    'ORIGIN_PR',
    'AVG_DEW_DEG_ORIGIN',
    'CRS_ELAPSED_TIME',
    'AVG_WND_SPEED_ORIGIN',
    'AVG_WND_SPEED_DEST',
]

In [0]:
# Count the distinct levels of every categorical column to estimate how wide the
# feature vector becomes once each level gets its own one-hot slot.
# The loop variable is deliberately not called `col`: that name is imported from
# pyspark.sql.functions at the top of the notebook and rebinding it to a string
# would break any later `col(...)` call in the same kernel session.
cols_by_cat = 0
for cat_col in cat_cols:
    col_count = train_final.select(cat_col).distinct().count()
    cols_by_cat += col_count
    print(cat_col, ": ", col_count)

print()
tot_col_count = len(num_cols)+cols_by_cat
print("Number of columns after OHE: ", tot_col_count)

In [0]:
# Share of rows with DEP_DEL15 == 0 in the training split.
# Derived from train only so that nothing about validation leaks into the weights.
rows_label_zero = train_final.filter("DEP_DEL15 == 0").count()
rows_total = train_final.count()
balancing_ratio = rows_label_zero / rows_total
print("Balancing Ratio =", balancing_ratio)

In [0]:
# Per-row weight: rows with DEP_DEL15 == 1 receive `balancing_ratio`, every other
# row receives its complement. The same expression is attached to both splits.
class_weight_expr = when(f.col("DEP_DEL15") == 1, balancing_ratio).otherwise(1 - balancing_ratio)

train_final = train_final.withColumn("CLASS_WEIGHTS", class_weight_expr)
val_final = val_final.withColumn("CLASS_WEIGHTS", class_weight_expr)

In [0]:
#Returns a Pandas DF with top features and scores
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    """Pair each assembled feature with its importance score.

    Args:
        featureImp: importance scores, indexable by feature position.
        dataset: object whose ``schema[featuresCol].metadata`` holds the
            ``ml_attr`` -> ``attrs`` description of the feature vector.
        featuresCol: name of the assembled feature-vector column.

    Returns:
        A pandas DataFrame with one row per feature attribute (including its
        ``idx``) plus a ``score`` column, sorted by ``score`` descending.
    """
    # Look the attribute groups up once; each group maps to a list of attribute dicts.
    attr_groups = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]
    attr_records = [attr for group in attr_groups for attr in attr_groups[group]]

    varlist = pd.DataFrame(attr_records)
    # `idx` is the attribute's position inside the feature vector.
    varlist['score'] = varlist['idx'].map(lambda position: featureImp[position])
    ranked = varlist.sort_values('score', ascending=False)
    return ranked

In [0]:
# Columns for this variant: categorical + numeric inputs, the label, and the row weights.
# dict.fromkeys de-duplicates while keeping a stable order; list(set(...)) would
# return the columns in an arbitrary order that can change between kernel restarts.
cols = list(dict.fromkeys(cat_cols + num_cols + label_col)) + ["CLASS_WEIGHTS"]

# Rename the target to 'label', the column name the classifier and evaluator read.
train = train_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')
val = val_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')

# (row count, column count) per split, followed by a preview of the training frame.
print((train.count(), len(train.columns)))
print((val.count(), len(val.columns)))
display(train)

NETWORK_CONGESTION,label,DELAYS_SO_FAR,ORIGIN_PR,ARR_HOUR_BIN,AVG_WND_SPEED_ORIGIN,AVG_WND_SPEED_DEST,MINUTES_AFTER_MIDNIGHT_ORIGIN,DEST_PR,AVG_VIS_DIS_ORIGIN,MINUTES_AFTER_MIDNIGHT_DEST,AVG_DEW_DEG_ORIGIN,CRS_ELAPSED_TIME,DEP_HOUR_BIN,QUARTER,CLASS_WEIGHTS
4012.0,0.0,1.0,0.0012740992192146,3,26.0,34.75,330,0.0078058444034201,15288.5,380,156.0,50.0,3,1,0.1786318007205677
582.0,0.0,3.0,0.0012740992192146,3,18.0,14.25,363,0.0078058444034201,16093.0,412,86.5,49.0,3,1,0.1786318007205677
37440.0,0.0,0.0,0.0012740992192146,1,27.75,29.0,1090,0.0035629037678561,12874.5,1128,139.0,38.0,1,1,0.1786318007205677
3813.0,1.0,2.0,0.0005011645446733933,1,31.39158188310781,32.492063492063494,505,0.0171480249399023,15000.50264618144,691,83.85062887816409,186.0,0,1,0.8213681992794323
56430.0,0.0,0.0,0.0012740992192146,1,28.42105263157895,58.85714285714285,1090,0.0035629037678561,4378.294117647059,1128,131.11764705882354,38.0,1,1,0.1786318007205677
249040.0,0.0,1.0,0.0004492987497042213,3,35.714285714285715,59.458333333333336,1320,0.0239871068543374,16093.0,1409,-70.0,89.0,2,1,0.1786318007205677
86845.0,0.0,0.0,0.0004492987497042213,2,22.857142857142858,17.958333333333332,1245,0.0239871068543374,16093.0,1334,-10.0,89.0,2,1,0.1786318007205677
33580.0,0.0,2.0,0.0012740992192146,2,37.4,65.5,1389,0.0078058444034201,16093.0,2,160.33333333333334,53.0,2,1,0.1786318007205677
368.0,0.0,0.0,0.0012740992192146,3,53.142857142857146,70.0,120,0.0078058444034201,4826.0,172,171.2,52.0,3,1,0.1786318007205677
66102.0,0.0,0.0,0.0012740992192146,3,36.0,50.0,60,0.0078058444034201,16093.0,112,104.5,52.0,3,1,0.1786318007205677


In [0]:
# Output column names for the one-hot vectors, one per categorical input.
col_vec_out = [name + '_catVec' for name in cat_cols]

# Step 1 per categorical column: map raw values to label indices (<col> -> <col>_tmp).
indexers = [StringIndexer(inputCol=name, outputCol=name + '_tmp') for name in cat_cols]

# Step 2 per categorical column: expand the index into a sparse one-hot vector,
# keeping a slot for every category (dropLast=False).
encoders = [
    OneHotEncoder(dropLast=False, inputCol=name + '_tmp', outputCol=vec_name)
    for name, vec_name in zip(cat_cols, col_vec_out)
]

# Interleave so every indexer is immediately followed by its encoder.
stages = [stage for pair in zip(indexers, encoders) for stage in pair]

# Final step: concatenate numeric columns and one-hot vectors into "features".
# Rows the assembler considers invalid are skipped (handleInvalid='skip').
res_cols = num_cols + col_vec_out
vector_assembler = VectorAssembler(inputCols=res_cols, outputCol="features", handleInvalid='skip')
stages.append(vector_assembler)

In [0]:
#Helper function to check and create a path if it doesn't exist already
def path_exists(path):
    """Report whether `path` can be listed, creating it when it cannot.

    Args:
        path: DBFS path to probe.

    Returns:
        True when `dbutils.fs.ls(path)` succeeds; False when it raises, in
        which case `dbutils.fs.mkdirs(path)` is called before returning.
    """
    print("Path: ", path)
    try:
        dbutils.fs.ls(path)
    except Exception:
        # Any failure to list is treated as "directory missing": create it.
        dbutils.fs.mkdirs(path)
        return False
    else:
        return True

In [0]:
%%time

model_version = "rf_model_21"
nTree = 40
mDep = 15
subSamp = 1.0

# Open a fresh MLflow run on every execution so that re-running this cell never
# collides with the run created by a previous execution.
with mlflow.start_run():

    # --- Preprocessing only (index + one-hot encode + assemble); no classifier yet ---
    start = time.time()
    pipeline = Pipeline().setStages(stages)

    # Fit the preprocessing stages on TRAIN only and reuse that fitted model for
    # validation. StringIndexer orders labels by frequency, so fitting a second
    # pipeline on validation could map a category to a different one-hot slot than
    # the one the forest was trained on.
    # NOTE: with StringIndexer's default handleInvalid='error', a category that only
    # occurs in validation now raises at transform time instead of being silently re-indexed.
    pipeline_model = pipeline.fit(train)
    train_pip = pipeline_model.transform(train)
    val_pip = pipeline_model.transform(val)
    done = time.time()
    mlflow.log_metric('pip_fit_time' ,done - start)

    rf = RandomForestClassifier(labelCol="label", featuresCol="features", weightCol="CLASS_WEIGHTS",
                            numTrees=nTree, maxDepth=mDep, cacheNodeIds = True, subsamplingRate = subSamp)
    mlflow.log_param('numTrees', nTree)
    mlflow.log_param('maxDepth', mDep)
    mlflow.log_param('cacheNodeIds', True)
    mlflow.log_param('subsamplingRate', subSamp)

    start = time.time()
    rf_model = rf.fit(train_pip)
    done = time.time()
    mlflow.log_metric('model_fit_time' ,done - start)

    start = time.time()
    train_results = rf_model.transform(train_pip)
    done = time.time()
    mlflow.log_metric('train_pred_time' ,done - start)

    start = time.time()
    val_results = rf_model.transform(val_pip)
    done = time.time()
    mlflow.log_metric('val_pred_time' ,done - start)

    # Score with the forest's continuous rawPrediction vector. Feeding the hard 0/1
    # `prediction` column collapses the ROC / PR curves to a single operating point,
    # so the logged areas would not reflect the model's ranking quality.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

    # Compact per-row outputs persisted below for later charts; "raw" holds the hard prediction.
    trainScoreAndLabels = train_results.select(['probability','label', f.col("prediction").alias("raw")])
    valScoreAndLabels = val_results.select(['probability','label', f.col("prediction").alias("raw")])

    print("Train Set")
    start=time.time()
    train_pr = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderPR"})
    train_roc = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('train_metric_time' ,done - start)

    print("train areaUnderPR: ", train_pr)
    print("train areaUnderROC: ", train_roc)
    mlflow.log_metric('train_pr' ,train_pr)
    mlflow.log_metric('train_roc' ,train_roc)
    print()

    print("Validation Set")
    start =time.time()
    val_pr = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderPR"})
    val_roc = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('val_metric_time' ,done - start)

    print("val areaUnderPR: ", val_pr)
    print("val areaUnderROC: ", val_roc)
    mlflow.log_metric('val_pr' , val_pr)
    mlflow.log_metric('val_roc' ,val_roc)

    # Log this model.
    mlflow.spark.log_model(spark_model=rf_model, artifact_path=model_version)

    start =time.time()
    # Save the model out in case we need to reference it again in the future.
    rf_model.write().overwrite().save(f"dbfs:/mnt/mids-w261/team20SSDK/models/{model_version}")
    done = time.time()
    mlflow.log_metric('model_save_time' ,done - start)

    start =time.time()
    # val_results is passed only for the schema metadata of its "features" column.
    varlist = ExtractFeatureImp(rf_model.featureImportances, val_results, "features")
    varlist["order"] = np.arange(1, len(varlist["idx"])+1)
    done = time.time()
    mlflow.log_metric('extract_feat_imp_time' ,done - start)

    start =time.time()
    # Collect the per-row outputs to the driver for graphs and charts later on.
    trainScoreAndLabels_pd = trainScoreAndLabels.toPandas()
    valScoreAndLabels_pd = valScoreAndLabels.toPandas()

    # Create the output directory if it doesn't exist.
    path_exists(f'dbfs:/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}')

    # Extract it all to evaluate later on.
    varlist.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv', index=False)
    valScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv', index=False)
    trainScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv',index=False)
    done = time.time()
    mlflow.log_metric('ouput_to_pd_time' , done - start)

    # Log the saved tables as artifacts of this run.
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv')
  

In [0]:
%%time

model_version = "rf_model_22"
nTree = 40
mDep = 15
subSamp = 1.0

# Open a fresh MLflow run on every execution so that re-running this cell never
# collides with the run created by a previous execution.
with mlflow.start_run():

    # --- Preprocessing only (index + one-hot encode + assemble); no classifier yet ---
    start = time.time()
    pipeline = Pipeline().setStages(stages)

    # Fit the preprocessing stages on TRAIN only and reuse that fitted model for
    # validation. StringIndexer orders labels by frequency, so fitting a second
    # pipeline on validation could map a category to a different one-hot slot than
    # the one the forest was trained on.
    # NOTE: with StringIndexer's default handleInvalid='error', a category that only
    # occurs in validation now raises at transform time instead of being silently re-indexed.
    pipeline_model = pipeline.fit(train)
    train_pip = pipeline_model.transform(train)
    val_pip = pipeline_model.transform(val)
    done = time.time()
    mlflow.log_metric('pip_fit_time' ,done - start)

    rf = RandomForestClassifier(labelCol="label", featuresCol="features", weightCol="CLASS_WEIGHTS",
                            numTrees=nTree, maxDepth=mDep, cacheNodeIds = True, subsamplingRate = subSamp)
    mlflow.log_param('numTrees', nTree)
    mlflow.log_param('maxDepth', mDep)
    mlflow.log_param('cacheNodeIds', True)
    mlflow.log_param('subsamplingRate', subSamp)

    start = time.time()
    rf_model = rf.fit(train_pip)
    done = time.time()
    mlflow.log_metric('model_fit_time' ,done - start)

    start = time.time()
    train_results = rf_model.transform(train_pip)
    done = time.time()
    mlflow.log_metric('train_pred_time' ,done - start)

    start = time.time()
    val_results = rf_model.transform(val_pip)
    done = time.time()
    mlflow.log_metric('val_pred_time' ,done - start)

    # Score with the forest's continuous rawPrediction vector. Feeding the hard 0/1
    # `prediction` column collapses the ROC / PR curves to a single operating point,
    # so the logged areas would not reflect the model's ranking quality.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

    # Compact per-row outputs persisted below for later charts; "raw" holds the hard prediction.
    trainScoreAndLabels = train_results.select(['probability','label', f.col("prediction").alias("raw")])
    valScoreAndLabels = val_results.select(['probability','label', f.col("prediction").alias("raw")])

    print("Train Set")
    start=time.time()
    train_pr = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderPR"})
    train_roc = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('train_metric_time' ,done - start)

    print("train areaUnderPR: ", train_pr)
    print("train areaUnderROC: ", train_roc)
    mlflow.log_metric('train_pr' ,train_pr)
    mlflow.log_metric('train_roc' ,train_roc)
    print()

    print("Validation Set")
    start =time.time()
    val_pr = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderPR"})
    val_roc = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('val_metric_time' ,done - start)

    print("val areaUnderPR: ", val_pr)
    print("val areaUnderROC: ", val_roc)
    mlflow.log_metric('val_pr' , val_pr)
    mlflow.log_metric('val_roc' ,val_roc)

    # Log this model.
    mlflow.spark.log_model(spark_model=rf_model, artifact_path=model_version)

    start =time.time()
    # Save the model out in case we need to reference it again in the future.
    rf_model.write().overwrite().save(f"dbfs:/mnt/mids-w261/team20SSDK/models/{model_version}")
    done = time.time()
    mlflow.log_metric('model_save_time' ,done - start)

    start =time.time()
    # val_results is passed only for the schema metadata of its "features" column.
    varlist = ExtractFeatureImp(rf_model.featureImportances, val_results, "features")
    varlist["order"] = np.arange(1, len(varlist["idx"])+1)
    done = time.time()
    mlflow.log_metric('extract_feat_imp_time' ,done - start)

    start =time.time()
    # Collect the per-row outputs to the driver for graphs and charts later on.
    trainScoreAndLabels_pd = trainScoreAndLabels.toPandas()
    valScoreAndLabels_pd = valScoreAndLabels.toPandas()

    # Create the output directory if it doesn't exist.
    path_exists(f'dbfs:/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}')

    # Extract it all to evaluate later on.
    varlist.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv', index=False)
    valScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv', index=False)
    trainScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv',index=False)
    done = time.time()
    mlflow.log_metric('ouput_to_pd_time' , done - start)

    # Log the saved tables as artifacts of this run.
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv')
  

### Model Variant #2

In [0]:
# Variant 2 -- categorical inputs: quarter plus the departure / arrival hour bins.
cat_cols = [
    'QUARTER',
    'DEP_HOUR_BIN',
    'ARR_HOUR_BIN',
]

# Variant 2 -- numeric inputs (handed to the vector assembler unchanged).
num_cols = [
    'DELAYS_SO_FAR',
    'MINUTES_AFTER_MIDNIGHT_ORIGIN',
    'MINUTES_AFTER_MIDNIGHT_DEST',
    'NETWORK_CONGESTION',
    'AVG_VIS_DIS_ORIGIN',
    'DEST_PR',
    'ORIGIN_PR',
    'AVG_DEW_DEG_ORIGIN',
    'CRS_ELAPSED_TIME',
    'AVG_WND_SPEED_ORIGIN',
    'AVG_WND_SPEED_DEST',
]

In [0]:
# Columns for this variant: categorical + numeric inputs, the label, and the row weights.
# dict.fromkeys de-duplicates while keeping a stable order; list(set(...)) would
# return the columns in an arbitrary order that can change between kernel restarts.
cols = list(dict.fromkeys(cat_cols + num_cols + label_col)) + ["CLASS_WEIGHTS"]

# Rename the target to 'label', the column name the classifier and evaluator read.
train = train_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')
val = val_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')

# (row count, column count) per split, followed by a preview of the training frame.
print((train.count(), len(train.columns)))
print((val.count(), len(val.columns)))
display(train)

In [0]:
# Output column names for the one-hot vectors, one per categorical input.
col_vec_out = [name + '_catVec' for name in cat_cols]

# Step 1 per categorical column: map raw values to label indices (<col> -> <col>_tmp).
indexers = [StringIndexer(inputCol=name, outputCol=name + '_tmp') for name in cat_cols]

# Step 2 per categorical column: expand the index into a sparse one-hot vector,
# keeping a slot for every category (dropLast=False).
encoders = [
    OneHotEncoder(dropLast=False, inputCol=name + '_tmp', outputCol=vec_name)
    for name, vec_name in zip(cat_cols, col_vec_out)
]

# Interleave so every indexer is immediately followed by its encoder.
stages = [stage for pair in zip(indexers, encoders) for stage in pair]

# Final step: concatenate numeric columns and one-hot vectors into "features".
# Rows the assembler considers invalid are skipped (handleInvalid='skip').
res_cols = num_cols + col_vec_out
vector_assembler = VectorAssembler(inputCols=res_cols, outputCol="features", handleInvalid='skip')
stages.append(vector_assembler)

In [0]:
%%time

model_version = "rf_model_25"
nTree = 50
mDep = 18
subSamp = 1.0

# Open a fresh MLflow run on every execution so that re-running this cell never
# collides with the run created by a previous execution.
with mlflow.start_run():

    # --- Preprocessing only (index + one-hot encode + assemble); no classifier yet ---
    start = time.time()
    pipeline = Pipeline().setStages(stages)

    # Fit the preprocessing stages on TRAIN only and reuse that fitted model for
    # validation. StringIndexer orders labels by frequency, so fitting a second
    # pipeline on validation could map a category to a different one-hot slot than
    # the one the forest was trained on.
    # NOTE: with StringIndexer's default handleInvalid='error', a category that only
    # occurs in validation now raises at transform time instead of being silently re-indexed.
    pipeline_model = pipeline.fit(train)
    train_pip = pipeline_model.transform(train)
    val_pip = pipeline_model.transform(val)
    done = time.time()
    mlflow.log_metric('pip_fit_time' ,done - start)

    rf = RandomForestClassifier(labelCol="label", featuresCol="features", weightCol="CLASS_WEIGHTS",
                            numTrees=nTree, maxDepth=mDep, cacheNodeIds = True, subsamplingRate = subSamp)
    mlflow.log_param('numTrees', nTree)
    mlflow.log_param('maxDepth', mDep)
    mlflow.log_param('cacheNodeIds', True)
    mlflow.log_param('subsamplingRate', subSamp)

    start = time.time()
    rf_model = rf.fit(train_pip)
    done = time.time()
    mlflow.log_metric('model_fit_time' ,done - start)

    start = time.time()
    train_results = rf_model.transform(train_pip)
    done = time.time()
    mlflow.log_metric('train_pred_time' ,done - start)

    start = time.time()
    val_results = rf_model.transform(val_pip)
    done = time.time()
    mlflow.log_metric('val_pred_time' ,done - start)

    # Score with the forest's continuous rawPrediction vector. Feeding the hard 0/1
    # `prediction` column collapses the ROC / PR curves to a single operating point,
    # so the logged areas would not reflect the model's ranking quality.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

    # Compact per-row outputs persisted below for later charts; "raw" holds the hard prediction.
    trainScoreAndLabels = train_results.select(['probability','label', f.col("prediction").alias("raw")])
    valScoreAndLabels = val_results.select(['probability','label', f.col("prediction").alias("raw")])

    print("Train Set")
    start=time.time()
    train_pr = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderPR"})
    train_roc = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('train_metric_time' ,done - start)

    print("train areaUnderPR: ", train_pr)
    print("train areaUnderROC: ", train_roc)
    mlflow.log_metric('train_pr' ,train_pr)
    mlflow.log_metric('train_roc' ,train_roc)
    print()

    print("Validation Set")
    start =time.time()
    val_pr = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderPR"})
    val_roc = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('val_metric_time' ,done - start)

    print("val areaUnderPR: ", val_pr)
    print("val areaUnderROC: ", val_roc)
    mlflow.log_metric('val_pr' , val_pr)
    mlflow.log_metric('val_roc' ,val_roc)

    # Log this model.
    mlflow.spark.log_model(spark_model=rf_model, artifact_path=model_version)

    start =time.time()
    # Save the model out in case we need to reference it again in the future.
    rf_model.write().overwrite().save(f"dbfs:/mnt/mids-w261/team20SSDK/models/{model_version}")
    done = time.time()
    mlflow.log_metric('model_save_time' ,done - start)

    start =time.time()
    # val_results is passed only for the schema metadata of its "features" column.
    varlist = ExtractFeatureImp(rf_model.featureImportances, val_results, "features")
    varlist["order"] = np.arange(1, len(varlist["idx"])+1)
    done = time.time()
    mlflow.log_metric('extract_feat_imp_time' ,done - start)

    start =time.time()
    # Collect the per-row outputs to the driver for graphs and charts later on.
    trainScoreAndLabels_pd = trainScoreAndLabels.toPandas()
    valScoreAndLabels_pd = valScoreAndLabels.toPandas()

    # Create the output directory if it doesn't exist.
    path_exists(f'dbfs:/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}')

    # Extract it all to evaluate later on.
    varlist.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv', index=False)
    valScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv', index=False)
    trainScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv',index=False)
    done = time.time()
    mlflow.log_metric('ouput_to_pd_time' , done - start)

    # Log the saved tables as artifacts of this run.
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv')
  

### Model Variant #3

##### New Feature - IS_EARLY_MORNING_FLIGHT

For our business case, we would like to be more conservative in terms of predicting delay, i.e. in all cases we would prefer telling a customer that there is no delay, even if there ends up being one. Therefore we want to minimize **False Positives (FP)**. When performing this analysis on our latest models in the following [notebook], we found that there is a spike in **FP**s for early morning flights, i.e. when `MINUTES_AFTER_MIDNIGHT_ORIGIN` is less than **180** (the flag below is set when the value is `<= 180`). We therefore wanted to address this pitfall of our model by adding a special boolean term that indicates whether or not it is an early morning flight.

[notebook]: https://dbc-c4580dc0-018b.cloud.databricks.com/?o=8229810859276230#notebook/2834511320022313/command/2834511320025772

In [0]:
# 1 when the origin minute-of-day is at most 180 (i.e. up to 03:00), else 0.
# NOTE(review): the notebook states MINUTES_AFTER_MIDNIGHT_ORIGIN is computed in UTC,
# so this marks 00:00-03:00 UTC rather than local time -- confirm that is intended.
is_early_morning_expr = when(f.col("MINUTES_AFTER_MIDNIGHT_ORIGIN") <= 180, 1).otherwise(0)

train_final = train_final.withColumn("IS_EARLY_MORNING_FLIGHT", is_early_morning_expr)
val_final = val_final.withColumn("IS_EARLY_MORNING_FLIGHT", is_early_morning_expr)

In [0]:
# Variant 3 -- categorical inputs: quarter plus the early-morning flag.
cat_cols = [
    'QUARTER',
    'IS_EARLY_MORNING_FLIGHT',
]

# Variant 3 -- numeric inputs (handed to the vector assembler unchanged).
num_cols = [
    'DELAYS_SO_FAR',
    'MINUTES_AFTER_MIDNIGHT_ORIGIN',
    'MINUTES_AFTER_MIDNIGHT_DEST',
    'NETWORK_CONGESTION',
    'AVG_VIS_DIS_ORIGIN',
    'DEST_PR',
    'ORIGIN_PR',
    'AVG_DEW_DEG_ORIGIN',
    'CRS_ELAPSED_TIME',
    'AVG_WND_SPEED_ORIGIN',
    'AVG_WND_SPEED_DEST',
]

In [0]:
# (Re)attach the per-row weights: rows with DEP_DEL15 == 1 receive `balancing_ratio`,
# every other row receives its complement. withColumn replaces a same-named column,
# so re-running this on frames that already carry CLASS_WEIGHTS is harmless.
class_weight_expr = when(f.col("DEP_DEL15") == 1, balancing_ratio).otherwise(1 - balancing_ratio)

train_final = train_final.withColumn("CLASS_WEIGHTS", class_weight_expr)
val_final = val_final.withColumn("CLASS_WEIGHTS", class_weight_expr)

In [0]:
# Columns for this variant: categorical + numeric inputs, the label, and the row weights.
# dict.fromkeys de-duplicates while keeping a stable order; list(set(...)) would
# return the columns in an arbitrary order that can change between kernel restarts.
cols = list(dict.fromkeys(cat_cols + num_cols + label_col)) + ["CLASS_WEIGHTS"]

# Rename the target to 'label', the column name the classifier and evaluator read.
train = train_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')
val = val_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')

# (row count, column count) per split, followed by a preview of the training frame.
print((train.count(), len(train.columns)))
print((val.count(), len(val.columns)))
display(train)

NETWORK_CONGESTION,label,DELAYS_SO_FAR,ORIGIN_PR,AVG_WND_SPEED_ORIGIN,AVG_WND_SPEED_DEST,MINUTES_AFTER_MIDNIGHT_ORIGIN,DEST_PR,IS_EARLY_MORNING_FLIGHT,AVG_VIS_DIS_ORIGIN,MINUTES_AFTER_MIDNIGHT_DEST,AVG_DEW_DEG_ORIGIN,CRS_ELAPSED_TIME,QUARTER,CLASS_WEIGHTS
4012.0,0.0,1.0,0.0012740992192146,26.0,34.75,330,0.0078058444034201,0,15288.5,380,156.0,50.0,1,0.1786318007205677
582.0,0.0,3.0,0.0012740992192146,18.0,14.25,363,0.0078058444034201,0,16093.0,412,86.5,49.0,1,0.1786318007205677
37440.0,0.0,0.0,0.0012740992192146,27.75,29.0,1090,0.0035629037678561,0,12874.5,1128,139.0,38.0,1,0.1786318007205677
3813.0,1.0,2.0,0.0005011645446733933,31.39158188310781,32.492063492063494,505,0.0171480249399023,0,15000.50264618144,691,83.85062887816409,186.0,1,0.8213681992794323
56430.0,0.0,0.0,0.0012740992192146,28.42105263157895,58.85714285714285,1090,0.0035629037678561,0,4378.294117647059,1128,131.11764705882354,38.0,1,0.1786318007205677
249040.0,0.0,1.0,0.0004492987497042213,35.714285714285715,59.458333333333336,1320,0.0239871068543374,0,16093.0,1409,-70.0,89.0,1,0.1786318007205677
86845.0,0.0,0.0,0.0004492987497042213,22.857142857142858,17.958333333333332,1245,0.0239871068543374,0,16093.0,1334,-10.0,89.0,1,0.1786318007205677
33580.0,0.0,2.0,0.0012740992192146,37.4,65.5,1389,0.0078058444034201,0,16093.0,2,160.33333333333334,53.0,1,0.1786318007205677
368.0,0.0,0.0,0.0012740992192146,53.142857142857146,70.0,120,0.0078058444034201,1,4826.0,172,171.2,52.0,1,0.1786318007205677
66102.0,0.0,0.0,0.0012740992192146,36.0,50.0,60,0.0078058444034201,1,16093.0,112,104.5,52.0,1,0.1786318007205677


In [0]:
# Count the distinct levels of every categorical column to estimate how wide the
# feature vector becomes once each level gets its own one-hot slot.
# The loop variable is deliberately not called `col`: that name is imported from
# pyspark.sql.functions at the top of the notebook and rebinding it to a string
# would break any later `col(...)` call in the same kernel session.
cols_by_cat = 0
for cat_col in cat_cols:
    col_count = train_final.select(cat_col).distinct().count()
    cols_by_cat += col_count
    print(cat_col, ": ", col_count)

print()
tot_col_count = len(num_cols)+cols_by_cat
print("Number of columns after OHE: ", tot_col_count)

In [0]:
# Output column names for the one-hot vectors, one per categorical input.
col_vec_out = [name + '_catVec' for name in cat_cols]

# Step 1 per categorical column: map raw values to label indices (<col> -> <col>_tmp).
indexers = [StringIndexer(inputCol=name, outputCol=name + '_tmp') for name in cat_cols]

# Step 2 per categorical column: expand the index into a sparse one-hot vector,
# keeping a slot for every category (dropLast=False).
encoders = [
    OneHotEncoder(dropLast=False, inputCol=name + '_tmp', outputCol=vec_name)
    for name, vec_name in zip(cat_cols, col_vec_out)
]

# Interleave so every indexer is immediately followed by its encoder.
stages = [stage for pair in zip(indexers, encoders) for stage in pair]

# Final step: concatenate numeric columns and one-hot vectors into "features".
# Rows the assembler considers invalid are skipped (handleInvalid='skip').
res_cols = num_cols + col_vec_out
vector_assembler = VectorAssembler(inputCols=res_cols, outputCol="features", handleInvalid='skip')
stages.append(vector_assembler)

In [0]:
%%time

model_version = "rf_model_26"
nTree = 40
mDep = 15
subSamp = 1.0

# Open a fresh MLflow run on every execution so that re-running this cell never
# collides with the run created by a previous execution.
with mlflow.start_run():

    # --- Preprocessing only (index + one-hot encode + assemble); no classifier yet ---
    start = time.time()
    pipeline = Pipeline().setStages(stages)

    # Fit the preprocessing stages on TRAIN only and reuse that fitted model for
    # validation. StringIndexer orders labels by frequency, so fitting a second
    # pipeline on validation could map a category to a different one-hot slot than
    # the one the forest was trained on.
    # NOTE: with StringIndexer's default handleInvalid='error', a category that only
    # occurs in validation now raises at transform time instead of being silently re-indexed.
    pipeline_model = pipeline.fit(train)
    train_pip = pipeline_model.transform(train)
    val_pip = pipeline_model.transform(val)
    done = time.time()
    mlflow.log_metric('pip_fit_time' ,done - start)

    rf = RandomForestClassifier(labelCol="label", featuresCol="features", weightCol="CLASS_WEIGHTS",
                            numTrees=nTree, maxDepth=mDep, cacheNodeIds = True, subsamplingRate = subSamp)
    mlflow.log_param('numTrees', nTree)
    mlflow.log_param('maxDepth', mDep)
    mlflow.log_param('cacheNodeIds', True)
    mlflow.log_param('subsamplingRate', subSamp)

    start = time.time()
    rf_model = rf.fit(train_pip)
    done = time.time()
    mlflow.log_metric('model_fit_time' ,done - start)

    start = time.time()
    train_results = rf_model.transform(train_pip)
    done = time.time()
    mlflow.log_metric('train_pred_time' ,done - start)

    start = time.time()
    val_results = rf_model.transform(val_pip)
    done = time.time()
    mlflow.log_metric('val_pred_time' ,done - start)

    # Score with the forest's continuous rawPrediction vector. Feeding the hard 0/1
    # `prediction` column collapses the ROC / PR curves to a single operating point,
    # so the logged areas would not reflect the model's ranking quality.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

    # Compact per-row outputs persisted below for later charts; "raw" holds the hard prediction.
    trainScoreAndLabels = train_results.select(['probability','label', f.col("prediction").alias("raw")])
    valScoreAndLabels = val_results.select(['probability','label', f.col("prediction").alias("raw")])

    print("Train Set")
    start=time.time()
    train_pr = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderPR"})
    train_roc = evaluator.evaluate(train_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('train_metric_time' ,done - start)

    print("train areaUnderPR: ", train_pr)
    print("train areaUnderROC: ", train_roc)
    mlflow.log_metric('train_pr' ,train_pr)
    mlflow.log_metric('train_roc' ,train_roc)
    print()

    print("Validation Set")
    start =time.time()
    val_pr = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderPR"})
    val_roc = evaluator.evaluate(val_results, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('val_metric_time' ,done - start)

    print("val areaUnderPR: ", val_pr)
    print("val areaUnderROC: ", val_roc)
    mlflow.log_metric('val_pr' , val_pr)
    mlflow.log_metric('val_roc' ,val_roc)

    # Log this model.
    mlflow.spark.log_model(spark_model=rf_model, artifact_path=model_version)

    start =time.time()
    # Save the model out in case we need to reference it again in the future.
    rf_model.write().overwrite().save(f"dbfs:/mnt/mids-w261/team20SSDK/models/{model_version}")
    done = time.time()
    mlflow.log_metric('model_save_time' ,done - start)

    start =time.time()
    # val_results is passed only for the schema metadata of its "features" column.
    varlist = ExtractFeatureImp(rf_model.featureImportances, val_results, "features")
    varlist["order"] = np.arange(1, len(varlist["idx"])+1)
    done = time.time()
    mlflow.log_metric('extract_feat_imp_time' ,done - start)

    start =time.time()
    # Collect the per-row outputs to the driver for graphs and charts later on.
    trainScoreAndLabels_pd = trainScoreAndLabels.toPandas()
    valScoreAndLabels_pd = valScoreAndLabels.toPandas()

    # Create the output directory if it doesn't exist.
    path_exists(f'dbfs:/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}')

    # Extract it all to evaluate later on.
    varlist.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv', index=False)
    valScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv', index=False)
    trainScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv',index=False)
    done = time.time()
    mlflow.log_metric('ouput_to_pd_time' , done - start)

    # Log the saved tables as artifacts of this run.
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv')
  

### Model Variant #4

In [0]:
# Variant 4 -- categorical inputs: quarter, early-morning flag and both hour bins.
cat_cols = [
    'QUARTER',
    'IS_EARLY_MORNING_FLIGHT',
    'DEP_HOUR_BIN',
    'ARR_HOUR_BIN',
]

# Variant 4 -- numeric inputs (handed to the vector assembler unchanged).
num_cols = [
    'DELAYS_SO_FAR',
    'MINUTES_AFTER_MIDNIGHT_ORIGIN',
    'MINUTES_AFTER_MIDNIGHT_DEST',
    'NETWORK_CONGESTION',
    'AVG_VIS_DIS_ORIGIN',
    'DEST_PR',
    'ORIGIN_PR',
    'AVG_DEW_DEG_ORIGIN',
    'CRS_ELAPSED_TIME',
    'AVG_WND_SPEED_ORIGIN',
    'AVG_WND_SPEED_DEST',
]

In [0]:
# Columns for this variant: categorical + numeric inputs, the label, and the row weights.
# dict.fromkeys de-duplicates while keeping a stable order; list(set(...)) would
# return the columns in an arbitrary order that can change between kernel restarts.
cols = list(dict.fromkeys(cat_cols + num_cols + label_col)) + ["CLASS_WEIGHTS"]

# Rename the target to 'label', the column name the classifier and evaluator read.
train = train_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')
val = val_final.select(*cols).withColumnRenamed('DEP_DEL15', 'label')

# (row count, column count) per split, followed by a preview of the training frame.
print((train.count(), len(train.columns)))
print((val.count(), len(val.columns)))
display(train)

NETWORK_CONGESTION,label,DELAYS_SO_FAR,ORIGIN_PR,ARR_HOUR_BIN,AVG_WND_SPEED_ORIGIN,AVG_WND_SPEED_DEST,MINUTES_AFTER_MIDNIGHT_ORIGIN,DEST_PR,IS_EARLY_MORNING_FLIGHT,AVG_VIS_DIS_ORIGIN,MINUTES_AFTER_MIDNIGHT_DEST,AVG_DEW_DEG_ORIGIN,CRS_ELAPSED_TIME,DEP_HOUR_BIN,QUARTER,CLASS_WEIGHTS
4012.0,0.0,1.0,0.0012740992192146,3,26.0,34.75,330,0.0078058444034201,0,15288.5,380,156.0,50.0,3,1,0.1786318007205677
582.0,0.0,3.0,0.0012740992192146,3,18.0,14.25,363,0.0078058444034201,0,16093.0,412,86.5,49.0,3,1,0.1786318007205677
37440.0,0.0,0.0,0.0012740992192146,1,27.75,29.0,1090,0.0035629037678561,0,12874.5,1128,139.0,38.0,1,1,0.1786318007205677
3813.0,1.0,2.0,0.0005011645446733933,1,31.39158188310781,32.492063492063494,505,0.0171480249399023,0,15000.50264618144,691,83.85062887816409,186.0,0,1,0.8213681992794323
56430.0,0.0,0.0,0.0012740992192146,1,28.42105263157895,58.85714285714285,1090,0.0035629037678561,0,4378.294117647059,1128,131.11764705882354,38.0,1,1,0.1786318007205677
249040.0,0.0,1.0,0.0004492987497042213,3,35.714285714285715,59.458333333333336,1320,0.0239871068543374,0,16093.0,1409,-70.0,89.0,2,1,0.1786318007205677
86845.0,0.0,0.0,0.0004492987497042213,2,22.857142857142858,17.958333333333332,1245,0.0239871068543374,0,16093.0,1334,-10.0,89.0,2,1,0.1786318007205677
33580.0,0.0,2.0,0.0012740992192146,2,37.4,65.5,1389,0.0078058444034201,0,16093.0,2,160.33333333333334,53.0,2,1,0.1786318007205677
368.0,0.0,0.0,0.0012740992192146,3,53.142857142857146,70.0,120,0.0078058444034201,1,4826.0,172,171.2,52.0,3,1,0.1786318007205677
66102.0,0.0,0.0,0.0012740992192146,3,36.0,50.0,60,0.0078058444034201,1,16093.0,112,104.5,52.0,3,1,0.1786318007205677


In [0]:
# Check groupings for cat cols: count the distinct values of each categorical column and
# add them up to get the total number of columns reported below.
# The loop variable is deliberately NOT named `col`: that would rebind the `col` function
# imported from pyspark.sql.functions at the top of the notebook, breaking any later
# cell that calls col(...).
cols_by_cat = 0
for cat_col in cat_cols:
    col_count = train_final.select(cat_col).distinct().count()
    cols_by_cat += col_count
    print(cat_col, ": ", col_count)

print()
# Numeric columns contribute one column each; categorical columns contribute one per distinct value.
tot_col_count = len(num_cols)+cols_by_cat
print("Number of columns after OHE: ", tot_col_count)

In [0]:
# Output column name for the one-hot vector of each categorical feature
col_vec_out = [cat_name + '_catVec' for cat_name in cat_cols]

# Step 1 per categorical column: StringIndexer writes a label index to <col>_tmp
indexers = [
    StringIndexer(inputCol=cat_name, outputCol=cat_name + '_tmp')
    for cat_name in cat_cols
]

# Step 2 per categorical column: OneHotEncoder turns <col>_tmp into a sparse vector
# (dropLast=False keeps one slot for every category)
encoders = [
    OneHotEncoder(dropLast=False, inputCol=cat_name + '_tmp', outputCol=vec_name)
    for cat_name, vec_name in zip(cat_cols, col_vec_out)
]

# Interleave so each column's indexer is immediately followed by its encoder:
# [indexer_0, encoder_0, indexer_1, encoder_1, ...]
stages = [stage for pair in zip(indexers, encoders) for stage in pair]

# Final stage: assemble numeric columns and one-hot vectors into a single "features" vector
res_cols = num_cols + col_vec_out
vector_assembler = VectorAssembler(inputCols=res_cols, outputCol="features", handleInvalid='skip')
stages.append(vector_assembler)

In [0]:
%%time

# Run configuration: model name (used for the MLflow artifact path and the DBFS save
# locations) and the Random Forest hyperparameters for this variant.
model_version = "rf_model_27"
nTree = 40
mDep = 15
subSamp = 1.0

# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once, but a second run would hit conflicts when attempting to overwrite the first run.
with mlflow.start_run():

    # ---- Feature pipeline (string indexing -> one-hot encoding -> vector assembly) ----
    start = time.time()
    pipeline = Pipeline().setStages(stages)

    # No classifier is fit here; this only learns the categorical index mappings and
    # builds the one-hot encoded columns / assembled feature vector.
    # The pipeline is fit ONCE on train and the same fitted model is applied to validation.
    # Re-fitting on validation would derive the StringIndexer mappings (frequency-ordered by
    # default) from the validation data, so a given one-hot slot could stand for a different
    # category than the one the forest was trained on.
    pipeline_model = pipeline.fit(train)
    train_pip = pipeline_model.transform(train)
    val_pip = pipeline_model.transform(val)
    done = time.time()
    mlflow.log_metric('pip_fit_time' ,done - start)

    # ---- Model definition + hyperparameter logging ----
    rf = RandomForestClassifier(labelCol="label", featuresCol="features", weightCol="CLASS_WEIGHTS",
                            numTrees=nTree, maxDepth=mDep, cacheNodeIds = True, subsamplingRate = subSamp)
    mlflow.log_param('numTrees', nTree)
    mlflow.log_param('maxDepth', mDep)
    mlflow.log_param('cacheNodeIds', True)
    mlflow.log_param('subsamplingRate', subSamp)

    # ---- Fit ----
    start = time.time()
    rf_model = rf.fit(train_pip)
    done = time.time()
    mlflow.log_metric('model_fit_time' ,done - start)

    # ---- Predict on train and validation ----
    # NOTE(review): DataFrame transforms are lazily evaluated, so these two timings likely
    # cover plan construction only, not the actual scoring work - confirm before relying on them.
    start = time.time()
    train_results = rf_model.transform(train_pip)
    done = time.time()
    mlflow.log_metric('train_pred_time' ,done - start)

    start = time.time()
    val_results = rf_model.transform(val_pip)
    done = time.time()
    mlflow.log_metric('val_pred_time' ,done - start)

    # ---- Evaluation ----
    #Set up BinClassEval
    evaluator = BinaryClassificationEvaluator()
    evaluator.setRawPredictionCol("raw")

    # NOTE(review): "raw" is the hard 0/1 `prediction` column, not `rawPrediction`/`probability`,
    # so the PR/ROC areas below are computed from thresholded predictions rather than scores.
    # Left unchanged so the logged metrics and exported CSV schema keep their current meaning;
    # consider evaluating on `rawPrediction` instead.
    trainScoreAndLabels = train_results.select(['probability','label', f.col("prediction").alias("raw")])
    valScoreAndLabels = val_results.select(['probability','label', f.col("prediction").alias("raw")])

    print("Train Set")
    start=time.time()
    train_pr = evaluator.evaluate(trainScoreAndLabels, {evaluator.metricName: "areaUnderPR"})
    train_roc = evaluator.evaluate(trainScoreAndLabels, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('train_metric_time' ,done - start)

    print("train areaUnderPR: ", train_pr)
    print("train areaUnderROC: ", train_roc)
    mlflow.log_metric('train_pr' ,train_pr)
    mlflow.log_metric('train_roc' ,train_roc)
    print()

    print("Validation Set")
    start =time.time()
    val_pr = evaluator.evaluate(valScoreAndLabels, {evaluator.metricName: "areaUnderPR"})
    val_roc = evaluator.evaluate(valScoreAndLabels, {evaluator.metricName: "areaUnderROC"})
    done = time.time()
    mlflow.log_metric('val_metric_time' ,done - start)

    print("val areaUnderPR: ", val_pr)
    print("val areaUnderROC: ", val_roc)
    mlflow.log_metric('val_pr' , val_pr)
    mlflow.log_metric('val_roc' ,val_roc)

    # ---- Persist the model ----
    # Log this model.
    mlflow.spark.log_model(spark_model=rf_model, artifact_path=model_version)

    start =time.time()
    #Save the model out in case we need to reference again in the future
    rf_model.write().overwrite().save(f"dbfs:/mnt/mids-w261/team20SSDK/models/{model_version}")
    done = time.time()
    mlflow.log_metric('model_save_time' ,done - start)

    # ---- Feature importances ----
    start =time.time()
    # val_results is passed alongside the importances and the name of the features column;
    # presumably it is only used for its schema/metadata - verify against ExtractFeatureImp.
    varlist = ExtractFeatureImp(rf_model.featureImportances, val_results, "features")
    # 1-based position of each row, in the order returned by ExtractFeatureImp
    varlist["order"] = np.arange(1, len(varlist["idx"])+1)
    done = time.time()
    mlflow.log_metric('extract_feat_imp_time' ,done - start)

    # ---- Export predictions + importances for later graphs and charts ----
    start =time.time()
    #Log these artifacts for graphs and charts later on
    # toPandas() collects every scored row to the driver.
    trainScoreAndLabels_pd = trainScoreAndLabels.toPandas()
    valScoreAndLabels_pd = valScoreAndLabels.toPandas()

    #Create Path if it doesn't exist
    path_exists(f'dbfs:/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}')

    #Extract it all to evaluate later on.
    varlist.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv', index=False)
    valScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv', index=False)
    trainScoreAndLabels_pd.to_csv(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv',index=False)
    done = time.time()
    mlflow.log_metric('ouput_to_pd_time' , done - start)

    # Log the saved table as an artifact
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/feat_imp.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/val_pred.csv')
    mlflow.log_artifact(f'/dbfs/mnt/mids-w261/team20SSDK/models/model_meta/{model_version}/train_pred.csv')
  