In [2]:
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer,VectorAssembler}
import org.apache.spark.sql.functions.udf
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.functions._
import org.apache.spark.mllib.evaluation.MulticlassMetrics
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

Data Pre-Processing
------------------

In [3]:
val path = "nassCDS.csv"
val readData = spark.read.option("header","true").option("inferSchema","true").option("sep", ",").csv(path)
val dataWithId  = readData.withColumnRenamed("_c0","row_id")
val cleanData1 = dataWithId.na.drop
val dvcatConvert = udf { (x: String) => 
    if (x == "24-Oct") "10-24"
    else if (x == "1-9km/h" ) "1-9"
    else x
}
val cleanData2 = cleanData1.withColumn("dvcatConverted" , dvcatConvert(cleanData1("dvcat")))
val filterConvert = udf { (x: String) => 
    if (x == "NA") ""
    else if (x != null ) x
    else ""
}
val cleanData3 = cleanData2.withColumn("yearVehFilter" , filterConvert(cleanData2("yearVeh")))
val cleanData4 = cleanData3.withColumn("injSeverityFilter" , filterConvert(cleanData3("injSeverity")))

val cleanData5 = cleanData4.na.drop

In [3]:
val dvcatIndexer = new StringIndexer().setInputCol("dvcatConverted").setOutputCol("DvcatIndex")
val dvcatIndexed = dvcatIndexer.fit(cleanData5).transform(cleanData5)

val deadIndexer = new StringIndexer().setInputCol("dead").setOutputCol("label")
val deadIndexed = deadIndexer.fit(dvcatIndexed).transform(dvcatIndexed)

val airbagInder = new StringIndexer().setInputCol("airbag").setOutputCol("airbagIndex")
val airbagIndexed = airbagInder.fit(deadIndexed).transform(deadIndexed)

val seatbeltIndexer = new StringIndexer().setInputCol("seatbelt").setOutputCol("seatbeltIndex")
val seatbeltIndexed = seatbeltIndexer.fit(airbagIndexed).transform(airbagIndexed)

val sexIndexer = new StringIndexer().setInputCol("sex").setOutputCol("sexIndex")
val sexIndexed = sexIndexer.fit(seatbeltIndexed).transform(seatbeltIndexed)

val abcatIndexer = new StringIndexer().setInputCol("abcat").setOutputCol("abcatIndex") 
val abcatIndexed = abcatIndexer.fit(sexIndexed).transform(sexIndexed)

val occRoleIndexer = new StringIndexer().setInputCol("occRole").setOutputCol("occRoleIndex")
val occRoleIndexed = occRoleIndexer.fit(abcatIndexed).transform(abcatIndexed)

val yearVehIndexer = new StringIndexer().setInputCol("yearVeh").setOutputCol("yearVehIndex")
val yearVehIndexed = yearVehIndexer.fit(occRoleIndexed).transform(occRoleIndexed)

val injSeverityIndexer = new StringIndexer().setInputCol("injSeverity").setOutputCol("injSeverityIndex")
val injSeverityIndexed = injSeverityIndexer.fit(yearVehIndexed).transform(yearVehIndexed)

In [4]:
val dvcatEncoder = new OneHotEncoder().setInputCol("DvcatIndex").setOutputCol("DvcatVec")
val dvcatEncoded = dvcatEncoder.transform(injSeverityIndexed)

val airbagEncoder = new OneHotEncoder().setInputCol("airbagIndex").setOutputCol("airbagVec")
val airbagEncoded = airbagEncoder.transform(dvcatEncoded)

val seatbeltEncoder = new OneHotEncoder().setInputCol("seatbeltIndex").setOutputCol("seatbeltVec")
val seatbeltEncoded = seatbeltEncoder.transform(airbagEncoded)

val sexEncoder = new OneHotEncoder().setInputCol("sexIndex").setOutputCol("sexVec")
val sexEncoded = sexEncoder.transform(seatbeltEncoded)

val abcatEncoder = new OneHotEncoder().setInputCol("abcatIndex").setOutputCol("abcatVec")
val abcatEncoded = abcatEncoder.transform(sexEncoded)

val occRoleEncoder = new OneHotEncoder().setInputCol("occRoleIndex").setOutputCol("occRoleVec")
val occRoleEncoded = occRoleEncoder.transform(abcatEncoded)

val yearVehEncoder = new OneHotEncoder().setInputCol("yearVehIndex").setOutputCol("yearVehVec")
val yearVehEncoded = yearVehEncoder.transform(occRoleEncoded)

In [5]:
val vecAssembler = new VectorAssembler().
    setInputCols(Array("DvcatVec","weight","airbagVec","seatbeltVec","frontal","sexVec","ageOFocc","yearacc",
    "yearVehVec","abcatVec","occRoleVec","deploy","injSeverityIndex")).
    setOutputCol("features")
val vecAssembled = vecAssembler.transform(yearVehEncoded)
val Array(train, test) = vecAssembled.randomSplit(Array(0.7, 0.3),seed=1)

Logistic Regression
-------------------

In [14]:
import org.apache.spark.ml.classification.LogisticRegression
var startTime_lr = System.currentTimeMillis()
val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("features").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
val predictionDF_lr = lr.fit(train).transform(test)
var endTime_lr = System.currentTimeMillis()

In [15]:
//Model Evaluation
val predictionAndLabels_MCM_lr = predictionDF_lr.select("prediction","label").as[(Double,Double)].rdd
val metrics_lr = new MulticlassMetrics(predictionAndLabels_MCM_lr)
val predictionLabelsRDD_lr = predictionDF_lr.select("prediction", "label").map(r => (r.getDouble(0), r.getDouble(1)))
val bMetrics_lr = new BinaryClassificationMetrics(predictionLabelsRDD_lr.rdd)
println("Confusion Matrix of Logistics Regression is")
println(metrics_lr.confusionMatrix)
println("\nThe Accuracy of Logistics Regression is")
println(metrics_lr.accuracy)
println("\nThe ROC for Logistics Regression is")
println(bMetrics_lr.roc.collect().foreach(print))
print("\n")
println("\nThe Area under ROC for Logistics Regression is")
println(bMetrics_lr.areaUnderROC)
println("\nThe Precision of Logistics Regression is")
println(metrics_lr.precision)
println("\nThe Precision by Threshold for Logistics Regression is")
println(bMetrics_lr.precisionByThreshold.collect().foreach(print))
println("\nExecution Time for Training and Prediction of Logistics Regression is")
println(endTime_lr-startTime_lr + "ms")

Confusion Matrix of Logistics Regression is
7477.0  0.0  
338.0   0.0  

The Accuracy of Logistics Regression is
0.9567498400511836

The ROC for Logistics Regression is
(0.0,0.0)(1.0,1.0)(1.0,1.0)()

The Area under ROC for Logistics Regression is
0.5

The Precision of Logistics Regression is
0.9567498400511836

The Precision by Threshold for Logistics Regression is
(0.0,0.043250159948816376)()

Execution Time for Training and Prediction of Logistics Regression is
1579ms


### Analysis

Initial impression with the above results were satisfactory under the assumption that it is possible to get such a high accuracy rate as we have only around 26000 records. A deeper analysis with confusion matrix, very low false positive rates and a really good ROC curve gave a feeling that it is actually over-fitting the data for some reason. Even after changing the seed value for train-test data split, the model was consistantly giving very high performance  and that strengthened the doubts.

Decision Tree Classifier
------------------------

In [7]:
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
var startTime_dt = System.currentTimeMillis()
val dt = new DecisionTreeClassifier().setLabelCol("label").setFeaturesCol("features")
val predictionDF_dt = dt.fit(train).transform(test)
var endTime_dt = System.currentTimeMillis()

In [8]:
//Model Evaluation
val predictionAndLabels_MCM_dt = predictionDF_dt.select("prediction","label").as[(Double,Double)].rdd
val metrics_dt = new MulticlassMetrics(predictionAndLabels_MCM_dt)
val predictionLabelsRDD_dt = predictionDF_dt.select("prediction", "label").map(r => (r.getDouble(0), r.getDouble(1)))
val bMetrics_dt = new BinaryClassificationMetrics(predictionLabelsRDD_dt.rdd)
println("Confusion Matrix of Decision Tree is")
println(metrics_dt.confusionMatrix)
println("\nThe Accuracy of Decision Tree is")
println(metrics_dt.accuracy)
println("\nThe ROC for Decision Tree is")
println(bMetrics_dt.roc.collect().foreach(print))
print("\n")
println("\nThe Area under ROC for Decision Tree is")
println(bMetrics_dt.areaUnderROC)
println("\nThe Precision of Decision Tree is")
println(metrics_dt.precision)
println("\nThe Precision by Threshold for Decision Tree is")
println(bMetrics_dt.precisionByThreshold.collect().foreach(print))
println("\nExecution Time for Training and Prediction of Decision Tree is")
println(endTime_dt-startTime_dt + "ms")

Confusion Matrix of Decision Tree is
7471.0  6.0    
30.0    308.0  

The Accuracy of Decision Tree is
0.9953934740882917

The ROC for Decision Tree is
(0.0,0.0)(8.024608800320985E-4,0.9112426035502958)(1.0,1.0)(1.0,1.0)()

The Area under ROC for Decision Tree is
0.9552200713351319

The Precision of Decision Tree is
0.9953934740882917

The Precision by Threshold for Decision Tree is
(1.0,0.9808917197452229)(0.0,0.043250159948816376)()

Execution Time for Training and Prediction of Decision Tree is
6882ms


### Analysis

Decision Trees are known for over-fitting the data and the the results above support that statement especially the accuracy rate of 99.5%. This is mainly because of that fact that decision Tree does not incorporate bootstrapping or any other technique to understand complete trend in the data set.It takes a random column as the root and then start building the decision tree from there. This is one reason for over-fitting. Multiple executions with different seed values for train-test split revealed that decision tree is over-fitting the data and also strengthened the doubts on the actual dataset itself.

Random Forest
------------

In [10]:
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
var startTime_rf = System.currentTimeMillis()
val rf = new RandomForestClassifier().setLabelCol("label").setFeaturesCol("features").setNumTrees(500)
val predictionDF_rf = rf.fit(train).transform(test)
var endTime_rf = System.currentTimeMillis()

In [11]:
//Model Evaluation
val predictionAndLabels_MCM_rf = predictionDF_rf.select("prediction","label").as[(Double,Double)].rdd
val metrics_rf = new MulticlassMetrics(predictionAndLabels_MCM_rf)
val predictionLabelsRDD_rf = predictionDF_rf.select("prediction", "label").map(r => (r.getDouble(0), r.getDouble(1)))
val bMetrics_rf = new BinaryClassificationMetrics(predictionLabelsRDD_rf.rdd)
println("Confusion Matrix of Random Forest is")
println(metrics_rf.confusionMatrix)
println("\nThe Accuracy of Random Forest is")
println(metrics_rf.accuracy)
println("\nThe ROC for Random Forest is")
println(bMetrics_rf.roc.collect().foreach(print))
print("\n")
println("\nThe Area under ROC for Random Forest is")
println(bMetrics_rf.areaUnderROC)
println("\nThe Precision of Random Forest is")
println(metrics_rf.precision)
println("\nThe Precision by Threshold for Random Forest is")
println(bMetrics_rf.precisionByThreshold.collect().foreach(print))
println("\nExecution Time for Training and Prediction of Random Forest is")
println(endTime_rf-startTime_rf + "ms")

Confusion Matrix of Random Forest is
7473.0  4.0    
46.0    292.0  

The Accuracy of Random Forest is
0.9936020473448497

The ROC for Random Forest is
(0.0,0.0)(5.349739200213989E-4,0.863905325443787)(1.0,1.0)(1.0,1.0)()

The Area under ROC for Random Forest is
0.9316851757618828

The Precision of Random Forest is
0.9936020473448497

The Precision by Threshold for Random Forest is
(1.0,0.9864864864864865)(0.0,0.043250159948816376)()

The Model Execution Time for Random Forest is
22949ms


### Analysis

Unlike Decision Trees, Random Forest has bootstrapping aggragation and other techniques incorporated to the model itself for better performance. To get the best out of the model we have set the number of trees to 500 as well. This increased the execution time significantly compared to the logistic regression and decision tree. The accuracy was expecting to be high but the 'Precision by Threshold' evaluation paramater revealed that our dataset is having imbalanced data.

Gradient-boosted tree classifier
--------------------------------

In [6]:
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
var startTime_gbt = System.currentTimeMillis()
val gbt = new GBTClassifier().setLabelCol("label").setFeaturesCol("features").setMaxIter(10)
val predictionDF_gbt = gbt.fit(train).transform(test)
var endTime_gbt = System.currentTimeMillis()

In [7]:
//Model Evaluation
val predictionAndLabels_MCM_gbt = predictionDF_gbt.select("prediction","label").as[(Double,Double)].rdd
val metrics_gbt = new MulticlassMetrics(predictionAndLabels_MCM_gbt)
val predictionLabelsRDD_gbt = predictionDF_gbt.select("prediction", "label").map(r => (r.getDouble(0), r.getDouble(1)))
val bMetrics_gbt = new BinaryClassificationMetrics(predictionLabelsRDD_gbt.rdd)
println("Confusion Matrix of Gradient-Boosted Tree is")
println(metrics_gbt.confusionMatrix)
println("\nThe Accuracy of Gradient-Boosted Tree is")
println(metrics_gbt.accuracy)
println("\nThe ROC for Gradient-Boosted Tree is")
println(bMetrics_gbt.roc.collect().foreach(print))
print("\n")
println("\nThe Area under ROC for Gradient-Boosted Tree is")
println(bMetrics_gbt.areaUnderROC)
println("\nThe Precision of Gradient-Boosted Tree is")
println(metrics_gbt.precision)
println("\nThe Precision by Threshold for Gradient-Boosted Tree is")
println(bMetrics_gbt.precisionByThreshold.collect().foreach(print))
println("\nExecution Time for Training and Prediction of Gradient-Boosted Tree is")
println(endTime_gbt-startTime_gbt + "ms")

Confusion Matrix of Gradient-Boosted Tree is
7471.0  6.0    
29.0    309.0  

The Accuracy of Gradient-Boosted Tree is
0.9955214331413947

The ROC for Gradient-Boosted Tree is
(0.0,0.0)(8.024608800320985E-4,0.9142011834319527)(1.0,1.0)(1.0,1.0)()

The Area under ROC for Gradient-Boosted Tree is
0.9566993612759602

The Precision of Gradient-Boosted Tree is
0.9955214331413947

The Precision by Threshold for Gradient-Boosted Tree is
(1.0,0.9809523809523809)(0.0,0.043250159948816376)()

Execution Time for Training and Prediction of Gradient-Boosted Tree is
9647ms


### Analysis

Gradient-Boosted Tree which is considered as the best performing model of all the tree models and was expected to shine in model evaluation. But since we got an idea from Random Forest evaluation that we are dealing with an highly imbalanced dataset, the only parameter of interest in this evaluation was 'Precision By Threshold'. This again proved that our data is imbalanced and the evaluation parameters are not acceptable

## CASE STUDY 1 CONCLUSION

It has been observed that there is a significant imbalance in data. A clear analysis on the dataset revealed that out of 26217 records available 25037 records states the person is 'alive' and only 1180 records corresponds to dead people. Training any model with this kind of data will not train the model effectively and most of the time the model will tend to predict the person is 'alive', which is happening in our scenario. So either we will have to under-sample the data or over-sample it. Since in our case under-sampling would reduce the number of records significantly, over-sampling will be the best approach. Alternately we can assign weights for the records as well to train the model. This leads us to CASE STUDY 2. It has also been observed that under default parameters Logistics regression's execution time is the fastest while Random Forest is the slowest. Gradient Boost Tree is a better choice in that regard too.