In [3]:
import os, sys
import pyspark
from pyspark.sql import SQLContext

from pyspark.sql import SparkSession # to work with dataframes

sqlContext = SparkSession.builder.appName("test").enableHiveSupport().getOrCreate()

import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

import pyspark.sql.functions as F # to work with dataframes siimilar to rdd.map()

import random

from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

In [4]:
#df1 = sqlContext.read.csv('/home/shashank/Documents/gitWorkspace/SFFD-Spark-Project/Data/FireDepartmentBigCleaned.csv')
df1 = sqlContext.read.parquet('/home/shashank/Documents/gitWorkspace/SFFD-Spark-Project/Data/FireDepartmentBigHalfCleaned.parquet')
df1 = df1.repartition(8)


df1.limit(5).toPandas()

Unnamed: 0,Unit_ID,Incident_Number,Call_Type,Received_DtTm,On_Scene_DtTm,Available_DtTm,Zipcode_of_Incident,Station_Area,Box,Original_Priority,Call_Type_Group,Number_of_Alarms,Unit_Type,Fire_Prevention_District,Supervisor_District,Location,RowID,Received_month,Received_Hour,response_time
0,E02,6040419,Medical Incident,2006-05-26 05:54:00,2006-05-26 05:58:00,2006-05-26 06:09:00,94133,2,1354,3,,1,ENGINE,1,3,"(37.7959000461735, -122.410798369528)",061460044-E02,5,5,4.0
1,B06,9052082,Structure Fire,2009-06-25 01:07:00,2009-06-25 01:11:00,2009-06-25 01:28:00,94110,11,555,3,,1,CHIEF,6,9,"(37.7479996809997, -122.420821864913)",091760166-B06,6,1,4.0
2,M01,1011894,Medical Incident,2001-02-08 01:40:00,2001-02-08 01:51:00,2001-02-08 02:25:00,94103,27,2316,2,,1,MEDIC,3,6,"(37.7812876870909, -122.411318055371)",010390227-M01,2,1,11.0
3,67,11056264,Medical Incident,2011-06-20 03:20:00,2011-06-20 03:42:00,2011-06-20 04:59:00,94102,1,1453,1,,1,MEDIC,3,6,"(37.7841410121878, -122.410951620282)",111710038-67,6,3,22.0
4,E43,2066310,Structure Fire,2002-08-11 06:22:00,2002-08-11 06:27:00,2002-08-11 06:38:00,94112,32,6114,3,,1,ENGINE,9,11,"(37.728489442639, -122.429995375728)",022230050-E43,8,6,5.0


# Lets go over each column and decide if we want to include them in the model and how we deal with null values in them.

## Unit_ID

#### This will not be included in training the model since this is not known ahead of time.

## Incident_Number

#### This will not be included in training the model since this is a random number assigned to identify the incident.

## Call_Type

#### This will be included in training the model. Since this is a categorical variable, we will use one hot encoding here.


In [5]:
df1.groupBy('Call_Type').count().show(25), df1.filter(df1['Call_Type'].isNull()).groupBy('Call_Type').count().show(25)

+--------------------+------+
|           Call_Type| count|
+--------------------+------+
|         Marine Fire|   237|
|Elevator / Escala...| 11101|
|  Aircraft Emergency|   482|
|Confined Space / ...|   367|
|      Administrative|    74|
|              Alarms|388555|
|Odor (Strange / U...| 10184|
|Lightning Strike ...|     6|
|Citizen Assist / ...| 61318|
|              HazMat|  2723|
|Watercraft in Dis...|   448|
|           Explosion|  1668|
|           Oil Spill|   403|
|        Vehicle Fire| 14304|
|  Suspicious Package|   235|
|   Train / Rail Fire|    21|
|Extrication / Ent...|   481|
|               Other| 50084|
|        Outside Fire| 41920|
|   Traffic Collision|144530|
|       Assist Police|   895|
|Gas Leak (Natural...| 16284|
|        Water Rescue| 12189|
|   Electrical Hazard| 12376|
|   High Angle Rescue|   816|
+--------------------+------+
only showing top 25 rows

+---------+-----+
|Call_Type|count|
+---------+-----+
+---------+-----+



(None, None)

##### Since there are no missing values here, we can just use one hot encoding here.

## Received_DtTm

#### This will not be included in training the model. This is the starting point from which we are predicting the response time.

In [6]:
df1.filter(df1.Received_DtTm.isNull()).count()

0

##### There are no missing values here. Although this won't be used in training, this column has been extracted into other columns which will be used in training.

## On_Scene_DtTm

#### This will not be included in training the model since this is essentially what we are predicting.

In [7]:
df1.filter(df1.On_Scene_DtTm.isNull()).count()

0

##### No missing values here. If there were any missing values here, we will have to drop the row.

## Available_DtTm

#### This will not be included in training the model since this is tied to a unit and there is no way of knowing this ahead of time.

## Zipcode_of_Incident

#### This will be included in training the model.

In [8]:
df1.select('Zipcode_of_Incident').distinct().count()

28

In [9]:
df1.filter(df1.Zipcode_of_Incident.isNull()).count()

2417

In [10]:
most_probable_zips = df1.groupBy('Zipcode_of_Incident')\
                        .count().orderBy('count', ascending=False)\
                        .select('Zipcode_of_Incident')\
                        .collect()[:3]

most_probable_zips_list = []
for x in most_probable_zips:
    most_probable_zips_list.append(x[0])
    
df1 = df1.fillna( { 'Zipcode_of_Incident': random.choice(most_probable_zips_list)} )

##### Zipcode is closely related to 'Box' which can be used to fill the missing values. This information will be known at the time of call so zipcode can be used for training the model.

#### Currently filled missing with top 3 zip codes.

## Station_Area

In [11]:
df1.filter(df1.Station_Area.isNull()).count(), df1.select('Station_Area').distinct().count()

(1728, 54)

In [12]:
df1.crosstab('Station_Area', 'Zipcode_of_Incident').limit(10).toPandas()

Unnamed: 0,Station_Area_Zipcode_of_Incident,94102,94103,94104,94105,94107,94108,94109,94110,94111,94112,94114,94115,94116,94117,94118,94121,94122,94123,94124,94127,94129,94130,94131,94132,94133,94134,94158
0,34.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41301,2695,0,0,0,0,0,0,0,0,0,0
1,,215,204,9,26,64,38,127,133,29,74,70,94,38,55,40,83,60,24,97,23,5,12,36,37,52,71,12
2,12.0,0,2,0,6,0,0,0,0,0,0,909,0,0,33651,50,0,12430,0,0,0,0,0,4440,0,4,0,0
3,8.0,34,20283,0,614,68021,5,0,449,390,0,0,0,0,2,37,53,131,0,0,0,0,7,0,0,0,0,10028
4,51.0,2822,0,239,0,0,1263,2,0,31,4,0,35,4,0,505,10,28,1670,1,0,8288,5,2,0,30,7,0
5,19.0,0,0,0,0,0,0,0,371,0,1,0,0,1099,0,0,0,0,0,0,3619,0,0,2,60410,0,0,0
6,23.0,0,0,0,0,0,0,0,3,0,0,0,0,9824,0,0,111,34191,0,0,0,0,0,0,440,0,0,0
7,40.0,0,0,0,0,0,0,0,0,0,0,0,0,33854,0,0,0,7303,0,0,288,0,0,0,581,0,0,0
8,9.0,3,172,0,22,2041,0,0,24104,0,0,3,0,0,0,0,0,0,0,10626,0,0,2216,0,0,0,15,0
9,15.0,0,0,0,0,0,0,0,205,0,56670,0,0,0,0,0,0,0,0,0,8840,0,0,1538,1086,0,1,0


In [13]:
x = df1.groupBy('Incident_Number').agg({"Station_Area":"count"}).show(5)

+---------------+-------------------+
|Incident_Number|count(Station_Area)|
+---------------+-------------------+
|        2070329|                  3|
|       10110217|                  2|
|        1005853|                 21|
|       16000185|                  2|
|       11074600|                  1|
+---------------+-------------------+
only showing top 5 rows



#### Although there are no missing values, this will not be included in training the model since the choosen station is based on availability of units and every station responds to multiple zip codes. A single incident can also have multiple stations responding so we just need to predict the response time considering the closest station.

## Box

#### This will not be included in training the model since this is information used only by SFFD for navigation or ease of use.

In [14]:
df1.filter(df1.Box.isNull()).count()

262

## Original_Priority

In [15]:
df1.groupBy('Original_Priority').count().show()

+-----------------+-------+
|Original_Priority|  count|
+-----------------+-------+
|                3|2737296|
|             null|  17834|
|                E| 123588|
|                B|  21538|
|                C|  11610|
|                A|  37252|
|                1| 141126|
|                I|   1885|
|                2| 598487|
+-----------------+-------+



#### This will be not included in training the model since this is a value assigned at the time of call and there is no obvious relation with other existing features.

## Call_Type_Group

#### This will be included in training the model. Since this is a categorical variable, we will use one hot encoding here.


In [16]:
df1.groupBy('Call_Type_Group').count().show()

+--------------------+-------+
|     Call_Type_Group|  count|
+--------------------+-------+
|               Alarm| 346676|
|                null|2072871|
|Potentially Life-...| 816433|
|Non Life-threatening| 397396|
|                Fire|  57240|
+--------------------+-------+



In [17]:
df1.crosstab('Call_Type_Group', 'Unit_Type').show()

+-------------------------+-------+------+------+------+-------+--------------+------------+-------+------+
|Call_Type_Group_Unit_Type|AIRPORT| CHIEF|ENGINE| MEDIC|PRIVATE|RESCUE CAPTAIN|RESCUE SQUAD|SUPPORT| TRUCK|
+-------------------------+-------+------+------+------+-------+--------------+------------+-------+------+
|                     null|   7931|135479|859260|731341|  68088|         68857|       24496|   3406|174013|
|     Non Life-threatening|      2|  2138|107186|201277|  77096|          4706|         703|   1481|  2807|
|                    Alarm|      1| 78542|155500|  7207|   1243|           731|        3944|    378| 99130|
|                     Fire|      0| 10164| 28953|  3262|    347|          2039|        2436|   1403|  8636|
|     Potentially Life-...|      1|  6468|351400|292739| 109669|         26687|        6228|   5752| 17489|
+-------------------------+-------+------+------+------+-------+--------------+------------+-------+------+



#### Surprisingly, very few calls to the Fire Department are made because of some fire and more Medic units being requested from Fire Department relative to Fire Engines. 
#### It's safe to assume the missing  Call_Type_Group_Unit_Type as 'Potentially Life-threatening' since that is by far the most occuring 'Call_Type_Group' and the missing Call_Type_Group is less than 1%.

In [18]:
df1 = df1.fillna( { 'Call_Type_Group':'Potentially Life-threatening'} )

## Number_of_Alarms

#### This will be included in training the model.

In [19]:
df1.filter(df1.Number_of_Alarms.isNull()).count()

0

In [20]:
df1.groupBy('Number_of_Alarms').agg({"response_time":"avg"}).show()

+----------------+------------------+
|Number_of_Alarms|avg(response_time)|
+----------------+------------------+
|               1|  8.20739726303008|
|               3| 18.80141592920354|
|               5|18.808988764044944|
|               4|19.360347322720695|
|               2|15.972736452372938|
+----------------+------------------+



#### There are no missing values and clearly this is a very good indicator of our target.

## Unit_Type

#### This will be included in training the model. Since this is a categorical variable, we will use one hot encoding here.

In [21]:
df1.filter(df1.Unit_Type.isNull()).count()

0

In [22]:
df1.groupBy('Unit_Type').count().show()

+--------------+-------+
|     Unit_Type|  count|
+--------------+-------+
|       AIRPORT|   7935|
|         MEDIC|1235826|
|         CHIEF| 232791|
|  RESCUE SQUAD|  37807|
|RESCUE CAPTAIN| 103020|
|         TRUCK| 302075|
|        ENGINE|1502299|
|       SUPPORT|  12420|
|       PRIVATE| 256443|
+--------------+-------+



In [23]:
df1.groupBy('Unit_Type').agg({"response_time":"avg"}).show()

+--------------+------------------+
|     Unit_Type|avg(response_time)|
+--------------+------------------+
|       AIRPORT|13.637177063642092|
|         MEDIC|10.210417971462002|
|         CHIEF| 7.641700065724191|
|  RESCUE SQUAD| 7.256936546142248|
|RESCUE CAPTAIN| 10.33191613278975|
|         TRUCK| 6.812198957212613|
|        ENGINE| 6.302190176522783|
|       SUPPORT|12.641384863123994|
|       PRIVATE|11.200633279130255|
+--------------+------------------+



#### There are no missing values. We can see some association with the target so this could be a good feature. We can also observe the 'MEDIC' and 'ENGINE' are the most common units which can be used in place of any missing values.

## Fire_Prevention_District and Supervisor_District

#### This will be included in training the model. Since this is a categorical variable, we will use one hot encoding here.

In [24]:
df1.filter(df1.Fire_Prevention_District.isNull()).count(), df1.filter(df1.Supervisor_District.isNull()).count()

(0, 0)

In [25]:
df1.groupBy('Fire_Prevention_District').count().show()

+------------------------+------+
|Fire_Prevention_District| count|
+------------------------+------+
|                       7|231614|
|                       3|544686|
|                    None| 25831|
|                       8|283886|
|                       5|269944|
|                       6|292827|
|                       9|301469|
|                       1|409025|
|                      10|300578|
|                       4|337604|
|                       2|693152|
+------------------------+------+



In [26]:
df1.groupBy('Supervisor_District').count().show()

+-------------------+------+
|Supervisor_District| count|
+-------------------+------+
|                  7|201480|
|                 11|165249|
|                  3|471274|
|               None|  2276|
|                  8|249244|
|                  5|378285|
|                  6|986468|
|                  9|325983|
|                  1|188851|
|                 10|341690|
|                  4|143811|
|                  2|236005|
+-------------------+------+



In [27]:
df1.groupBy('Supervisor_District').agg({"response_time":"avg"}).show()

+-------------------+------------------+
|Supervisor_District|avg(response_time)|
+-------------------+------------------+
|                  7| 9.131447290053604|
|                 11| 8.693916453352214|
|                  3| 7.870773690040189|
|               None|13.242970123022847|
|                  8| 8.000337019145897|
|                  5|7.5794044173044135|
|                  6| 8.282187562090204|
|                  9|  7.92877235929481|
|                  1| 8.359283244462565|
|                 10| 8.829471158067253|
|                  4| 8.888673328187691|
|                  2| 8.022414779347896|
+-------------------+------------------+



In [28]:
df1.groupBy('Fire_Prevention_District').agg({"response_time":"avg"}).show()

+------------------------+------------------+
|Fire_Prevention_District|avg(response_time)|
+------------------------+------------------+
|                       7|  8.43057846244182|
|                       3| 8.426726591100193|
|                    None|11.915992412217877|
|                       8| 9.101998689614845|
|                       5| 7.694510713333136|
|                       6| 7.938789797388902|
|                       9| 8.943931216808362|
|                       1| 7.954315750870974|
|                      10| 8.732089507548789|
|                       4| 7.571542398786744|
|                       2| 7.837135289229491|
+------------------------+------------------+



In [29]:
df1.crosstab('Supervisor_District', 'Fire_Prevention_District').show()

+--------------------------------------------+------+------+------+------+------+------+------+------+------+------+-----+
|Supervisor_District_Fire_Prevention_District|     1|    10|     2|     3|     4|     5|     6|     7|     8|     9| None|
+--------------------------------------------+------+------+------+------+------+------+------+------+------+------+-----+
|                                           8|     0|     0| 78721|     0|     0| 50098|114088|     0|     3|  6334|    0|
|                                           4|     0|     0|     0|     0|     0|     0|     0|  2923|140888|     0|    0|
|                                          11|     0|     0|     0|     0|     0|     0|     0|     0|     0|165171|   78|
|                                           9|     0| 40110| 90419|     0|     0|     0|178049|     0|     0| 17405|    0|
|                                           5|     0|     0|114212|     0| 56285|182352|     0| 15899|  9537|     0|    0|
|               

#### There is very little differencebetween response times among the different 'Supervisor_District' or 'Fire_Prevention_District' so we could use some educated guess here to fill the nul lvalues. Above cross tabulation shows, most of the missing 'Fire_Prevention_District' falls across Supervisor_District = 6 and when Supervisor_District = 6, probablity of Fire_Prevention_District falls among [2,3,4]. We could replace the nulls in Fire_Prevention_District with on of [2,3,4] and respectively we can in turn use that to replace nulls in Supervisor_District with 6.

In [30]:
df1 = df1.replace('None',None)

missing_Fire_Prevention_District = [2]*7+[3]*8+[4]

df1 = df1.fillna( { 'Fire_Prevention_District':random.choice(missing_Fire_Prevention_District),\
                   'Supervisor_District':6} )

## Location

#### This will not be included in training the model since this accurate location is recorded after the incident has been resolved.

## RowID

#### This will not be included in training the model since this is a random number assigned to identify the row.

## Received_month

#### This will be included in training the model.

## Received_Hour

#### This will be included in training the model.

## response_time

#### This will be included in training the model since this the target we are trying to predict.

In [31]:
df1 = df1.withColumn('label', F.col('response_time'))

## avg_response_history

#### This will be included in training the model. This is a very good indicator of response time and can be easily calculated.

In [32]:
df1 = df1.select(['Call_Type', 'Zipcode_of_Incident', 'Station_Area', 'Received_DtTm',
                  'Box', 'Original_Priority', 'Call_Type_Group', 
                  'Number_of_Alarms', 'Unit_Type', 'Fire_Prevention_District', 
                  'Supervisor_District', 'Received_month', 'Received_Hour', 
                  'response_time', 'label'])

df1.limit(5).toPandas()

Unnamed: 0,Call_Type,Zipcode_of_Incident,Station_Area,Received_DtTm,Box,Original_Priority,Call_Type_Group,Number_of_Alarms,Unit_Type,Fire_Prevention_District,Supervisor_District,Received_month,Received_Hour,response_time,label
0,Alarms,94114,6,2011-02-10 10:44:00,5232,3,Potentially Life-threatening,1,CHIEF,5,8,2,10,5.0,5.0
1,Medical Incident,94109,3,2016-05-05 04:15:00,3121,3,Potentially Life-Threatening,1,PRIVATE,4,6,5,4,7.0,7.0
2,Medical Incident,94116,40,2015-10-12 05:44:00,7444,3,Potentially Life-Threatening,1,MEDIC,8,4,10,5,9.0,9.0
3,Medical Incident,94131,26,2011-03-25 09:43:00,5573,3,Potentially Life-threatening,1,ENGINE,6,8,3,9,8.0,8.0
4,Medical Incident,94102,1,2001-04-19 05:55:00,1453,1,Potentially Life-threatening,1,MEDIC,3,6,4,5,15.0,15.0


In [33]:
for i in df1.columns:
    print i," = " , df1.filter(df1[i].isNull()).count()
    
df1.count()

Call_Type  =  0
Zipcode_of_Incident  =  0
Station_Area  =  1728
Received_DtTm  =  0
Box  =  262
Original_Priority  =  17834
Call_Type_Group  =  0
Number_of_Alarms  =  0
Unit_Type  =  0
Fire_Prevention_District  =  0
Supervisor_District  =  0
Received_month  =  0
Received_Hour  =  0
response_time  =  0
label  =  0


3690616

In [35]:
df1 = df1.filter(df1.Station_Area.isNotNull()).filter(df1.Box.isNotNull()).filter(df1.Original_Priority.isNotNull())

In [36]:
for i in df1.columns:
    print i," = " , df1.filter(df1[i].isNull()).count()
    
df1.count()

Call_Type  =  0
Zipcode_of_Incident  =  0
Station_Area  =  0
Received_DtTm  =  0
Box  =  0
Original_Priority  =  0
Call_Type_Group  =  0
Number_of_Alarms  =  0
Unit_Type  =  0
Fire_Prevention_District  =  0
Supervisor_District  =  0
Received_month  =  0
Received_Hour  =  0
response_time  =  0
label  =  0


3670808

In [37]:
train_df = df1.filter(df1['Received_DtTm'] < '2019-01-01 00:00:00' ).drop('Received_DtTm')
print('Train samples = ', train_df.count() )

test_df = df1.filter(df1['Received_DtTm'] >= '2019-01-01 00:00:00' ).drop('Received_DtTm')
print('Test samples = ', test_df.count() )

('Train samples = ', 3616212)
('Test samples = ', 54596)


In [38]:
column_vec_in = ['Call_Type','Zipcode_of_Incident','Call_Type_Group',\
                 'Unit_Type','Fire_Prevention_District',\
                 'Supervisor_District','Received_month','Received_Hour']

column_vec_out = ['Call_Type_Vec','Zipcode_of_Incident_Vec','Call_Type_Group_Vec',\
                 'Unit_Type_Vec','Fire_Prevention_District_Vec',\
                 'Supervisor_District_Vec','Received_month_Vec','Received_Hour_Vec']

indexers = [StringIndexer(inputCol=x, outputCol=x+'_indexed') \
           for x in column_vec_in]

encoders = [OneHotEncoder(dropLast=False, inputCol=x+'_indexed', outputCol=y) \
           for x,y in zip(column_vec_in, column_vec_out)]

tmp = [[i,j] for i,j in zip(indexers, encoders)]
tmp = [i for sublist in tmp for i in sublist]

In [40]:
cols_now = ['Call_Type_Vec','Zipcode_of_Incident_Vec','Call_Type_Group_Vec',\
            'Unit_Type_Vec','Fire_Prevention_District_Vec',\
            'Supervisor_District_Vec','Received_month_Vec','Received_Hour_Vec',\
            ]

assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')

tmp += [assembler_features]
pipeline = Pipeline(stages=tmp)

In [41]:
transformed_train_df = pipeline.fit(train_df).transform(train_df).select(['label','features'])
transformed_train_df.cache()

transformed_test_df = pipeline.fit(test_df).transform(test_df).select(['label','features'])
transformed_test_df.cache()
transformed_test_df.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [42]:
transformed_train_df.count(), transformed_test_df.count()

(3616212, 54596)

In [45]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="features",numTrees=1, maxDepth=1, seed=2)

# Train model.  This also runs the indexer.
model = rf.fit(transformed_train_df)

# Make predictions.
predictions = model.transform(transformed_test_df)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 5.79654


In [44]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Train a RandomForest model.
rf = LinearRegression(featuresCol="features")

# Train model.  This also runs the indexer.
model = rf.fit(transformed_train_df)

# Make predictions.
predictions = model.transform(transformed_test_df)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Py4JJavaError: An error occurred while calling o1998.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 448.0 failed 1 times, most recent failure: Lost task 4.0 in stage 448.0 (TID 5056, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$1: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:619)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching sizes: x.size = 101, y.size = 118
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.linalg.BLAS$.dot(BLAS.scala:104)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:707)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:644)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:215)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:214)
	... 34 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary$lzycompute(RegressionMetrics.scala:57)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary(RegressionMetrics.scala:54)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr$lzycompute(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.meanSquaredError(RegressionMetrics.scala:100)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError(RegressionMetrics.scala:109)
	at org.apache.spark.ml.evaluation.RegressionEvaluator.evaluate(RegressionEvaluator.scala:86)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$1: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:619)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.lang.IllegalArgumentException: requirement failed: BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching sizes: x.size = 101, y.size = 118
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.linalg.BLAS$.dot(BLAS.scala:104)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:707)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:644)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:215)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:214)
	... 34 more


In [None]:
mae and rmse and do logisticR