In [1]:
# !apt update
# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q http://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
# !tar -xvf spark-3.3.0-bin-hadoop3.tgz
# !pip install -q findspark
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"
import findspark
findspark.init()

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# %cd '/content/gdrive/My Drive/LDS9/Practice/Chapter8/'

/content/gdrive/My Drive/LDS9/Practice/Chapter8


# Demo Tree Model

### Dataset: flights.csv
- You'll build a regression model to predict flight delay or not
- With 'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration' as a predictor

First thing to do is start a Spark Session

In [2]:
import pyspark

In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [4]:
spark = SparkSession.builder.appName('Tree_demo').getOrCreate()

In [5]:
# Use Spark to read flights.csv file.
data = spark.read.csv("flights.csv",inferSchema=True,header=True)

In [6]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [7]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [8]:
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [9]:
# for item in data.head():
#     print(item)

In [10]:
data.count()

50000

In [11]:
# Remove the 'flight' column
data = data.drop('flight')

In [12]:
# Number of records with missing 'delay' values
data.filter('delay IS NULL').count()

0

In [13]:
# Remove records with missing 'delay' values
data = data.filter('delay IS NOT NULL')

In [14]:
# Remove records with missing values in any column and get the number of remaining rows
data = data.na.drop()
data.count()

50000

In [15]:
# Import the required function
from pyspark.sql.functions import round

In [16]:
# Convert 'mile' to 'km' and drop 'mile' column
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [17]:
# Create 'label' column indicating whether flight delayed (1) or not (0)
data = data.withColumn('label', (data.delay >= 0).cast('integer'))
# Check first five records
data.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| NULL|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



## Categories data

In [18]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [19]:
# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')
indexer_model = indexer.fit(data)
data_indexed = indexer_model.transform(data)

# Repeat the process for the other categorical feature
data_indexed = StringIndexer(inputCol='org',
                             outputCol='org_idx').fit(data_indexed).transform(data_indexed)

# Create an encoder
data_indexed = OneHotEncoder(inputCol="carrier_idx",
                        outputCol="carrier_vec",
                        dropLast=True).fit(data_indexed).transform(data_indexed)

data_indexed = OneHotEncoder(inputCol="org_idx",
                        outputCol="org_vec",
                        dropLast=True).fit(data_indexed).transform(data_indexed)

In [20]:
data_indexed.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|carrier_idx|org_idx|  carrier_vec|      org_vec|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| NULL|        6.0|    2.0|(8,[6],[1.0])|(7,[2],[1.0])|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|(8,[0],[1.0])|(7,[0],[1.0])|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|(8,[0],[1.0])|(7,[1],[1.0])|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+
only showing top 3 rows



## Setting Up DataFrame for Machine Learning

## Assembling columns

In [21]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [22]:
data_indexed.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label',
 'carrier_idx',
 'org_idx',
 'carrier_vec',
 'org_vec']

In [23]:
# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_vec', 'org_vec', 'km', 'depart', 'duration'
], outputCol='features')

In [24]:
data_pre = assembler.transform(data_indexed)

In [25]:
# Check the resulting column
data_pre.select('features', 'label').show(2, truncate=False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|NULL |
|(21,[1,2,3,11,18,19,20],[22.0,2.0,1.0,1.0,509.0,16.33,82.0])        |1    |
+--------------------------------------------------------------------+-----+
only showing top 2 rows



In [26]:
data_pre.show(3, False)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+--------------------------------------------------------------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|carrier_vec  |org_vec      |features                                                            |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+--------------------------------------------------------------------+
|11 |20 |6  |US     |JFK|2153|9.48  |351     |NA   |3465.0|NULL |6.0        |2.0    |(8,[6],[1.0])|(7,[2],[1.0])|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|
|0  |22 |2  |UA     |ORD|316 |16.33 |82      |30   |509.0 |1    |0.0        |0.0    |(8,[0],[1.0])|(7,[0],[1.0])|(21,[1,2,3,11,18,19,20],[22.0,2.0,1.0,1.0,509.0,16.33,82.0])        |
|2  |20 |4  |UA     |SFO|337 |6.17  |82      |-8   |542.0 |0    |0.0        |1.0    |

In [27]:
final_data = data_pre.select("features","label")
final_data.count()

50000

In [28]:
final_data = final_data.na.drop()
final_data.count()

47022

In [29]:
final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(21,[1,2,3,11,18,...|    1|
|(21,[0,1,2,3,12,1...|    0|
|(21,[0,1,2,4,11,1...|    0|
|(21,[0,1,2,3,12,1...|    1|
|(21,[0,1,2,4,11,1...|    1|
+--------------------+-----+
only showing top 5 rows



In [30]:
new_data = data_pre.select("features", "label").filter(data_pre.label.isNull())
new_data.show(3, False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|NULL |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |NULL |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |NULL |
+--------------------------------------------------------------------+-----+
only showing top 3 rows



In [31]:
new_data.count()

2978

In [32]:
train_data,test_data = final_data.randomSplit([0.8,0.2])

In [33]:
train_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|             37711|
|   mean|0.6520378669353769|
| stddev|0.4763302463396479|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [34]:
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              9311|
|   mean|0.6552464826549242|
| stddev|0.4753133636847873|
|    min|                 0|
|    max|                 1|
+-------+------------------+



# Decision Tree
- ...

In [35]:
# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassificationModel

In [39]:
tree  = DecisionTreeClassifier(featuresCol='features',
                              labelCol='label',
                              predictionCol='prediction')

In [40]:
tree_model = tree.fit(train_data)

In [41]:
test_model = tree_model.transform(test_data)

In [42]:
test_model.select('label','prediction','probability').show(3,False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.35175879396984927,0.6482412060301508]|
|1    |1.0       |[0.35175879396984927,0.6482412060301508]|
|1    |1.0       |[0.21816562778272486,0.7818343722172751]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [43]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,  BinaryClassificationEvaluator

In [44]:
multi_evaluator = MulticlassClassificationEvaluator()
acc_ = multi_evaluator.evaluate(test_model,
                               {multi_evaluator.metricName: 'accuracy'})

In [45]:
acc_

0.6789818494254108

In [46]:
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(test_model,
                               {multi_evaluator.metricName: 'areaUnderROC'})

In [47]:
auc

0.6392481238712205

In [50]:
tree_model.save('tree_model_Flights_50k_new')

In [51]:
from pyspark.ml.classification import DecisionTreeClassificationModel
tree_model2 = DecisionTreeClassificationModel.load('tree_model_Flights_50k_new')

In [52]:
unlabeled_data = new_data.select('features')

In [53]:
predictions = tree_model2.transform(unlabeled_data)

In [54]:
predictions.show(3,False)

+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|features                                                            |rawPrediction  |probability                             |prediction|
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[138.0,256.0]  |[0.350253807106599,0.649746192893401]   |1.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[419.0,370.0]  |[0.5310519645120405,0.46894803548795944]|0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[3229.0,6981.0]|[0.31625857002938296,0.683741429970617] |1.0       |
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
only showing top 3 rows



# RandomForest

In [36]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

In [55]:
rfc  = RandomForestClassifier(featuresCol='features',
                              labelCol='label',
                              predictionCol='prediction')

In [56]:
rfc_model = rfc.fit(train_data)

In [57]:
print('Number of trees: ', rfc_model.getNumTrees)
print('Relative importance of features: ', rfc_model.featureImportances)

Number of trees:  20
Relative importance of features:  (21,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],[0.27754440220854676,0.024870109332048522,0.010974450144097782,0.0005555057463611923,0.0018465415137307434,0.006528136463773494,0.04373969073530822,0.001483468067091594,0.008727270960590689,0.011231794037719506,0.033346389014303215,0.02142492521506125,0.009873274215384698,0.001821971375443212,0.0010206910973612877,0.08248430118737764,0.0028344218187786566,0.0025449415586059976,0.07247126122505815,0.29705126220860667,0.08762519187475074])


In [58]:
carrier_vec_list = ["carrier_vec"+str(i) for i in range(0, 8)]
org_vec_list = ['org_vec'+str(i) for i in range(0,7)]
carrier_vec_list
org_vec_list

['org_vec0',
 'org_vec1',
 'org_vec2',
 'org_vec3',
 'org_vec4',
 'org_vec5',
 'org_vec6']

In [59]:
feature_cols = ['mon', 'dom', 'dow']+carrier_vec_list+org_vec_list+['km', 'depart', 'duration']
feature_cols

['mon',
 'dom',
 'dow',
 'carrier_vec0',
 'carrier_vec1',
 'carrier_vec2',
 'carrier_vec3',
 'carrier_vec4',
 'carrier_vec5',
 'carrier_vec6',
 'carrier_vec7',
 'org_vec0',
 'org_vec1',
 'org_vec2',
 'org_vec3',
 'org_vec4',
 'org_vec5',
 'org_vec6',
 'km',
 'depart',
 'duration']

In [60]:
# Convert feature importances to a DataFrame
values_cols = rfc_model.featureImportances.toArray()
print(feature_cols)
print(values_cols)

['mon', 'dom', 'dow', 'carrier_vec0', 'carrier_vec1', 'carrier_vec2', 'carrier_vec3', 'carrier_vec4', 'carrier_vec5', 'carrier_vec6', 'carrier_vec7', 'org_vec0', 'org_vec1', 'org_vec2', 'org_vec3', 'org_vec4', 'org_vec5', 'org_vec6', 'km', 'depart', 'duration']
[0.2775444  0.02487011 0.01097445 0.00055551 0.00184654 0.00652814
 0.04373969 0.00148347 0.00872727 0.01123179 0.03334639 0.02142493
 0.00987327 0.00182197 0.00102069 0.0824843  0.00283442 0.00254494
 0.07247126 0.29705126 0.08762519]


In [61]:
# Convert feature importances to a pandas column
import pandas as pd
rf_df = pd.DataFrame(values_cols, columns=['importance'])

# Convert list of feature names to pandas column
rf_df['feature'] = pd.Series(feature_cols)

# Sort the data based on feature importance
rf_df.sort_values(by=['importance'], ascending=False, inplace=True)


# Inspect Results
rf_df.head(5)

Unnamed: 0,importance,feature
19,0.297051,depart
0,0.277544,mon
20,0.087625,duration
15,0.082484,org_vec4
18,0.072471,km


In [62]:
rfc_test_model = rfc_model.transform(test_data)

In [63]:
rfc_test_model.select('label','prediction','probability').show(3,False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.37567207268843655,0.6243279273115635]|
|1    |1.0       |[0.37114154825242646,0.6288584517475735]|
|1    |1.0       |[0.26834323930748955,0.7316567606925104]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [64]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,  BinaryClassificationEvaluator

In [69]:
multi_evaluator = MulticlassClassificationEvaluator()
acc_rfc = multi_evaluator.evaluate(rfc_test_model,
                               {multi_evaluator.metricName: 'accuracy'})

In [71]:
acc_rfc

0.6687788637095908

In [72]:
binary_evaluator = BinaryClassificationEvaluator()
auc_rfc = binary_evaluator.evaluate(rfc_test_model,
                               {multi_evaluator.metricName: 'areaUnderROC'})

In [73]:
auc_rfc

0.6684364087190654

In [74]:
rfc_model.save('rfc_model_Flights_50k_new')

In [75]:
from pyspark.ml.classification import RandomForestClassificationModel
rfc_model2 = RandomForestClassificationModel.load('rfc_model_Flights_50k_new')

In [76]:
unlabeled_data = new_data.select('features')

In [77]:
predictions_rfc = tree_model2.transform(unlabeled_data)

In [78]:
predictions_rfc.show(3,False)

+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|features                                                            |rawPrediction  |probability                             |prediction|
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[138.0,256.0]  |[0.350253807106599,0.649746192893401]   |1.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[419.0,370.0]  |[0.5310519645120405,0.46894803548795944]|0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[3229.0,6981.0]|[0.31625857002938296,0.683741429970617] |1.0       |
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
only showing top 3 rows



# GBT - Gradient-Boosted Trees

In [37]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel

In [79]:
gbt  = GBTClassifier(featuresCol='features',
                     labelCol='label',
                     predictionCol='prediction')

In [80]:
gbt_model = gbt.fit(train_data)

In [81]:
print('Number of trees: ', gbt_model.getNumTrees)
print('Relative importance of features: ', gbt_model.featureImportances)

Number of trees:  20
Relative importance of features:  (21,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],[0.1952687388836448,0.14357798960871457,0.1336241570513309,0.002668607262488619,0.024021094599743935,0.01272903753280383,0.02008560522826332,0.01790203408574992,0.019355110851449024,0.012389298331966026,0.0058246470936939255,0.03578731163449342,0.02887239080816879,0.019802398405262066,0.01713402251075098,0.022726363640226645,0.016527290422581595,0.007488270876826352,0.06567414154571902,0.15149478516662412,0.047046704459498136])


In [82]:
gbt_test_model = gbt_model.transform(test_data)

In [83]:
gbt_test_model.select('label','prediction','probability').show(3,False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.28008990012391427,0.7199100998760857]|
|1    |1.0       |[0.2731238444539252,0.7268761555460748] |
|1    |1.0       |[0.11910224605775438,0.8808977539422456]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [85]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,  BinaryClassificationEvaluator

In [89]:
multi_evaluator = MulticlassClassificationEvaluator()
acc_gbt = multi_evaluator.evaluate(gbt_test_model,
                               {multi_evaluator.metricName: 'accuracy'})

In [90]:
acc_gbt

0.6944474277736011

In [91]:
binary_evaluator = BinaryClassificationEvaluator()
auc_gbt = binary_evaluator.evaluate(gbt_test_model,
                               {multi_evaluator.metricName: 'areaUnderROC'})

In [92]:
auc_gbt

0.7171806266374795

In [93]:
gbt_model.save('gbt_model_Flights_50k_new')

In [95]:
from pyspark.ml.classification import GBTClassificationModel
rfc_model2 = GBTClassificationModel.load('gbt_model_Flights_50k_new')

In [96]:
unlabeled_data = new_data.select('features')

In [97]:
predictions_gbt = tree_model2.transform(unlabeled_data)

In [98]:
predictions_gbt.show(3,False)

+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|features                                                            |rawPrediction  |probability                             |prediction|
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[138.0,256.0]  |[0.350253807106599,0.649746192893401]   |1.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[419.0,370.0]  |[0.5310519645120405,0.46894803548795944]|0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[3229.0,6981.0]|[0.31625857002938296,0.683741429970617] |1.0       |
+--------------------------------------------------------------------+---------------+----------------------------------------+----------+
only showing top 3 rows

