# Predictive Data Analytics

In [1]:
import os

from pyspark.sql import Row
from pyspark.sql import SparkSession, DataFrame, Row
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, split, when, sin, cos
from pyspark.sql.types import IntegerType, StringType
from pyspark import keyword_only
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.linalg import Vector
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Tokenizer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import Evaluator, RegressionEvaluator 
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np
import math
from pprint import pprint

## Read Hive tables

### Connect to Hive

In [2]:
# Add here your team number teamx
team = "team13"

# Location of Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder \
        .appName("{} - spark ML".format(team)) \
        .master("yarn") \
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883") \
        .config("spark.sql.warehouse.dir", warehouse) \
        .config("spark.sql.avro.compression.codec", "snappy") \
        .config("spark.executor.instances", 8) \
        .config("spark.executor.cores", 1) \
        .config("spark.executor.memory", "2g") \
        .config("spark.dynamicAllocation.enabled", "false") \
        .enableHiveSupport() \
        .getOrCreate()

In [3]:
spark

### List all databases

In [4]:
spark.sql("SHOW DATABASES;").show()

+--------------------+
|           namespace|
+--------------------+
|             default|
|             root_db|
|     team0_projectdb|
|team12_hive_proje...|
|    team13_projectdb|
|    team14_projectdb|
|    team15_projectdb|
|    team16_projectdb|
|    team17_projectdb|
|    team18_projectdb|
|    team19_projectdb|
|     team1_projectdb|
|    team20_projectdb|
|    team21_projectdb|
|    team22_projectdb|
|    team23_projectdb|
|    team24_projectdb|
|    team25_projectdb|
|    team26_projectdb|
|    team27_projectdb|
+--------------------+
only showing top 20 rows



### List all tables

In [5]:
print(spark.catalog.listTables("team13_projectdb"))

[Table(name='acquisitions', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='degrees', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funding_rounds', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funding_rounds_part', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funds', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='investments', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='ipos', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='milestones', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='objects', database='team13_projectdb', description=None, tableType='EXTER

### Read Hive tables

In [6]:
objects = spark.read.format("avro").table('team13_projectdb.objects_part')

fund_rounds = spark.read.format("avro").table('team13_projectdb.funding_rounds_part')

## ML Modeling

### Preprocessing the data

#### Feature selection


In [7]:
# Select features and the label
obj_features = ['id', 'status', 'category_code', 'country_code', 'investment_rounds', 'invested_companies', 'milestones', 'relationships']

fund_features = ['object_id', 'funded_at', 'funding_round_type', 'participants', 'is_first_round', 'is_last_round']
label = 'raised_amount_usd'

In [8]:
objects = objects.select(obj_features)
fund_rounds = fund_rounds.select(fund_features + [label])

In [9]:
# Join tables to form one Dataframe for the ML task
final = objects.join(fund_rounds, objects['id'] == fund_rounds['object_id'], how='right').drop('id').drop('object_id')

In [10]:
final.show(truncate=False)

+---------+----------------+------------+-----------------+------------------+----------+-------------+----------+------------------+------------+--------------+-------------+-----------------+
|status   |category_code   |country_code|investment_rounds|invested_companies|milestones|relationships|funded_at |funding_round_type|participants|is_first_round|is_last_round|raised_amount_usd|
+---------+----------------+------------+-----------------+------------------+----------+-------------+----------+------------------+------------+--------------+-------------+-----------------+
|operating|enterprise      |USA         |0                |0                 |3         |16           |2010-09-24|series-a          |2           |1             |0            |5600000.00       |
|closed   |web             |null        |0                |0                 |0         |2            |2008-01-01|angel             |0           |1             |1            |100000.00        |
|operating|other           |US

In [11]:
leisure = ['games_video', 'photo_video', 'social', 'hospitality', 'sports', 'fashion', 'messaging', 'music']
bizsupport = ['network_hosting', 'advertising', 'enterprise', 'consulting', 'analytics', 'public_relations', 'security', 'legal']
building = ['cleantech', 'manufacturing', 'semiconductor', 'automotive', 'real_eastate', 'nanotech']
petcare = ['pets']
travel = ['travel', 'transportation']
health = ['health', 'medical', 'biotech']
other = ['web', 'other', 'mobile', 'software', 'finance', 'education', 'ecommerce', 'search', 'hardware', 'news', 'government', 'nonprofit', 'local']

@udf(returnType=StringType())
def map_category_code(category_code):
    if category_code in leisure:
        return 'leisure'
    elif category_code in bizsupport:
        return 'bizsupport'
    elif category_code in building:
        return 'building'
    elif category_code in petcare:
        return 'petcare'
    elif category_code in travel:
        return 'travel'
    elif category_code in health:
        return 'health'
    else:
        return 'other'

final = final.withColumn('category_code', map_category_code(final['category_code']))

In [12]:
Africa = ['AGO', 'BDI', 'BEN', 'BWA', 'CIV', 'CMR', 'DZA', 'EGY', 'ETH', 'GHA', 'GIN', 'KEN', 'LSO', 'MAR', 'MDG', 'MUS', 'NAM', 'NER','NGA', 'REU','RWA', 'SDN','SEN', 'SLE', 'SOM','SWZ', 'SYC', 'TUN', 'TZA', 'UGA', 'ZAF', 'ZMB', 'ZWE']
Asia = ['AFG', 'ARE', 'BGD', 'BHR', 'BRN', 'CHN', 'HKG', 'IDN', 'IND', 'IOT', 'IRN', 'IRQ', 'ISR','JOR', 'JPN', 'KAZ', 'KGZ', 'KHM', 'KOR', 'KWT','LAO', 'LBN', 'LKA', 'MAC', 'MDV', 'MMR', 'MYS', 'NPL', 'OMN', 'PAK', 'PCN','PHL','PRK','PST', 'QAT', 'SAU', 'SGP','SYR', 'THA', 'TJK', 'TWN', 'UZB', 'VNM', 'YEM']
Europe = ['AIA', 'ALB', 'AND', 'ARM', 'AUT', 'AZE', 'BEL', 'BGR','BIH', 'BLR', 'CHE', 'CYP', 'CZE', 'DEU', 'DNK','ESP', 'EST', 'FIN', 'FRA', 'GBR', 'GEO', 'GIB', 'GLB', 'GRC', 'HRV', 'HUN', 'IRL', 'ISL', 'ITA', 'LIE', 'LTU','LUX', 'LVA', 'MCO', 'MDA', 'MKD', 'MLT', 'NLD', 'NOR', 'POL', 'PRT', 'ROM', 'RUS', 'SMR', 'SVK', 'SVN','SWE', 'TUR', 'UKR']
North_America = ['ATG', 'BHS','BLZ', 'BMU', 'BRB', 'CAN', 'CRI','CUB','CYM', 'DMA', 'GRD', 'GTM', 'HND', 'HTI', 'JAM', 'MEX', 'MTQ', 'PAN', 'PRI', 'SLV', 'UMI','USA', 'VGB', 'VIR']
South_America = ['ARG', 'BOL', 'BRA', 'CHL', 'COL', 'DOM', 'ECU', 'NIC', 'PER', 'PRY', 'SUR', 'TTO', 'URY','VEN', 'VCT']
Other = ['ANT', 'ARA', 'AUS', 'CSS', 'FST', 'HMI','NCL', 'NFK','NRU', 'NZL']

@udf(returnType=StringType())
def map_country(country_code):
    if country_code in Africa:
        return 'Africa'
    elif country_code in Asia:
        return 'Asia'
    elif country_code in Europe:
        return 'Europe'
    elif country_code in North_America:
        return 'North_America'
    elif country_code in South_America:
        return 'South_America'
    else:
        return 'Other'
    
    
final = final.withColumn('country_code', map_country(final['country_code']))

In [13]:
# Split funded_at with datetime to year, month and day
split_col = F.split(F.to_date("funded_at"), "-")
final = final.withColumn("funded_year", split_col.getItem(0).cast(IntegerType())) \
            .withColumn("funded_month", split_col.getItem(1).cast(IntegerType())) \
            .withColumn("funded_day", split_col.getItem(2).cast(IntegerType()))
# Remove funded_at
final = final.drop("funded_at")

# Drop all records which contain nulls
final = final.na.drop()

# Remove 0 values in raised_amount_usd (label) 
final = final.filter("raised_amount_usd > 0")

# Transform raised_amount_usd (label) 
final = final.withColumn("raised_amount_usd", F.log10(col("raised_amount_usd")))

# Rename label
final = final.withColumnRenamed("raised_amount_usd", "label")

# Saving intermediate results
final = final.cache()

In [14]:
print("Size of the DataFrame: {} rows, {} columns".format(final.count(), len(final.columns)))

Size of the DataFrame: 46460 rows, 15 columns


In [15]:
final.printSchema()

root
 |-- status: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- investment_rounds: integer (nullable = true)
 |-- invested_companies: integer (nullable = true)
 |-- milestones: integer (nullable = true)
 |-- relationships: integer (nullable = true)
 |-- funding_round_type: string (nullable = true)
 |-- participants: integer (nullable = true)
 |-- is_first_round: integer (nullable = true)
 |-- is_last_round: integer (nullable = true)
 |-- label: double (nullable = true)
 |-- funded_year: integer (nullable = true)
 |-- funded_month: integer (nullable = true)
 |-- funded_day: integer (nullable = true)



In [16]:
final.show(truncate=False)

+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+------------------+-----------+------------+----------+
|status   |category_code|country_code |investment_rounds|invested_companies|milestones|relationships|funding_round_type|participants|is_first_round|is_last_round|label             |funded_year|funded_month|funded_day|
+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+------------------+-----------+------------+----------+
|closed   |other        |North_America|0                |0                 |3         |4            |series-a          |0           |1             |0            |6.361727836017593 |2011       |4           |25        |
|operating|health       |North_America|0                |0                 |0         |0            |series-a          |1       

#### Building the Pipeline

In [17]:
# Extract categorical, numerical and cyclical features
categorical_cols = ['status', 'funding_round_type', 'category_code', 'country_code']

numerical_cols = ['investment_rounds', 'invested_companies', 'milestones', 'relationships', 'funded_year', 'participants', 'is_first_round', 'is_last_round']

cyclical_cols = ['funded_month', 'funded_day']
periods = [12, 31]  # periods for months and days accordingly

In [18]:
# Build a custom tranformer to encode cyclical features
class CyclicTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString)
    outputCol = Param(Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString)
  
    @keyword_only
    def __init__(self, inputCol: str = "input", outputCol: str = "output", period: int = 12):
        super(CyclicTransformer, self).__init__()
        self._setDefault(inputCol=None, outputCol=None)
        kwargs = self._input_kwargs
        del(kwargs["period"])
        self.set_params(**kwargs)
        self.period = period
    
    @keyword_only
    def set_params(self, inputCol: str = "input", outputCol: str = "output"):
        kwargs = self._input_kwargs
        self._set(**kwargs)
    
    def getInputCol(self):
        return self.getOrDefault(self.inputCol)
  
    def getOutputCol(self):
        return self.getOrDefault(self.outputCol)
  
    def _transform(self, df: DataFrame):
        input_col = self.getInputCol()
        output_col = self.getOutputCol()
        
        sin_col = sin(2 * math.pi * df[input_col] / self.period) 
        cos_col = cos(2 * math.pi * df[input_col] / self.period)
       
        return df.withColumn(output_col + "_sin", sin_col).withColumn(output_col + "_cos", cos_col)

In [19]:
# Create String indexer to assign index for the string fields where each unique string will get a unique index
indexers = [StringIndexer(inputCol=c, 
                           outputCol="{0}_indexed".format(c)).setHandleInvalid("skip") for c in categorical_cols]

# Encode strings using One Hot encoding
encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(), 
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers]

# Encode cyclical features using custom Cyclic Transformer
cyclic_transformers = [CyclicTransformer(inputCol=col, 
                                         outputCol=col + "_cyc_enc", 
                                         period=period) for col, period in zip(cyclical_cols, periods)]

# This will concatenate the input cols into a single column
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] + 
                            [transformer.getOutputCol() + '_sin' for transformer in cyclic_transformers] + 
                            [transformer.getOutputCol() + '_cos' for transformer in cyclic_transformers] + 
                            numerical_cols, 
                            outputCol= "features")

# Create a pipeline to use only a single fit and transform on the data
pipeline = Pipeline(stages=indexers + encoders + cyclic_transformers + [assembler])

# Fit the pipeline ==> This will call the fit functions for all transformers if exist
model = pipeline.fit(final)

# Fit the pipeline ==> This will call the transform functions for all transformers
data = model.transform(final)

data.show()

+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+------------------+-----------+------------+----------+--------------+--------------------------+---------------------+--------------------+----------------------+----------------------------------+-----------------------------+----------------------------+------------------------+------------------------+----------------------+----------------------+--------------------+
|   status|category_code| country_code|investment_rounds|invested_companies|milestones|relationships|funding_round_type|participants|is_first_round|is_last_round|             label|funded_year|funded_month|funded_day|status_indexed|funding_round_type_indexed|category_code_indexed|country_code_indexed|status_indexed_encoded|funding_round_type_indexed_encoded|category_code_indexed_encoded|country_code_indexed_encoded|funded_month_cyc_enc_sin|funded_month_cyc_en

In [20]:
# Delete all features and keep only the features and label columns
data = data.select(["features", "label"])

data.show()

+--------------------+------------------+
|            features|             label|
+--------------------+------------------+
|(34,[2,5,11,17,22...| 6.361727836017593|
|(34,[0,5,13,17,22...| 6.845098040014257|
|(34,[0,5,11,17,22...| 5.954242509439325|
|(34,[0,6,11,17,22...| 6.301029995663981|
|(34,[0,7,11,17,22...| 6.477121254719663|
|(34,[0,6,11,17,22...| 6.544068044350276|
|(34,[0,5,11,17,22...| 6.477121254719663|
|(34,[0,6,11,17,22...| 6.903089986991944|
|(34,[0,3,11,17,22...| 6.903089986991944|
|(34,[0,5,11,17,22...|6.3226513062053655|
|(34,[0,4,11,17,22...|               5.0|
|(34,[2,4,12,18,22...| 5.021189299069938|
|(34,[0,5,13,17,22...| 7.176091259055681|
|(34,[2,6,13,17,22...| 6.929418925714293|
|(34,[2,6,13,17,22...| 7.176091259055681|
|(34,[0,4,13,20,22...|               6.0|
|(34,[0,3,13,20,22...|   5.8750612633917|
|(34,[0,3,13,20,22...| 5.999999565705301|
|(34,[0,4,14,18,22...| 5.477121254719663|
|(34,[0,4,14,18,22...| 5.477121254719663|
+--------------------+------------

#### Split the dataset

In [21]:
# Split the data into 70% training and 30% test (it is not stratified)
(train_data, test_data) = data.randomSplit([0.7, 0.3], seed=10)

In [22]:
print("Size of training data: ({}, {})".format(train_data.count(), len(train_data.columns)))
print("Size of test data: ({}, {})".format(test_data.count(), len(test_data.columns)))

Size of training data: (32396, 2)
Size of test data: (14064, 2)


In [23]:
# Check the train test split ratio
print("Split ratio: {}, {}".format(round(train_data.count() / data.count(), 2), round(test_data.count() / data.count(), 2)))

Split ratio: 0.7, 0.3


In [24]:
# # A function to run commands
# def run(command):
#     return os.popen(command).read()

# train_data.select("features", "label")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/train")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/data/train/*.json > data/train.json")

# test_data.select("features", "label")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/test")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/data/test/*.json > data/test.json")

###  Modeling: First model

First model is Linear Regression

#### Build a model

In [25]:
# Create Linear Regression Model
lr = LinearRegression(maxIter=250, loss="huber")

# Fit the data to the lr model
model_lr = lr.fit(train_data)

#### Prediction

In [26]:
# Transform the data (Prediction)
predictions = model_lr.transform(test_data)

# Display the predictions
predictions.show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|(34,[0,3,11,17,22...| 6.096910013008056| 6.362431764762427|
|(34,[0,3,11,17,22...| 6.643452676486188| 6.405000690514136|
|(34,[0,3,11,17,22...| 6.446381812222442| 6.272228072883408|
|(34,[0,3,11,17,22...|  6.41250489864426| 6.225051773963223|
|(34,[0,3,11,17,22...| 7.447158031342219| 6.540657859431391|
|(34,[0,3,11,17,22...|  6.48572142648158|6.2490450796397425|
|(34,[0,3,11,17,22...| 5.681246666022682| 6.146219925149836|
|(34,[0,3,11,17,22...| 6.477121254719663| 6.149350380181488|
|(34,[0,3,11,17,22...| 5.433369746856586| 6.145311684056459|
|(34,[0,3,11,17,22...|               5.0| 6.038052374201484|
|(34,[0,3,11,18,22...| 5.540329474790874| 6.256805556650214|
|(34,[0,3,11,18,22...|5.5136170737878745| 6.209681238301918|
|(34,[0,3,11,21,22...|6.4327809489278645|  6.01353280186072|
|(34,[0,3,12,17,22...|6.

#### Evaluation

In [27]:
# Mean Absolute Percentage Error (MAPE) custom evaluator
class MAPEEvaluator(Evaluator):
    def __init__(self, predictionCol="prediction", labelCol="label"):
        self.predictionCol = predictionCol
        self.labelCol = labelCol

    def _evaluate(self, dataset):
        """
        Calculates the Mean Absolute Percentage Error (MAPE) for regression tasks.
        """
        predictionAndLabels = dataset.select(self.predictionCol, self.labelCol).rdd.map(
            lambda row: (float(row[self.predictionCol]), float(row[self.labelCol]))
        )

        absolutePercentageErrors = predictionAndLabels.map(
            lambda x: abs((x[1] - x[0]) / x[1]) if x[1] != 0 else 1
        )

        mape = absolutePercentageErrors.mean()

        return mape

    def isLargerBetter(self):
        """
        Indicates whether a larger value of the metric is better.
        """
        return False


In [28]:
# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator1_mape = MAPEEvaluator(labelCol="label", predictionCol="prediction")

rmse_lr = evaluator1_rmse.evaluate(predictions)
r2_lr = evaluator1_r2.evaluate(predictions)
mape_lr = evaluator1_mape.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse_lr))
print("R2 on test data = {}".format(r2_lr))
print("Mean Absolute Percentage Error (MAPE) on test data = {}".format(mape_lr))

Root Mean Squared Error (RMSE) on test data = 0.5993494243127242
R2 on test data = 0.5176322394446695
Mean Absolute Percentage Error (MAPE) on test data = 0.07671823654798413


#### Hyperparameter optimization

In [29]:
grid = ParamGridBuilder()
grid = grid.addGrid(model_lr.aggregationDepth, [2, 3, 4]) \
                    .addGrid(model_lr.regParam, [0.0, 0.001, 0.1, 0.3, 0.5]) \
                    .build()

cv = CrossValidator(estimator=lr, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator1_mape,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

LinearRegressionModel: uid=LinearRegression_d8f7b6ca238d, numFeatures=34

#### Select the best model

In [30]:
model1 = bestModel
pprint(model1.extractParamMap())

{Param(parent='LinearRegression_d8f7b6ca238d', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LinearRegression_d8f7b6ca238d', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_d8f7b6ca238d', name='solver', doc='The solver algorithm for optimization. Supported options: auto, normal, l-bfgs.'): 'auto',
 Param(parent='LinearRegression_d8f7b6ca238d', name='tol', doc='the convergence tolerance for iterative algorithms (>= 0).'): 1e-06,
 Param(parent='LinearRegression_d8f7b6ca238d', name='standardization', doc='whether to standardize the training features before fitting the model.'): True,
 Param(parent='LinearRegression_d8f7b6ca238d', name='maxBlockSizeInMB', doc='maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Mus

#### Save the model to HDFS

In [31]:
# model1.write().overwrite().save("project/models/model1")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model1 models/model1")

#### Prediction of the best model 1

In [32]:
predictions = model1.transform(test_data)
predictions.show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|(34,[0,3,11,17,22...| 6.096910013008056| 6.360975218948255|
|(34,[0,3,11,17,22...| 6.643452676486188| 6.403264378348062|
|(34,[0,3,11,17,22...| 6.446381812222442| 6.272706751601412|
|(34,[0,3,11,17,22...|  6.41250489864426| 6.225256499506841|
|(34,[0,3,11,17,22...| 7.447158031342219| 6.538308897098318|
|(34,[0,3,11,17,22...|  6.48572142648158| 6.249977965607215|
|(34,[0,3,11,17,22...| 5.681246666022682| 6.145467515156838|
|(34,[0,3,11,17,22...| 6.477121254719663|  6.15008529305522|
|(34,[0,3,11,17,22...| 5.433369746856586| 6.144640756518086|
|(34,[0,3,11,17,22...|               5.0| 6.038132563124364|
|(34,[0,3,11,18,22...| 5.540329474790874| 6.257408768784842|
|(34,[0,3,11,18,22...|5.5136170737878745| 6.210689735226502|
|(34,[0,3,11,21,22...|6.4327809489278645| 6.013243855391906|
|(34,[0,3,12,17,22...|6.

In [33]:
# predictions.select("label", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/model1_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model1_predictions.csv/*.csv > output/model1_predictions.csv")

#### Evaluation of the best model 1

In [34]:
# Evaluate the performance of the best model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse1 = evaluator1_rmse.evaluate(predictions)
r2_1 = evaluator1_r2.evaluate(predictions)
mape1 = evaluator1_mape.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse1))
print("R2 on test data = {}".format(r2_1))
print("Mean Absolute Percentage Error on test data = {}".format(mape1))

Root Mean Squared Error (RMSE) on test data = 0.599693958342396
R2 on test data = 0.5170775050307348
Mean Absolute Percentage Error on test data = 0.07676012768649511


###  Modeling: Second model


Second model is Gradient-Boosted Tree Regression

#### Build a model

In [35]:
# Create Gradient-Boosted Tree regression Model
gbt = GBTRegressor()

# Fit the data to the model
model_gbt = gbt.fit(train_data)

#### Prediction

In [36]:
# Transform the data (Prediction)
predictions = model_gbt.transform(test_data)

# Display the predictions
predictions.show()

+--------------------+------------------+-----------------+
|            features|             label|       prediction|
+--------------------+------------------+-----------------+
|(34,[0,3,11,17,22...| 6.096910013008056|  6.6680415553615|
|(34,[0,3,11,17,22...| 6.643452676486188|6.597476503702703|
|(34,[0,3,11,17,22...| 6.446381812222442|6.481142767543754|
|(34,[0,3,11,17,22...|  6.41250489864426|6.172575002574308|
|(34,[0,3,11,17,22...| 7.447158031342219| 6.63546706672932|
|(34,[0,3,11,17,22...|  6.48572142648158|5.965375681646289|
|(34,[0,3,11,17,22...| 5.681246666022682|5.987523954970649|
|(34,[0,3,11,17,22...| 6.477121254719663|6.215433503464742|
|(34,[0,3,11,17,22...| 5.433369746856586|5.948941517682816|
|(34,[0,3,11,17,22...|               5.0|5.936341791161156|
|(34,[0,3,11,18,22...| 5.540329474790874|6.306289701080124|
|(34,[0,3,11,18,22...|5.5136170737878745|6.470168523043724|
|(34,[0,3,11,21,22...|6.4327809489278645|5.891534191974477|
|(34,[0,3,12,17,22...|6.6074549159814495

#### Evaluation

In [37]:
# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator2_mape = MAPEEvaluator(labelCol="label", predictionCol="prediction")

rmse_gbt = evaluator2_rmse.evaluate(predictions)
r2_gbt = evaluator2_r2.evaluate(predictions)
mape_gbt = evaluator2_mape.evaluate(predictions)


print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse_gbt))
print("R2 on test data = {}".format(r2_gbt))
print("Mean Absolute Percentage Error (MAPE) on test data = {}".format(mape_gbt))

Root Mean Squared Error (RMSE) on test data = 0.5750874156723099
R2 on test data = 0.5558948420775381
Mean Absolute Percentage Error (MAPE) on test data = 0.07287055902938512


#### Hyperparameter optimization

In [38]:
grid = ParamGridBuilder()
grid = (ParamGridBuilder().addGrid(model_gbt.maxDepth, [5, 10]) \
             .addGrid(model_gbt.lossType, ['absolute', 'squared']) \
             .build())

cv = CrossValidator(estimator=gbt, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator2_mape,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

GBTRegressionModel: uid=GBTRegressor_ef83434e5c5f, numTrees=20, numFeatures=34

#### Select the best model

In [39]:
model2 = bestModel
pprint(model2.extractParamMap())

{Param(parent='GBTRegressor_ef83434e5c5f', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,
 Param(parent='GBTRegressor_ef83434e5c5f', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,
 Param(parent='GBTRegressor_ef83434e5c5f', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',
 Param(parent='GBTRegressor_ef83434e5c5f', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the featu

#### Save the model to HDFS

In [40]:
# model2.write().overwrite().save("project/models/model2")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model2 models/model2")

#### Prediction of the best model 2

In [41]:
predictions = model2.transform(test_data)
predictions.show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|(34,[0,3,11,17,22...| 6.096910013008056| 6.668041555361505|
|(34,[0,3,11,17,22...| 6.643452676486188| 6.597476503702703|
|(34,[0,3,11,17,22...| 6.446381812222442| 6.481142767543754|
|(34,[0,3,11,17,22...|  6.41250489864426|  6.17257500257431|
|(34,[0,3,11,17,22...| 7.447158031342219|  6.63546706672932|
|(34,[0,3,11,17,22...|  6.48572142648158|5.9653756816462895|
|(34,[0,3,11,17,22...| 5.681246666022682| 5.987523954970649|
|(34,[0,3,11,17,22...| 6.477121254719663| 6.215433503464744|
|(34,[0,3,11,17,22...| 5.433369746856586| 5.948941517682817|
|(34,[0,3,11,17,22...|               5.0| 5.936341791161157|
|(34,[0,3,11,18,22...| 5.540329474790874| 6.306289701080125|
|(34,[0,3,11,18,22...|5.5136170737878745| 6.470168523043724|
|(34,[0,3,11,21,22...|6.4327809489278645| 5.891534191974476|
|(34,[0,3,12,17,22...|6.

In [42]:
# predictions.select("label", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/model2_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model2_predictions.csv/*.csv > output/model2_predictions.csv")

#### Evaluation of the best model 2

In [43]:
# Evaluate the performance of the best model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator2_mape = MAPEEvaluator(labelCol="label", predictionCol="prediction")

rmse2 = evaluator2_rmse.evaluate(predictions)
r2_2 = evaluator2_r2.evaluate(predictions)
mape2 = evaluator2_mape.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R2 on test data = {}".format(r2_2))
print("Mean Absolute Percentage Error on test data = {}".format(mape2))

Root Mean Squared Error (RMSE) on test data = 0.575162710988388
R2 on test data = 0.555778542445909
Mean Absolute Percentage Error on test data = 0.07288239871189733


### Compare best models

In [44]:
# Create dataframe to report performance of the models
models = [[str(model1), rmse1, r2_1, mape1], [str(model2), rmse2, r2_2, mape2]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2", "MAPE"])
df.show(truncate=False)

+------------------------------------------------------------------------------+-----------------+------------------+-------------------+
|model                                                                         |RMSE             |R2                |MAPE               |
+------------------------------------------------------------------------------+-----------------+------------------+-------------------+
|LinearRegressionModel: uid=LinearRegression_d8f7b6ca238d, numFeatures=34      |0.599693958342396|0.5170775050307348|0.07676012768649511|
|GBTRegressionModel: uid=GBTRegressor_ef83434e5c5f, numTrees=20, numFeatures=34|0.575162710988388|0.555778542445909 |0.07288239871189733|
+------------------------------------------------------------------------------+-----------------+------------------+-------------------+



In [45]:
# # Save it to HDFS
# df.coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/evaluation.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/evaluation.csv/*.csv > output/evaluation.csv")

## Stop spark

In [46]:
spark.stop()