# Predictive Data Analytics

In [1]:
import os

from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, split, when, sin, cos
from pyspark.sql.types import IntegerType, StringType
from pyspark import keyword_only
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Tokenizer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np
import math
from pprint import pprint

## Read Hive tables

### Connect to Hive

In [2]:
# Add here your team number teamx
team = "team13"

# Location of Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder \
        .appName("{} - spark ML".format(team)) \
        .master("yarn") \
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883") \
        .config("spark.sql.warehouse.dir", warehouse) \
        .config("spark.sql.avro.compression.codec", "snappy") \
        .config("spark.executor.instances", 8) \
        .config("spark.executor.cores", 1) \
        .config("spark.executor.memory", "2g") \
        .config("spark.dynamicAllocation.enabled", "false") \
        .enableHiveSupport() \
        .getOrCreate()

In [3]:
spark

### List all databases

In [4]:
spark.sql("SHOW DATABASES;").show()

+--------------------+
|           namespace|
+--------------------+
|             default|
|             root_db|
|     team0_projectdb|
|team12_hive_proje...|
|    team13_projectdb|
|    team14_projectdb|
|    team15_projectdb|
|    team16_projectdb|
|    team17_projectdb|
|    team18_projectdb|
|    team19_projectdb|
|     team1_projectdb|
|    team20_projectdb|
|    team21_projectdb|
|    team22_projectdb|
|    team23_projectdb|
|    team24_projectdb|
|    team25_projectdb|
|    team26_projectdb|
|    team27_projectdb|
+--------------------+
only showing top 20 rows



### List all tables

In [5]:
print(spark.catalog.listTables("team13_projectdb"))

[Table(name='acquisitions', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='degrees', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funding_rounds', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funding_rounds_part', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funds', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='investments', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='ipos', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='milestones', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='objects', database='team13_projectdb', description=None, tableType='EXTER

### Read Hive tables

In [6]:
objects = spark.read.format("avro").table('team13_projectdb.objects_part')

fund_rounds = spark.read.format("avro").table('team13_projectdb.funding_rounds_part')

## ML Modeling

### Preprocessing the data

#### Feature selection


In [7]:
# Select features and the label
obj_features = ['id', 'status', 'category_code', 'country_code', 'investment_rounds', 'invested_companies', 'milestones', 'relationships']

fund_features = ['object_id', 'funded_at', 'funding_round_type', 'participants', 'is_first_round', 'is_last_round']
label = 'raised_amount_usd'

In [8]:
objects = objects.select(obj_features)
fund_rounds = fund_rounds.select(fund_features + [label])

In [9]:
# Join tables to form one Dataframe for the ML task
final = objects.join(fund_rounds, objects['id'] == fund_rounds['object_id'], how='right').drop('id').drop('object_id')

In [10]:
quantiles = final.approxQuantile("raised_amount_usd", [0.25, 0.75], 0)

bottom = quantiles[0]
top = quantiles[1]

final = final.filter((final['raised_amount_usd'] >= bottom) & (final['raised_amount_usd'] <= top))


In [11]:
leisure = ['games_video', 'photo_video', 'social', 'hospitality', 'sports', 'fashion', 'messaging', 'music']
bizsupport = ['network_hosting', 'advertising', 'enterprise', 'consulting', 'analytics', 'public_relations', 'security', 'legal']
building = ['cleantech', 'manufacturing', 'semiconductor', 'automotive', 'real_eastate', 'nanotech']
petcare = ['pets']
travel = ['travel', 'transportation']
health = ['health', 'medical', 'biotech']
other = ['web', 'other', 'mobile', 'software', 'finance', 'education', 'ecommerce', 'search', 'hardware', 'news', 'government', 'nonprofit', 'local']

@udf(returnType=StringType())
def map_category_code(category_code):
    if category_code in leisure:
        return 'leisure'
    elif category_code in bizsupport:
        return 'bizsupport'
    elif category_code in building:
        return 'building'
    elif category_code in petcare:
        return 'petcare'
    elif category_code in travel:
        return 'travel'
    elif category_code in health:
        return 'health'
    else:
        return 'other'

final = final.withColumn('category_code', map_category_code(final['category_code']))

In [12]:
Africa = ['AGO', 'BDI', 'BEN', 'BWA', 'CIV', 'CMR', 'DZA', 'EGY', 'ETH', 'GHA', 'GIN', 'KEN', 'LSO', 'MAR', 'MDG', 'MUS', 'NAM', 'NER','NGA', 'REU','RWA', 'SDN','SEN', 'SLE', 'SOM','SWZ', 'SYC', 'TUN', 'TZA', 'UGA', 'ZAF', 'ZMB', 'ZWE']
Asia = ['AFG', 'ARE', 'BGD', 'BHR', 'BRN', 'CHN', 'HKG', 'IDN', 'IND', 'IOT', 'IRN', 'IRQ', 'ISR','JOR', 'JPN', 'KAZ', 'KGZ', 'KHM', 'KOR', 'KWT','LAO', 'LBN', 'LKA', 'MAC', 'MDV', 'MMR', 'MYS', 'NPL', 'OMN', 'PAK', 'PCN','PHL','PRK','PST', 'QAT', 'SAU', 'SGP','SYR', 'THA', 'TJK', 'TWN', 'UZB', 'VNM', 'YEM']
Europe = ['AIA', 'ALB', 'AND', 'ARM', 'AUT', 'AZE', 'BEL', 'BGR','BIH', 'BLR', 'CHE', 'CYP', 'CZE', 'DEU', 'DNK','ESP', 'EST', 'FIN', 'FRA', 'GBR', 'GEO', 'GIB', 'GLB', 'GRC', 'HRV', 'HUN', 'IRL', 'ISL', 'ITA', 'LIE', 'LTU','LUX', 'LVA', 'MCO', 'MDA', 'MKD', 'MLT', 'NLD', 'NOR', 'POL', 'PRT', 'ROM', 'RUS', 'SMR', 'SVK', 'SVN','SWE', 'TUR', 'UKR']
North_America = ['ATG', 'BHS','BLZ', 'BMU', 'BRB', 'CAN', 'CRI','CUB','CYM', 'DMA', 'GRD', 'GTM', 'HND', 'HTI', 'JAM', 'MEX', 'MTQ', 'PAN', 'PRI', 'SLV', 'UMI','USA', 'VGB', 'VIR']
South_America = ['ARG', 'BOL', 'BRA', 'CHL', 'COL', 'DOM', 'ECU', 'NIC', 'PER', 'PRY', 'SUR', 'TTO', 'URY','VEN', 'VCT']
Other = ['ANT', 'ARA', 'AUS', 'CSS', 'FST', 'HMI','NCL', 'NFK','NRU', 'NZL']

@udf(returnType=StringType())
def map_country(country_code):
    if country_code in Africa:
        return 'Africa'
    elif country_code in Asia:
        return 'Asia'
    elif country_code in Europe:
        return 'Europe'
    elif country_code in North_America:
        return 'North_America'
    elif country_code in South_America:
        return 'South_America'
    else:
        return 'Other'
    
    
final = final.withColumn('country_code', map_country(final['country_code']))

In [13]:
# Split funded_at with datetime to year, month and day
split_col = F.split(F.to_date("funded_at"), "-")
final = final.withColumn("funded_year", split_col.getItem(0).cast(IntegerType())) \
            .withColumn("funded_month", split_col.getItem(1).cast(IntegerType())) \
            .withColumn("funded_day", split_col.getItem(2).cast(IntegerType()))
# Remove funded_at
final = final.drop("funded_at")

# Drop all records which contain nulls
final = final.na.drop()

# Convert raised_amount_usd (label) to mln usd
# final = final.withColumn("raised_amount_usd", col("raised_amount_usd")/1000000)
final = final.withColumn("raised_amount_usd", F.log10(col("raised_amount_usd")))
                        

final = final.na.fill(0, "raised_amount_usd")

# Rename label
final = final.withColumnRenamed("raised_amount_usd", "label")

# Saving intermediate results
final = final.cache()

In [14]:
print("Size of the DataFrame: {} rows, {} columns".format(final.count(), len(final.columns)))

Size of the DataFrame: 26259 rows, 15 columns


In [15]:
final.printSchema()

root
 |-- status: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- investment_rounds: integer (nullable = true)
 |-- invested_companies: integer (nullable = true)
 |-- milestones: integer (nullable = true)
 |-- relationships: integer (nullable = true)
 |-- funding_round_type: string (nullable = true)
 |-- participants: integer (nullable = true)
 |-- is_first_round: integer (nullable = true)
 |-- is_last_round: integer (nullable = true)
 |-- label: double (nullable = false)
 |-- funded_year: integer (nullable = true)
 |-- funded_month: integer (nullable = true)
 |-- funded_day: integer (nullable = true)



In [16]:
final.show()

+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+------------------+-----------+------------+----------+
|   status|category_code| country_code|investment_rounds|invested_companies|milestones|relationships|funding_round_type|participants|is_first_round|is_last_round|             label|funded_year|funded_month|funded_day|
+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+------------------+-----------+------------+----------+
|   closed|        other|North_America|                0|                 0|         3|            4|          series-a|           0|             1|            0| 6.361727836017593|       2011|           4|        25|
|operating|        other|North_America|                0|                 0|         2|           14|          series-a|        

#### Building the Pipeline

In [17]:
# Extract categorical, numerical and cyclical features
categorical_cols = ['status', 'funding_round_type', 'category_code', 'country_code']

numerical_cols = ['investment_rounds', 'invested_companies', 'milestones', 'relationships', 'funded_year', 'participants', 'is_first_round', 'is_last_round']

cyclical_cols = ['funded_month', 'funded_day']
periods = [12, 31]  # periods for months and days accordingly

In [18]:
# Build a custom tranformer to encode cyclical features
class CyclicTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString)
    outputCol = Param(Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString)
  
    @keyword_only
    def __init__(self, inputCol: str = "input", outputCol: str = "output", period: int = 12):
        super(CyclicTransformer, self).__init__()
        self._setDefault(inputCol=None, outputCol=None)
        kwargs = self._input_kwargs
        del(kwargs["period"])
        self.set_params(**kwargs)
        self.period = period
    
    @keyword_only
    def set_params(self, inputCol: str = "input", outputCol: str = "output"):
        kwargs = self._input_kwargs
        self._set(**kwargs)
    
    def getInputCol(self):
        return self.getOrDefault(self.inputCol)
  
    def getOutputCol(self):
        return self.getOrDefault(self.outputCol)
  
    def _transform(self, df: DataFrame):
        input_col = self.getInputCol()
        output_col = self.getOutputCol()
        
        sin_col = sin(2 * math.pi * df[input_col] / self.period) 
        cos_col = cos(2 * math.pi * df[input_col] / self.period)
       
        return df.withColumn(output_col + "_sin", sin_col).withColumn(output_col + "_cos", cos_col)

In [19]:
# Create String indexer to assign index for the string fields where each unique string will get a unique index
indexers = [StringIndexer(inputCol=c, 
                           outputCol="{0}_indexed".format(c)).setHandleInvalid("skip") for c in categorical_cols]

# Encode strings using One Hot encoding
encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(), 
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers]

# Encode cyclical features using custom Cyclic Transformer
cyclic_transformers = [CyclicTransformer(inputCol=col, 
                                         outputCol=col + "_cyc_enc", 
                                         period=period) for col, period in zip(cyclical_cols, periods)]

# This will concatenate the input cols into a single column
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] + 
                            [transformer.getOutputCol() + '_sin' for transformer in cyclic_transformers] + 
                            [transformer.getOutputCol() + '_cos' for transformer in cyclic_transformers] + 
                            numerical_cols, 
                            outputCol= "features")

# scaler = StandardScaler(inputCol="features_unscaled", outputCol="features")

# Create a pipeline to use only a single fit and transform on the data
pipeline = Pipeline(stages=indexers + encoders + cyclic_transformers + [assembler])

# Fit the pipeline ==> This will call the fit functions for all transformers if exist
model = pipeline.fit(final)

# Fit the pipeline ==> This will call the transform functions for all transformers
data = model.transform(final)

data.show()

+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+------------------+-----------+------------+----------+--------------+--------------------------+---------------------+--------------------+----------------------+----------------------------------+-----------------------------+----------------------------+------------------------+------------------------+----------------------+----------------------+--------------------+
|   status|category_code| country_code|investment_rounds|invested_companies|milestones|relationships|funding_round_type|participants|is_first_round|is_last_round|             label|funded_year|funded_month|funded_day|status_indexed|funding_round_type_indexed|category_code_indexed|country_code_indexed|status_indexed_encoded|funding_round_type_indexed_encoded|category_code_indexed_encoded|country_code_indexed_encoded|funded_month_cyc_enc_sin|funded_month_cyc_en

In [20]:
# Delete all features and keep only the features and label columns
data = data.select(["features", "label"])

data.show()

+--------------------+------------------+
|            features|             label|
+--------------------+------------------+
|(34,[2,4,11,17,22...| 6.361727836017593|
|(34,[0,4,11,17,22...| 5.954242509439325|
|(34,[0,7,11,17,22...| 6.301029995663981|
|(34,[0,8,11,17,22...| 6.477121254719663|
|(34,[0,7,11,17,22...| 6.544068044350276|
|(34,[0,4,11,17,22...| 6.477121254719663|
|(34,[0,4,11,17,22...|6.3226513062053655|
|(34,[0,3,13,20,22...| 5.999999565705301|
|(34,[0,5,13,20,22...|               6.0|
|(34,[0,3,13,20,22...|   5.8750612633917|
|(34,[0,5,14,18,22...| 5.477121254719663|
|(34,[0,5,14,18,22...| 5.477121254719663|
|(34,[0,3,11,17,22...|5.4913630947819545|
|(34,[0,3,11,17,22...| 5.778151250383644|
|(34,[0,7,13,18,22...| 6.663092864465058|
|(34,[2,7,14,17,22...| 6.698970004336019|
|(34,[0,5,14,17,22...| 5.989004615698537|
|(34,[0,5,12,17,22...| 6.096910013008056|
|(34,[0,3,12,17,22...|5.7877416476056585|
|(34,[0,5,15,17,22...| 5.477121254719663|
+--------------------+------------

#### Split the dataset

In [21]:
# Split the data into 70% training and 30% test (it is not stratified)
(train_data, test_data) = data.randomSplit([0.7, 0.3], seed=10)

In [22]:
print("Size of training data: ({}, {})".format(train_data.count(), len(train_data.columns)))
print("Size of test data: ({}, {})".format(test_data.count(), len(test_data.columns)))

Size of training data: (18343, 2)
Size of test data: (7916, 2)


In [23]:
# Check the train test split ratio
print("Split ratio: {}, {}".format(round(train_data.count() / data.count(), 2), round(test_data.count() / data.count(), 2)))

Split ratio: 0.7, 0.3


In [24]:
# # A function to run commands
# def run(command):
#     return os.popen(command).read()

# train_data.select("features", "label")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/train")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/data/train/*.json > data/train.json")

# test_data.select("features", "label")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/test")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/data/test/*.json > data/test.json")

###  Modeling: First model

First model is Linear Regression

#### Build a model

In [25]:
# Create Linear Regression Model
lr = LinearRegression(maxIter=250, loss = "huber")

# Fit the data to the lr model
model_lr = lr.fit(train_data)

#### Prediction

In [26]:
# Transform the data (Prediction)
predictions = model_lr.transform(test_data)

predictions = predictions.withColumn("label", F.pow(10, "label"))
predictions = predictions.withColumn("prediction", F.pow(10, "prediction"))


# Display the predictions
predictions.show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|(34,[0,3,11,17,22...| 4400000.000000005|1793648.9556791098|
|(34,[0,3,11,17,22...|2794999.9999999986|1476224.7873354186|
|(34,[0,3,11,17,22...|3059999.9999999977|1531657.2088653855|
|(34,[0,3,11,17,22...| 480006.0000000001| 1413245.765755045|
|(34,[0,3,11,17,22...| 4999999.999999999|1500059.4492883375|
|(34,[0,3,11,17,22...| 702679.0000000006|1501051.6832866487|
|(34,[0,3,11,17,22...|1150000.0000000002|1311050.2668687566|
|(34,[0,3,11,17,22...| 600000.0000000003|1511968.7845707606|
|(34,[0,3,11,21,22...| 2708825.000000002|1359345.5196104718|
|(34,[0,3,13,17,22...| 530000.0000000001|  1872705.93126128|
|(34,[0,3,13,17,22...| 2638999.999999999|1787338.6203485902|
|(34,[0,3,13,17,22...|         1000000.0|1674777.5296831373|
|(34,[0,3,13,18,22...| 5118944.999999998|1930027.0689808875|
|(34,[0,3,13,18,22...|14

#### Evaluation

In [27]:
from pyspark.ml.evaluation import Evaluator
from pyspark.ml.linalg import Vector
from pyspark.sql import Row

class MAPEEvaluator(Evaluator):
    def __init__(self, predictionCol="prediction", labelCol="label"):
        self.predictionCol = predictionCol
        self.labelCol = labelCol

    def _evaluate(self, dataset):
        """
        Calculates the Mean Absolute Percentage Error (MAPE) for regression tasks.
        """
        predictionAndLabels = dataset.select(self.predictionCol, self.labelCol).rdd.map(
            lambda row: (float(row[self.predictionCol]), float(row[self.labelCol]))
        )

        absolutePercentageErrors = predictionAndLabels.map(
            lambda x: abs((x[1] - x[0]) / x[1]) if x[1] !=0 else 1
        )

        mape = absolutePercentageErrors.mean()

        return mape

    def isLargerBetter(self):
        """
        Indicates whether a larger value of the metric is better.
        """
        return False


In [28]:
# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator1_mape = MAPEEvaluator(labelCol="label", predictionCol="prediction")

rmse_lr = evaluator1_rmse.evaluate(predictions)
r2_lr = evaluator1_r2.evaluate(predictions)
mape_lr = evaluator1_mape.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse_lr))
print("R2 on test data = {}".format(r2_lr))
print("Mean Absolute Percentage Error on test data = {}".format(mape_lr))

Root Mean Squared Error (RMSE) on test data = 1562668.6209236805
R2 on test data = 0.23395541908638073
Mean Absolute Percentage Error on test data = 0.7771505664070809


#### Hyperparameter optimization

In [None]:
grid = ParamGridBuilder()
grid = grid.addGrid(model_lr.aggregationDepth, [2, 3, 4]) \
                    .addGrid(model_lr.regParam, [0.0, 0.001, 0.1, 0.3, 0.5]) \
                    .build()

cv = CrossValidator(estimator=lr, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator1_mape,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

#### Select the best model

In [None]:
model1 = bestModel
pprint(model1.extractParamMap())

#### Save the model to HDFS

In [None]:
# model1.write().overwrite().save("project/models/model1")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model1 models/model1")

#### Prediction of the best model 1

In [None]:
predictions = model1.transform(test_data)
predictions.show()

In [None]:
# predictions.select("label", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/model1_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model1_predictions.csv/*.csv > output/model1_predictions.csv")

#### Evaluation of the best model 1

In [None]:
# Evaluate the performance of the best model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse1 = evaluator1_rmse.evaluate(predictions)
r2_1 = evaluator1_r2.evaluate(predictions)
mape1 = evaluator1_mape.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse1))
print("R2 on test data = {}".format(r2_1))
print("Mean Absolute Percentage Error on test data = {}".format(mape1))

###  Modeling: Second model


Second model is Gradient-Boosted Tree Regression

#### Build a model

In [None]:
# Create Gradient-Boosted Tree regression Model
gbt = GBTRegressor()

# Fit the data to the model
model_gbt = gbt.fit(train_data)

#### Prediction

In [None]:
# Transform the data (Prediction)
predictions = model_gbt.transform(test_data)

# Display the predictions
predictions.show()

#### Evaluation

In [None]:
# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse_gbt = evaluator2_rmse.evaluate(predictions)
r2_gbt = evaluator2_r2.evaluate(predictions)
mape_gbt = evaluator1_mape.evaluate(predictions)


print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse_gbt))
print("R2 on test data = {}".format(r2_gbt))
print("Mean Absolute Percentage Error on test data = {}".format(mape_gbt))


#### Hyperparameter optimization

In [None]:
grid = ParamGridBuilder()
grid = (ParamGridBuilder().addGrid(model_gbt.maxDepth, [5, 10, 15, 20]) \
             .addGrid(model_gbt.lossType, ['absolute', 'squared']) \
             .build())

cv = CrossValidator(estimator=gbt, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator1_mape,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

#### Select the best model

In [None]:
model2 = bestModel
pprint(model2.extractParamMap())

#### Save the model to HDFS

In [None]:
# model2.write().overwrite().save("project/models/model2")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model2 models/model2")

#### Prediction of the best model 2

In [None]:
predictions = model2.transform(test_data)
predictions.show()

In [None]:
# predictions.select("label", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/model2_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model2_predictions.csv/*.csv > output/model2_predictions.csv")

#### Evaluation of the best model 2

In [None]:
# Evaluate the performance of the best model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r2_2 = evaluator2_r2.evaluate(predictions)
mape2 = evaluator1_mape.evaluate(predictions)


print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R2 on test data = {}".format(r2_2))
print("Mean Absolute Percentage Error on test data = {}".format(mape2))


### Compare best models

In [None]:
# Create dataframe to report performance of the models
models = [[str(model1), rmse1, r2_1, mape1], [str(model2), rmse2, r2_2, mape2]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2", "MAPE"])
df.show(truncate=False)

In [None]:
# # Save it to HDFS
# df.coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/evaluation.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/evaluation.csv/*.csv > output/evaluation.csv")

## Stop spark

In [None]:
spark.stop()