# Predictive Data Analytics

In [1]:
import os

from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, split, when, sin, cos
from pyspark.sql.types import IntegerType, StringType
from pyspark import keyword_only
from pyspark.ml import Transformer, Pipeline
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Tokenizer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np
import math
from pprint import pprint

## Read Hive tables

### Connect to Hive

In [2]:
# Add here your team number teamx
team = "team13"

# Location of Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder \
        .appName("{} - spark ML".format(team)) \
        .master("yarn") \
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883") \
        .config("spark.sql.warehouse.dir", warehouse) \
        .config("spark.sql.avro.compression.codec", "snappy") \
        .config("spark.executor.instances", 8) \
        .config("spark.executor.cores", 1) \
        .config("spark.executor.memory", "2g") \
        .config("spark.dynamicAllocation.enabled", "false") \
        .enableHiveSupport() \
        .getOrCreate()

In [3]:
spark

### List all databases

In [4]:
spark.sql("SHOW DATABASES;").show()

+--------------------+
|           namespace|
+--------------------+
|             default|
|             root_db|
|     team0_projectdb|
|team12_hive_proje...|
|    team13_projectdb|
|    team14_projectdb|
|    team15_projectdb|
|    team16_projectdb|
|    team17_projectdb|
|    team18_projectdb|
|    team19_projectdb|
|     team1_projectdb|
|    team20_projectdb|
|    team21_projectdb|
|    team22_projectdb|
|    team23_projectdb|
|    team24_projectdb|
|    team25_projectdb|
|    team26_projectdb|
|    team27_projectdb|
+--------------------+
only showing top 20 rows



### List all tables

In [5]:
print(spark.catalog.listTables("team13_projectdb"))

[Table(name='acquisitions', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='degrees', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funding_rounds', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funding_rounds_part', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='funds', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='investments', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='ipos', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='milestones', database='team13_projectdb', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='objects', database='team13_projectdb', description=None, tableType='EXTER

### Read Hive tables

In [6]:
objects = spark.read.format("avro").table('team13_projectdb.objects_part')

fund_rounds = spark.read.format("avro").table('team13_projectdb.funding_rounds_part')

## ML Modeling

### Preprocessing the data

#### Feature selection


In [7]:
# Select features and the label
obj_features = ['id', 'status', 'category_code', 'country_code', 'investment_rounds', 'invested_companies', 'milestones', 'relationships']

fund_features = ['object_id', 'funded_at', 'funding_round_type', 'participants', 'is_first_round', 'is_last_round']
label = 'raised_amount_usd'

In [8]:
objects = objects.select(obj_features)
fund_rounds = fund_rounds.select(fund_features + [label])

In [9]:
# Join tables to form one Dataframe for the ML task
final = objects.join(fund_rounds, objects['id'] == fund_rounds['object_id'], how='right').drop('id').drop('object_id')

In [10]:
leisure = ['games_video', 'photo_video', 'social', 'hospitality', 'sports', 'fashion', 'messaging', 'music']
bizsupport = ['network_hosting', 'advertising', 'enterprise', 'consulting', 'analytics', 'public_relations', 'security', 'legal']
building = ['cleantech', 'manufacturing', 'semiconductor', 'automotive', 'real_eastate', 'nanotech']
petcare = ['pets']
travel = ['travel', 'transportation']
health = ['health', 'medical', 'biotech']
other = ['web', 'other', 'mobile', 'software', 'finance', 'education', 'ecommerce', 'search', 'hardware', 'news', 'government', 'nonprofit', 'local']

@udf(returnType=StringType())
def map_category_code(category_code):
    if category_code in leisure:
        return 'leisure'
    elif category_code in bizsupport:
        return 'bizsupport'
    elif category_code in building:
        return 'building'
    elif category_code in petcare:
        return 'petcare'
    elif category_code in travel:
        return 'travel'
    elif category_code in health:
        return 'health'
    else:
        return 'other'

final = final.withColumn('category_code', map_category_code(final['category_code']))

In [11]:
Africa = ['AGO', 'BDI', 'BEN', 'BWA', 'CIV', 'CMR', 'DZA', 'EGY', 'ETH', 'GHA', 'GIN', 'KEN', 'LSO', 'MAR', 'MDG', 'MUS', 'NAM', 'NER','NGA', 'REU','RWA', 'SDN','SEN', 'SLE', 'SOM','SWZ', 'SYC', 'TUN', 'TZA', 'UGA', 'ZAF', 'ZMB', 'ZWE']
Asia = ['AFG', 'ARE', 'BGD', 'BHR', 'BRN', 'CHN', 'HKG', 'IDN', 'IND', 'IOT', 'IRN', 'IRQ', 'ISR','JOR', 'JPN', 'KAZ', 'KGZ', 'KHM', 'KOR', 'KWT','LAO', 'LBN', 'LKA', 'MAC', 'MDV', 'MMR', 'MYS', 'NPL', 'OMN', 'PAK', 'PCN','PHL','PRK','PST', 'QAT', 'SAU', 'SGP','SYR', 'THA', 'TJK', 'TWN', 'UZB', 'VNM', 'YEM']
Europe = ['AIA', 'ALB', 'AND', 'ARM', 'AUT', 'AZE', 'BEL', 'BGR','BIH', 'BLR', 'CHE', 'CYP', 'CZE', 'DEU', 'DNK','ESP', 'EST', 'FIN', 'FRA', 'GBR', 'GEO', 'GIB', 'GLB', 'GRC', 'HRV', 'HUN', 'IRL', 'ISL', 'ITA', 'LIE', 'LTU','LUX', 'LVA', 'MCO', 'MDA', 'MKD', 'MLT', 'NLD', 'NOR', 'POL', 'PRT', 'ROM', 'RUS', 'SMR', 'SVK', 'SVN','SWE', 'TUR', 'UKR']
North_America = ['ATG', 'BHS','BLZ', 'BMU', 'BRB', 'CAN', 'CRI','CUB','CYM', 'DMA', 'GRD', 'GTM', 'HND', 'HTI', 'JAM', 'MEX', 'MTQ', 'PAN', 'PRI', 'SLV', 'UMI','USA', 'VGB', 'VIR']
South_America = ['ARG', 'BOL', 'BRA', 'CHL', 'COL', 'DOM', 'ECU', 'NIC', 'PER', 'PRY', 'SUR', 'TTO', 'URY','VEN', 'VCT']
Other = ['ANT', 'ARA', 'AUS', 'CSS', 'FST', 'HMI','NCL', 'NFK','NRU', 'NZL']

@udf(returnType=StringType())
def map_country(country_code):
    if country_code in Africa:
        return 'Africa'
    elif country_code in Asia:
        return 'Asia'
    elif country_code in Europe:
        return 'Europe'
    elif country_code in North_America:
        return 'North_America'
    elif country_code in South_America:
        return 'South_America'
    else:
        return 'Other'
    
    
final = final.withColumn('country_code', map_country(final['country_code']))

In [12]:
# Split funded_at with datetime to year, month and day
split_col = F.split(F.to_date("funded_at"), "-")
final = final.withColumn("funded_year", split_col.getItem(0).cast(IntegerType())) \
            .withColumn("funded_month", split_col.getItem(1).cast(IntegerType())) \
            .withColumn("funded_day", split_col.getItem(2).cast(IntegerType()))
# Remove funded_at
final = final.drop("funded_at")

# Drop all records which contain nulls
final = final.na.drop()

# Convert raised_amount_usd (label) to mln usd
final = final.withColumn("raised_amount_usd", col("raised_amount_usd") / 1000000)

# Rename label
final = final.withColumnRenamed("raised_amount_usd", "label")

# Saving intermediate results
final = final.cache()

In [13]:
print("Size of the DataFrame: {} rows, {} columns".format(final.count(), len(final.columns)))

Size of the DataFrame: 52229 rows, 15 columns


In [14]:
final.printSchema()

root
 |-- status: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- investment_rounds: integer (nullable = true)
 |-- invested_companies: integer (nullable = true)
 |-- milestones: integer (nullable = true)
 |-- relationships: integer (nullable = true)
 |-- funding_round_type: string (nullable = true)
 |-- participants: integer (nullable = true)
 |-- is_first_round: integer (nullable = true)
 |-- is_last_round: integer (nullable = true)
 |-- label: decimal(27,10) (nullable = true)
 |-- funded_year: integer (nullable = true)
 |-- funded_month: integer (nullable = true)
 |-- funded_day: integer (nullable = true)



In [15]:
final.show()

+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+-------------+-----------+------------+----------+
|   status|category_code| country_code|investment_rounds|invested_companies|milestones|relationships|funding_round_type|participants|is_first_round|is_last_round|        label|funded_year|funded_month|funded_day|
+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+-------------+-----------+------------+----------+
|   closed|        other|North_America|                0|                 0|         3|            4|          series-a|           0|             1|            0| 2.3000000000|       2011|           4|        25|
|   closed|        other|North_America|                0|                 0|         3|            4|             angel|           2|             0|

#### Building the Pipeline

In [23]:
# Extract categorical, numerical and cyclical features
categorical_cols = ['status', 'funding_round_type', 'category_code', 'country_code']

numerical_cols = ['investment_rounds', 'invested_companies', 'milestones', 'relationships', 'funded_year', 'participants', 'is_first_round', 'is_last_round']

cyclical_cols = ['funded_month', 'funded_day']
periods = [12, 31]  # periods for months and days accordingly

In [17]:
# Build a custom tranformer to encode cyclical features
class CyclicTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString)
    outputCol = Param(Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString)
  
    @keyword_only
    def __init__(self, inputCol: str = "input", outputCol: str = "output", period: int = 12):
        super(CyclicTransformer, self).__init__()
        self._setDefault(inputCol=None, outputCol=None)
        kwargs = self._input_kwargs
        del(kwargs["period"])
        self.set_params(**kwargs)
        self.period = period
    
    @keyword_only
    def set_params(self, inputCol: str = "input", outputCol: str = "output"):
        kwargs = self._input_kwargs
        self._set(**kwargs)
    
    def getInputCol(self):
        return self.getOrDefault(self.inputCol)
  
    def getOutputCol(self):
        return self.getOrDefault(self.outputCol)
  
    def _transform(self, df: DataFrame):
        input_col = self.getInputCol()
        output_col = self.getOutputCol()
        
        sin_col = sin(2 * math.pi * df[input_col] / self.period) 
        cos_col = cos(2 * math.pi * df[input_col] / self.period)
       
        return df.withColumn(output_col + "_sin", sin_col).withColumn(output_col + "_cos", cos_col)

In [18]:
# Create String indexer to assign index for the string fields where each unique string will get a unique index
indexers = [StringIndexer(inputCol=c, 
                           outputCol="{0}_indexed".format(c)).setHandleInvalid("skip") for c in categorical_cols]

# Encode strings using One Hot encoding
encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(), 
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers]

# Encode cyclical features using custom Cyclic Transformer
cyclic_transformers = [CyclicTransformer(inputCol=col, 
                                         outputCol=col + "_cyc_enc", 
                                         period=period) for col, period in zip(cyclical_cols, periods)]

# This will concatenate the input cols into a single column
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] + 
                            [transformer.getOutputCol() + '_sin' for transformer in cyclic_transformers] + 
                            [transformer.getOutputCol() + '_cos' for transformer in cyclic_transformers] + 
                            numerical_cols, 
                            outputCol= "features")

# scaler = StandardScaler(inputCol="features_unscaled", outputCol="features")

# Create a pipeline to use only a single fit and transform on the data
pipeline = Pipeline(stages=indexers + encoders + cyclic_transformers + [assembler])

# Fit the pipeline ==> This will call the fit functions for all transformers if exist
model = pipeline.fit(final)

# Fit the pipeline ==> This will call the transform functions for all transformers
data = model.transform(final)

data.show()

+---------+-------------+-------------+-----------------+------------------+----------+-------------+------------------+------------+--------------+-------------+-------------+-----------+------------+----------+--------------+--------------------------+---------------------+--------------------+----------------------+----------------------------------+-----------------------------+----------------------------+------------------------+------------------------+----------------------+----------------------+--------------------+
|   status|category_code| country_code|investment_rounds|invested_companies|milestones|relationships|funding_round_type|participants|is_first_round|is_last_round|        label|funded_year|funded_month|funded_day|status_indexed|funding_round_type_indexed|category_code_indexed|country_code_indexed|status_indexed_encoded|funding_round_type_indexed_encoded|category_code_indexed_encoded|country_code_indexed_encoded|funded_month_cyc_enc_sin|funded_month_cyc_enc_cos|fund

In [19]:
# Delete all features and keep only the features and label columns
data = data.select(["features", "label"])

data.show()

+--------------------+-------------+
|            features|        label|
+--------------------+-------------+
|(34,[2,5,11,17,22...| 2.3000000000|
|(34,[2,4,11,17,22...|        0E-10|
|(34,[1,3,12,17,22...|        0E-10|
|(34,[0,5,13,17,22...| 7.0000000000|
|(34,[0,5,11,17,22...| 0.9000000000|
|(34,[0,6,11,17,22...| 2.0000000000|
|(34,[0,8,11,17,22...| 3.0000000000|
|(34,[0,5,11,17,22...| 3.0000000000|
|(34,[0,6,11,17,22...| 8.0000000000|
|(34,[0,6,11,17,22...| 3.5000000000|
|(34,[0,3,11,17,22...| 8.0000000000|
|(34,[0,5,11,17,22...| 2.1020900000|
|(34,[0,4,11,17,22...| 0.1000000000|
|(34,[2,4,12,18,22...| 0.1050000000|
|(34,[0,5,13,17,22...|15.0000000000|
|(34,[0,4,11,19,22...|        0E-10|
|(34,[0,5,11,17,22...|        0E-10|
|(34,[2,6,13,17,22...| 8.5000000000|
|(34,[2,6,13,17,22...|15.0000000000|
|(34,[0,4,13,20,22...| 1.0000000000|
+--------------------+-------------+
only showing top 20 rows



#### Split the dataset

In [20]:
# Split the data into 70% training and 30% test (it is not stratified)
(train_data, test_data) = data.randomSplit([0.7, 0.3], seed=10)

In [21]:
print("Size of training data: ({}, {})".format(train_data.count(), len(train_data.columns)))
print("Size of test data: ({}, {})".format(test_data.count(), len(test_data.columns)))

Size of training data: (36460, 2)
Size of test data: (15769, 2)


In [22]:
# Check the train test split ratio
print("Split ratio: {}, {}".format(round(train_data.count() / data.count(), 2), round(test_data.count() / data.count(), 2)))

Split ratio: 0.7, 0.3


In [None]:
# # A function to run commands
# def run(command):
#     return os.popen(command).read()

# train_data.select("features", "label")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/train")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/data/train/*.json > data/train.json")

# test_data.select("features", "label")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/test")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/data/test/*.json > data/test.json")

###  Modeling: First model

First model is Linear Regression

#### Build a model

In [24]:
# Create Linear Regression Model
lr = LinearRegression(maxIter=250)

# Fit the data to the lr model
model_lr = lr.fit(train_data)

#### Prediction

In [25]:
# Transform the data (Prediction)
predictions = model_lr.transform(test_data)

# Display the predictions
predictions.show()

+--------------------+-------------+--------------------+
|            features|        label|          prediction|
+--------------------+-------------+--------------------+
|(34,[0,3,11,17,22...| 1.2500000000|   4.450078221186459|
|(34,[0,3,11,17,22...|40.0000000000|  10.305779043113603|
|(34,[0,3,11,17,22...| 8.0000000000|   6.331969867822863|
|(34,[0,3,11,17,22...| 2.7950000000|   8.831164450182314|
|(34,[0,3,11,17,22...| 2.5852640000|   2.496692786101903|
|(34,[0,3,11,17,22...| 1.3199740000|   13.65231427517034|
|(34,[0,3,11,17,22...|25.0000000000|  -2.731071441293693|
|(34,[0,3,11,17,22...| 0.4800060000|-0.30817864484254187|
|(34,[0,3,11,17,22...| 0.3100010000|   5.042158698533797|
|(34,[0,3,11,17,22...| 0.1500000000|   4.751481596699193|
|(34,[0,3,11,17,22...| 0.1000000000|  0.6481926057722376|
|(34,[0,3,11,18,22...| 0.3470000000|  6.3700463791461175|
|(34,[0,3,11,18,22...| 0.3263000000|   7.269774491042995|
|(34,[0,3,11,18,22...|80.1570140000|   4.995470566619758|
|(34,[0,3,13,1

#### Evaluation

In [26]:
# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse_lr = evaluator1_rmse.evaluate(predictions)
r2_lr = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse_lr))
print("R2 on test data = {}".format(r2_lr))

Root Mean Squared Error (RMSE) on test data = 23.769198762181027
R2 on test data = 0.17353995478896167


#### Hyperparameter optimization

In [27]:
grid = ParamGridBuilder()
grid = grid.addGrid(model_lr.aggregationDepth, [2, 3, 4]) \
                    .addGrid(model_lr.regParam, [0.0, 0.001, 0.1, 0.3, 0.5]) \
                    .build()

cv = CrossValidator(estimator=lr, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator1_rmse,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

LinearRegressionModel: uid=LinearRegression_fa2df6ece99d, numFeatures=34

#### Select the best model

In [28]:
model1 = bestModel
pprint(model1.extractParamMap())

{Param(parent='LinearRegression_fa2df6ece99d', name='solver', doc='The solver algorithm for optimization. Supported options: auto, normal, l-bfgs.'): 'auto',
 Param(parent='LinearRegression_fa2df6ece99d', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_fa2df6ece99d', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='LinearRegression_fa2df6ece99d', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LinearRegression_fa2df6ece99d', name='maxIter', doc='max number of iterations (>= 0).'): 250,
 Param(parent='LinearRegression_fa2df6ece99d', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LinearRegression_fa2df6ece99d', name='maxBlockSizeInMB', doc='maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optim

#### Save the model to HDFS

In [None]:
# model1.write().overwrite().save("project/models/model1")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model1 models/model1")

#### Prediction of the best model 1

In [29]:
predictions = model1.transform(test_data)
predictions.show()

+--------------------+-------------+--------------------+
|            features|        label|          prediction|
+--------------------+-------------+--------------------+
|(34,[0,3,11,17,22...| 1.2500000000|   4.449536679012681|
|(34,[0,3,11,17,22...|40.0000000000|  10.305291647766353|
|(34,[0,3,11,17,22...| 8.0000000000|   6.324792571430976|
|(34,[0,3,11,17,22...| 2.7950000000|   8.831711397652896|
|(34,[0,3,11,17,22...| 2.5852640000|  2.4955240623440886|
|(34,[0,3,11,17,22...| 1.3199740000|  13.653201238547183|
|(34,[0,3,11,17,22...|25.0000000000| -2.7314351865177855|
|(34,[0,3,11,17,22...| 0.4800060000|-0.30929365122580066|
|(34,[0,3,11,17,22...| 0.3100010000|   5.045413269133633|
|(34,[0,3,11,17,22...| 0.1500000000|  4.7505448794170775|
|(34,[0,3,11,17,22...| 0.1000000000|  0.6479267478707129|
|(34,[0,3,11,18,22...| 0.3470000000|    6.36636132447012|
|(34,[0,3,11,18,22...| 0.3263000000|   7.271672006424353|
|(34,[0,3,11,18,22...|80.1570140000|   4.995332009232072|
|(34,[0,3,13,1

In [None]:
# predictions.select("label", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/model1_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model1_predictions.csv/*.csv > output/model1_predictions.csv")

#### Evaluation of the best model 1

In [30]:
# Evaluate the performance of the best model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse1 = evaluator1_rmse.evaluate(predictions)
r2_1 = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse1))
print("R2 on test data = {}".format(r2_1))

Root Mean Squared Error (RMSE) on test data = 23.76832434111929
R2 on test data = 0.17360076127929258


###  Modeling: Second model


Second model is Gradient-Boosted Tree Regression

#### Build a model

In [31]:
# Create Gradient-Boosted Tree regression Model
gbt = GBTRegressor()

# Fit the data to the model
model_gbt = gbt.fit(train_data)

#### Prediction

In [32]:
# Transform the data (Prediction)
predictions = model_gbt.transform(test_data)

# Display the predictions
predictions.show()

+--------------------+-------------+------------------+
|            features|        label|        prediction|
+--------------------+-------------+------------------+
|(34,[0,3,11,17,22...| 1.2500000000|6.9999021471457565|
|(34,[0,3,11,17,22...|40.0000000000| 9.392001244870666|
|(34,[0,3,11,17,22...| 8.0000000000| 9.100785134241058|
|(34,[0,3,11,17,22...| 2.7950000000| 6.857432630115672|
|(34,[0,3,11,17,22...| 2.5852640000| 6.676230948566066|
|(34,[0,3,11,17,22...| 1.3199740000| 6.207123199332354|
|(34,[0,3,11,17,22...|25.0000000000|3.8320913063676363|
|(34,[0,3,11,17,22...| 0.4800060000| 2.909473879517752|
|(34,[0,3,11,17,22...| 0.3100010000|4.1800395483435935|
|(34,[0,3,11,17,22...| 0.1500000000|  6.26826904691352|
|(34,[0,3,11,17,22...| 0.1000000000|3.2088145667610615|
|(34,[0,3,11,18,22...| 0.3470000000| 5.754474626454675|
|(34,[0,3,11,18,22...| 0.3263000000| 5.981548302474395|
|(34,[0,3,11,18,22...|80.1570140000| 4.909962062695975|
|(34,[0,3,13,17,22...| 9.4000000000|2.7930275361

#### Evaluation

In [33]:
# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse_gbt = evaluator2_rmse.evaluate(predictions)
r2_gbt = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse_gbt))
print("R2 on test data = {}".format(r2_gbt))

Root Mean Squared Error (RMSE) on test data = 43.78642231815086
R2 on test data = -1.8046049812648466


#### Hyperparameter optimization

In [None]:
grid = ParamGridBuilder()
grid = (ParamGridBuilder().addGrid(model_gbt.maxDepth, [5, 10, 15, 20]) \
             .addGrid(model_gbt.lossType, ['absolute', 'squared']) \
             .build())

cv = CrossValidator(estimator=gbt, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator2_rmse,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

#### Select the best model

In [None]:
model2 = bestModel
pprint(model2.extractParamMap())

#### Save the model to HDFS

In [None]:
# model2.write().overwrite().save("project/models/model2")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model2 models/model2")

#### Prediction of the best model 2

In [None]:
predictions = model2.transform(test_data)
predictions.show()

In [None]:
# predictions.select("label", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/model2_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model2_predictions.csv/*.csv > output/model2_predictions.csv")

#### Evaluation of the best model 2

In [None]:
# Evaluate the performance of the best model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r2_2 = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R2 on test data = {}".format(r2_2))

### Compare best models

In [None]:
# Create dataframe to report performance of the models
models = [[str(model1), rmse1, r2_1], [str(model2), rmse2, r2_2]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2"])
df.show(truncate=False)

In [None]:
# # Save it to HDFS
# df.coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/evaluation.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/evaluation.csv/*.csv > output/evaluation.csv")

## Stop spark

In [35]:
spark.stop()