## AWS CLI code for initializing cluster:

aws emr create-cluster --auto-scaling-role EMR_AutoScaling_DefaultRole --applications Name=Hive Name=JupyterHub Name=Ganglia Name=Spark --ebs-root-volume-size 10 --ec2-attributes '{"KeyName":"aws-key1","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-05f8732b","EmrManagedSlaveSecurityGroup":"sg-0314eaf4b5c5d861d","EmrManagedMasterSecurityGroup":"sg-0314eaf4b5c5d861d"}' --service-role EMR_DefaultRole --enable-debugging --release-label emr-5.19.0 --log-uri 's3n://aws-logs-253161286339-us-east-1/elasticmapreduce/' --name 'lsdm_cluster_ps3_vf' --instance-groups '[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Name":"Master - 1"},{"InstanceCount":2,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Name":"Core - 2"}]' --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region us-east-1 --bootstrap-actions Path=s3://lsdmghosevf/lsdm_ps3_bash.sh

In [1]:
#Referenced: https://www.quora.com/How-would-I-decide-create-a-Spark-cluster-infrastructure-given-the-size-and-frequency-of-data-that-I-get

#Explanation for cluster configuration decisions: We know that the file sizes of the three parquet data sources
#used in this assignment are od (14.3GB), rac (2GB), and wac (787MB) -- for a combined total of 17.087GB.  In order to
#factor in additional space for replication (2x), as well as for applications installed on the cluster like Spark, I decided to
#separate 25% of available space on the cluster.  This means that I need (17.087Gb*2)+ .25(17.087Gb*2) = 42.718Gb.  Since
#each m5.xlarge machine has 16Gb of memory, I meed the ceiling of 41.718GB/16GB = 3 machines.  With this in mind, I chose to use 1 master
#node and 2 core machines for this assignment.

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
5,application_1543175043386_0006,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import lower, col
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

In [3]:
spark = SparkSession.builder.appName("PySparkExploration").config('spark.executor.instances', '3').config('spark.executor.memory', '16G').config('spark.executor.cores', '3').getOrCreate()


In [4]:
od = spark.read.option('header', 'true').option('inferSchema', 'true').parquet('s3://lsdm-emr-util/lsdm-data/lodes/od/od.parquet')

In [5]:
rac = spark.read.option('header', 'true').option('inferSchema', 'true').parquet('s3://lsdm-emr-util/lsdm-data/lodes/rac/rac.parquet')

In [6]:
wac = spark.read.option('header', 'true').option('inferSchema', 'true').parquet('s3://lsdm-emr-util/lsdm-data/lodes/wac/wac.parquet')

In [7]:
od.show(1)

+---------------+---------------+----+----+----+----+----+----+----+----+----+----+----------+----+
|      w_geocode|      h_geocode|s000|sa01|sa02|sa03|se01|se02|se03|si01|si02|si03|createdate|year|
+---------------+---------------+----+----+----+----+----+----+----+----+----+----+----------+----+
|271630714002025|271630712082020|   1|   0|   1|   0|   0|   1|   0|   1|   0|   0|  20160219|2012|
+---------------+---------------+----+----+----+----+----+----+----+----+----+----+----------+----+
only showing top 1 row

In [8]:
rac.show(1)

+---------------+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----------+----+
|      h_geocode|c000|ca01|ca02|ca03|ce01|ce02|ce03|cns01|cns02|cns03|cns04|cns05|cns06|cns07|cns08|cns09|cns10|cns11|cns12|cns13|cns14|cns15|cns16|cns17|cns18|cns19|cns20|cr01|cr02|cr03|cr04|cr05|cr07|ct01|ct02|cd01|cd02|cd03|cd04|cs01|cs02|createdate|year|
+---------------+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----------+----+
|260010001001000|   4|   0|   4|   0|   1|   1|   2|    0|    0|    0|    0|    0|    0|    0|    1|    1|    0|    0|    0|    0|    1|    0|    1|    0|    0|    0|    0|   4|   0|   0|   0|   0|   0|   4|   0|   0|   1| 

In [9]:
wac.show(1)

+---------------+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----------+----+
|      w_geocode|c000|ca01|ca02|ca03|ce01|ce02|ce03|cns01|cns02|cns03|cns04|cns05|cns06|cns07|cns08|cns09|cns10|cns11|cns12|cns13|cns14|cns15|cns16|cns17|cns18|cns19|cns20|cr01|cr02|cr03|cr04|cr05|cr07|ct01|ct02|cd01|cd02|cd03|cd04|cs01|cs02|cfa01|cfa02|cfa03|cfa04|cfa05|cfs01|cfs02|cfs03|cfs04|cfs05|createdate|year|
+---------------+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----------+----+
|060014001001007|  32|  11|  10|  11|  12| 

In [10]:
od.columns

['w_geocode', 'h_geocode', 's000', 'sa01', 'sa02', 'sa03', 'se01', 'se02', 'se03', 'si01', 'si02', 'si03', 'createdate', 'year']

In [11]:
rac.columns

['h_geocode', 'c000', 'ca01', 'ca02', 'ca03', 'ce01', 'ce02', 'ce03', 'cns01', 'cns02', 'cns03', 'cns04', 'cns05', 'cns06', 'cns07', 'cns08', 'cns09', 'cns10', 'cns11', 'cns12', 'cns13', 'cns14', 'cns15', 'cns16', 'cns17', 'cns18', 'cns19', 'cns20', 'cr01', 'cr02', 'cr03', 'cr04', 'cr05', 'cr07', 'ct01', 'ct02', 'cd01', 'cd02', 'cd03', 'cd04', 'cs01', 'cs02', 'createdate', 'year']

In [12]:
wac.columns

['w_geocode', 'c000', 'ca01', 'ca02', 'ca03', 'ce01', 'ce02', 'ce03', 'cns01', 'cns02', 'cns03', 'cns04', 'cns05', 'cns06', 'cns07', 'cns08', 'cns09', 'cns10', 'cns11', 'cns12', 'cns13', 'cns14', 'cns15', 'cns16', 'cns17', 'cns18', 'cns19', 'cns20', 'cr01', 'cr02', 'cr03', 'cr04', 'cr05', 'cr07', 'ct01', 'ct02', 'cd01', 'cd02', 'cd03', 'cd04', 'cs01', 'cs02', 'cfa01', 'cfa02', 'cfa03', 'cfa04', 'cfa05', 'cfs01', 'cfs02', 'cfs03', 'cfs04', 'cfs05', 'createdate', 'year']

In [13]:
od = od.withColumn('w_tract', od.w_geocode.substr(1,11)) 
od = od.withColumn('h_tract', od.h_geocode.substr(1,11)) 
rac = rac.withColumn('h_tract', rac.h_geocode.substr(0,11))
wac = wac.withColumn('w_tract', wac.w_geocode.substr(0,11))

In [14]:
od.createOrReplaceTempView("od")
rac.createOrReplaceTempView("rac")
wac.createOrReplaceTempView("wac")

In [15]:
merged_df = spark.sql("SELECT * from od left join wac on wac.w_tract = od.w_tract")

In [16]:
merged_df.createOrReplaceTempView("merged_df")

In [17]:
final_df = spark.sql("SELECT * from merged_df left join rac on rac.h_tract = merged_df.h_tract")

In [18]:
final_df.columns

['w_geocode', 'h_geocode', 's000', 'sa01', 'sa02', 'sa03', 'se01', 'se02', 'se03', 'si01', 'si02', 'si03', 'createdate', 'year', 'w_tract', 'h_tract', 'w_geocode', 'c000', 'ca01', 'ca02', 'ca03', 'ce01', 'ce02', 'ce03', 'cns01', 'cns02', 'cns03', 'cns04', 'cns05', 'cns06', 'cns07', 'cns08', 'cns09', 'cns10', 'cns11', 'cns12', 'cns13', 'cns14', 'cns15', 'cns16', 'cns17', 'cns18', 'cns19', 'cns20', 'cr01', 'cr02', 'cr03', 'cr04', 'cr05', 'cr07', 'ct01', 'ct02', 'cd01', 'cd02', 'cd03', 'cd04', 'cs01', 'cs02', 'cfa01', 'cfa02', 'cfa03', 'cfa04', 'cfa05', 'cfs01', 'cfs02', 'cfs03', 'cfs04', 'cfs05', 'createdate', 'year', 'w_tract', 'h_geocode', 'c000', 'ca01', 'ca02', 'ca03', 'ce01', 'ce02', 'ce03', 'cns01', 'cns02', 'cns03', 'cns04', 'cns05', 'cns06', 'cns07', 'cns08', 'cns09', 'cns10', 'cns11', 'cns12', 'cns13', 'cns14', 'cns15', 'cns16', 'cns17', 'cns18', 'cns19', 'cns20', 'cr01', 'cr02', 'cr03', 'cr04', 'cr05', 'cr07', 'ct01', 'ct02', 'cd01', 'cd02', 'cd03', 'cd04', 'cs01', 'cs02', 'cre

In [19]:
weights = [.8, .2]
seed = 30
training_set, test_set = final_df.randomSplit(weights, seed)

In [20]:
feature_cols = ['cns01', 'cns02', 'cns03', 'cns04', 'cns05', 'cns06', 'cns07', 'cns08', 'cns09', 'cns10', 'cns11', 'cns12', 'cns13', 'cns14']
label = 's000'

In [21]:
vAssemble = VectorAssembler(inputCols = feature_cols, outputCol = 'features')
rac_a = vAssemble.transform(rac)
rac_a.take(5)

[Row(h_geocode='260010001001000', c000=4, ca01=0, ca02=4, ca03=0, ce01=1, ce02=1, ce03=2, cns01=0, cns02=0, cns03=0, cns04=0, cns05=0, cns06=0, cns07=0, cns08=1, cns09=1, cns10=0, cns11=0, cns12=0, cns13=0, cns14=1, cns15=0, cns16=1, cns17=0, cns18=0, cns19=0, cns20=0, cr01=4, cr02=0, cr03=0, cr04=0, cr05=0, cr07=0, ct01=4, ct02=0, cd01=0, cd02=1, cd03=2, cd04=1, cs01=3, cs02=1, createdate='20170919', year=2015, h_tract='26001000100', features=SparseVector(14, {7: 1.0, 8: 1.0, 13: 1.0})), Row(h_geocode='260010001001004', c000=2, ca01=0, ca02=2, ca03=0, ce01=1, ce02=1, ce03=0, cns01=0, cns02=0, cns03=0, cns04=0, cns05=0, cns06=0, cns07=0, cns08=0, cns09=0, cns10=1, cns11=0, cns12=0, cns13=0, cns14=0, cns15=0, cns16=0, cns17=0, cns18=1, cns19=0, cns20=0, cr01=2, cr02=0, cr03=0, cr04=0, cr05=0, cr07=0, ct01=2, ct02=0, cd01=1, cd02=1, cd03=0, cd04=0, cs01=1, cs02=1, createdate='20170919', year=2015, h_tract='26001000100', features=SparseVector(14, {9: 1.0})), Row(h_geocode='260010001001005

In [22]:
from sklearn.ensemble import LinearRegression

lr = LinearRegression(labelCol=label, featuresCol = feature_cols)
paramGrid_lr = ParamGridBuilder().addGrid(lr.solver, ["l-bfgs", "normal"]).build()
evaluator_lr = RegressionEvaluator(predictionCol = "prediction_lr", labelCol = label, metricName = "rmse")
cv_lr = CrossValidator(estimator = lr, estimatorParamMaps = paramGrid_lr, evaluator = evaluator_lr, numFolds = 5)
cvModel_lr = cv.fit(training_set)
score_lr = cvModel_lr.predict(training_set)
cvModel_lr.save('s3n://aws-logs-253161286339-us-east-1')

cannot import name 'LinearRegression'
Traceback (most recent call last):
ImportError: cannot import name 'LinearRegression'



In [23]:
from sklearn import RandomForestRegressor

rf = RandomForestRegressor(labelCol =label, featuresCol = feature_cols)
paramGrid_rf = (ParamGridBuilder().addGrid(lr.maxDepth, [3,5,7]).addGrid(lr.featureSubsetStrategy, ["all","sqrt", "log2"]).addGrid(lr.numTrees, [50,200]).build())
evaluator_rf = RegressionEvaluator(predictionCol = "prediction_rf", labelCol = label, metricName = "rmse")
cv_rf = CrossValidator(estimator = rf, estimatorParamMaps = paramGrid_rf, evaluator = evaluator_rf, numFolds = 5)
cvModel_rf = cv_rf.fit(training_set)
score_rf = cvModel_rf.predict(training_set)
cvModel_rf.save('s3n://aws-logs-253161286339-us-east-1')

cannot import name 'RandomForestRegressor'
Traceback (most recent call last):
ImportError: cannot import name 'RandomForestRegressor'



In [24]:
from sklearn import GradientBoostingRegressor

gbr = GradientBoostingRegressor(labelCol =label, featuresCol = feature_cols)
paramGrid_gbr = (ParamGridBuilder().addGrid().build())
evaluator_gbr = RegressionEvaluator(predictionCol = "prediction_gbr", labelCol = label, metricName = "rmse")
cv_gbr = CrossValidator(estimator = gbr, estimatorParamMaps = paramGrid_gbr, evaluator = evaluator_gbr, numFolds = 5)
cvModel_gbr = cv_gbr.fit(training_set)
score_gbr = cvModel_gbr.predict(training_set)
cvModel_gbr.save('s3n://aws-logs-253161286339-us-east-1')

cannot import name 'GradientBoostingRegressor'
Traceback (most recent call last):
ImportError: cannot import name 'GradientBoostingRegressor'

