<a href="https://colab.research.google.com/github/roitraining/SparkforDataEngineers/blob/Development/Ch05_RegressionAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initialize the Spark environment and load the helper functions we have provided.

In [1]:
import sys

# Root of the course material; the helper module and the datasets live under it.
rootpath = '/home/student/ROI/Spark/'
datapath = f'{rootpath}datasets/'

# Make pyspark_helpers importable from the course root.
sys.path.append(rootpath)
from pyspark_helpers import *
# Later cells call helpers through the `pyh` alias (pyh.StringIndexEncode,
# pyh.OneHotEncode, pyh.MakeMLDataFrame), so bind it explicitly here; the
# star import above is not guaranteed to provide a name called `pyh`.
import pyspark_helpers as pyh

# initspark() comes from pyspark_helpers and returns the three handles used
# throughout the notebook.
sc, spark, conf = initspark()

import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt

# Imported last, presumably so that `display` resolves to the helper's
# version rather than any other `display` already in scope - TODO confirm.
from pyspark_helpers import display

initializing pyspark
pyspark initialized


### Read in a simple dataset of Boston Housing Prices.

In [2]:
# Alternative datasets that can be swapped in for experimentation:
#filename = 'avocado.csv'
#filename = 'HousingData.csv'
filename = 'boston.csv'

# `datapath` already ends with '/', so 'finance/' is appended directly;
# adding another separator would produce '.../datasets//finance/...'.
df = spark.read.csv(f'{datapath}finance/{filename}', header = True, inferSchema = True)
display(df)
df.printSchema()

# Keep a reference to the data as loaded; later cells filter dfRaw and
# rebuild df from it.
dfRaw = df



AnalysisException: 'Path does not exist: file:/home/student/ROI/Spark/datasets/finance/boston.csv;'

In [0]:
from pyspark.ml.feature import StringIndexer

# Demonstrate StringIndexer on one categorical column: fit it on df, then
# list each distinct value next to the index it was assigned.
col = 'TOWN'
index_col = col + '_Index'

indexer = StringIndexer(inputCol=col, outputCol=index_col)
indexer_model = indexer.fit(df)
x1 = indexer_model.transform(df).select(col, index_col).distinct()

# Show the mapping twice: ordered by the original value, then by the index.
for sort_key in (col, index_col):
    display(x1.orderBy(sort_key))



In [0]:
# Index several categorical columns in one call through the course helper.
# NOTE(review): the helper is not visible here; the following cell reads a
# 'TOWN_Index' column from x2, so it presumably adds '<name>_Index' columns.
columns_to_index = ['TOWN', 'TRACT']
x2 = pyh.StringIndexEncode(df, columns_to_index)
display(x2)


In [0]:
col = 'TOWN'
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder (same
    # inputCols/outputCols interface), so fall back to the new name.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# One-hot encode the indexed column by hand to show what the estimator does.
encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])
# Fit and transform the same indexed frame, x2. Transforming `x` here would
# fail on a fresh kernel because `x` is only assigned further down this cell.
display(encoder.fit(x2).transform(x2).orderBy(col + '_Index'))

# The same encoding for several columns at once via the course helper.
x = pyh.OneHotEncode(x2, ['TOWN', 'TRACT'])
display (x)




In [0]:
%matplotlib inline
import pandas as pd
import seaborn as sns
#sns.distplot(df.toPandas()['MEDV'])

sns.distplot(df.where('MEDV < 48').toPandas()['MEDV'])
print(df.columns)

# If we want to filter out the outliers
dfRaw = dfRaw.where('MEDV < 48')

In [0]:
# Choose the feature columns and the target column for whichever dataset was
# loaded, and rebuild df so it holds only those columns.
if filename == 'avocado.csv':
    # Give the purely numeric column names a 'PLU' prefix.
    df = dfRaw
    for plu_code in ('4046', '4225', '4770'):
        df = df.withColumnRenamed(plu_code, 'PLU' + plu_code)
    df.createOrReplaceTempView('dfRaw')
    df.printSchema()

    # Rename the columns that contain spaces and expose the price as 'target'.
    sql = '''select AveragePrice as target, `Total Volume` as totalvolume
    , PLU4046, PLU4225, PLU4770
    , `Small Bags` as smallbags, `Large Bags` as largebags, `XLarge Bags` as xlargebags
    , type, year, region
    FROM dfRaw'''

    df = spark.sql(sql)
    print(df)

    target_label = 'target'
    categorical_features = ['type', 'year','region']
    numeric_features = ['totalvolume','PLU4046', 'PLU4225', 'PLU4770', 'smallbags', 'largebags', 'xlargebags']
    print(df.take(1))
else:
    # NOTE(review): the model cells further down pass labelCol='target';
    # confirm that pyh.MakeMLDataFrame renames this 'MEDV' column accordingly.
    target_label = 'MEDV'
    categorical_features = [] #['TOWN', 'TRACT']
    numeric_features = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO']
    selected_columns = categorical_features + numeric_features + [target_label]
    df = dfRaw.select(selected_columns)
    df.printSchema()

# Summary statistics of the selected columns, followed by the data itself.
print ('******')
summary_stats = df.describe()
display(summary_stats)

print ('******')
display(df)

### Turn the dataframe into vectors.



In [0]:
# To pick up edits to the helper module without restarting the kernel:
#   import imp
#   imp.reload(pyh)
#
# Step-by-step variant, useful for inspecting each intermediate frame:
#   df10 = pyh.StringIndexEncode(df, categorical_features)
#   display(df10)
#   df11 = pyh.OneHotEncode(df10, categorical_features)
#   display(df11)
#   df12 = pyh.AssembleFeatures(df11, categorical_features, numeric_features, 'target', False)
#   display(df12)

# Build the ML-ready DataFrame in a single helper call.
# NOTE(review): the meaning of the trailing False is not visible here;
# presumably it marks the target as non-categorical - TODO confirm.
dfML = pyh.MakeMLDataFrame(
    df,
    categorical_features,
    numeric_features,
    target_label,
    False,
)
display(dfML)
dfML.printSchema()


### Split the dataset into train and test.

In [0]:
# 70/30 train/test split; the fixed seed keeps the split repeatable.
split_weights = [.7, .3]
train, test = dfML.randomSplit(split_weights, seed=1000)

train_rows = train.count()
print(f'Training set row count {train_rows}')
test_rows = test.count()
print(f'Testing set row count {test_rows}')



### Run Linear Regression.

In [0]:
from pyspark.ml.regression import LinearRegression

# Fit a regularised linear regression on the training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='target',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lrModel = lr.fit(train)

print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Metrics from the fitted model's own summary.
lrSummary = lrModel.summary
print("Root Mean Squared Error: {}\nR Squared (R2) {}".format(lrSummary.rootMeanSquaredError, lrSummary.r2))


### Evaluate the linear regression model on the test data.


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out split and show a sample of predictions next to the
# actual target values.
lrPredictions = lrModel.transform(test)
display(lrPredictions.select("prediction","target","features"), 30)

# RMSE on the test split, taken from the model's evaluation summary.
testResult = lrModel.evaluate(test)
print("Root Mean Squared Error on Test set: {}".format(testResult.rootMeanSquaredError))

# R2 on the test split. The evaluator is actually applied to the predictions
# here, so the test-set R2 is reported alongside the RMSE.
lrEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="target",metricName="r2")
print("R Squared (R2) on Test set: {}".format(lrEvaluator.evaluate(lrPredictions)))

### Try Decision Tree Regression.

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a decision tree regressor on the training split and score the test split.
dt = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='target',
)
dtModel = dt.fit(train)
dtPredictions = dtModel.transform(test)

# Inspect the feature importances: the whole vector, then the entry at index 3.
important = dtModel.featureImportances
print(type(important), important)
print(important[3])

# RMSE of the tree's predictions on the test split.
dtEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="target",
    metricName="rmse",
)
testResult = dtEvaluator.evaluate(dtPredictions)
print("Root Mean Squared Error: {}".format(testResult))

# Show one row of the ML DataFrame as the cell's output.
dfML.take(1)

### Try Gradient-Boosted Tree Regression.

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

# Fit a gradient-boosted tree regressor on the training split and score the
# test split.
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='target',
    maxIter=10,
)
gbtModel = gbt.fit(train)
gbtPredictions = gbtModel.transform(test)

# Show a sample of predictions next to the actual target values.
gbtSample = gbtPredictions.select('prediction', 'target', 'features')
display(gbtSample, 20)

# RMSE of the boosted model's predictions on the test split.
gbtEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="target",
    metricName="rmse",
)
testResult = gbtEvaluator.evaluate(gbtPredictions)
print("Root Mean Squared Error: {}".format(testResult))
