In [0]:
%sh curl -O 'https://raw.githubusercontent.com/bsullins/bensullins.com-freebies/master/CogsleyServices-SalesData-US.csv'
# saves file to file:/databricks/driver/CogsleyServices-SalesData-US.csv

In [0]:
path = 'file:/databricks/driver/CogsleyServices-SalesData-US.csv'
# path = "/databricks-datasets/samples/population-vs-price/data_geo.csv"

# Use the Spark CSV datasource with options specifying:
# - First line of file is a header
# - Automatically infer the schema of the data
data = sqlContext.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load(path)
 
data.cache() # Cache data for faster reuse
data = data.dropna() # drop rows with missing values
 
# Register table so it is accessible via SQL Context
# For Apache Spark = 2.0
# data.createOrReplaceTempView("data_geo")

display(data)

RowID,OrderID,OrderDate,OrderMonthYear,Quantity,Quote,DiscountPct,Rate,SaleAmount,CustomerName,CompanyName,Sector,Industry,City,ZipCode,State,Region,ProjectCompleteDate,DaystoComplete,ProductKey,ProductCategory,ProductSubCategory,Consultant,Manager,HourlyWage,RowCount,WageMargin
1914,13729,2009-01-01,2009-01-01,9,1800,0.08,200,1640.96,Matt Bertelsons,The Priceline Group Inc.,Miscellaneous,Business Services,Bowie,20715,Maryland,East,2009-01-03,2,Development - Big Data,Development,Python,Noah Smith,Allen Young,59,1,0.71
4031,28774,2009-01-01,2009-01-01,32,6400,0.1,200,5707.67,Jessica Thornton,Garmin Ltd.,Capital Goods,Industrial Machinery/Components,McKeesport,15131,Pennsylvania,East,2009-01-02,1,Development - Big Data,Development,Market Research,Daniel Tusk,Allen Young,45,1,0.78
1279,9285,2009-01-02,2009-01-01,3,480,0.06,160,447.11,David O'Rourke,Wynn Resorts Limited,Consumer Services,Hotels/Resorts,Prior Lake,55372,Minnesota,Central,2009-01-04,2,Development - Java,Development,Python,Mason Gibson,Josh Martinez,71,1,0.56
5272,37537,2009-01-02,2009-01-01,4,500,0.0,125,495.47,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-02,0,Training - Development,Training,Java,William Bufont,Bob Turner,62,1,0.5
5273,37537,2009-01-02,2009-01-01,43,5375,0.07,125,4953.46,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-04,2,Training - Development,Training,Strategy,Liam Franklin,Bob Turner,52,1,0.58
5274,37537,2009-01-02,2009-01-01,32,6400,0.05,200,6024.92,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-09,7,Development - Big Data,Development,.Net,Emma Watson,Bob Turner,67,1,0.67
6224,44069,2009-01-02,2009-01-01,16,1760,0.09,110,1587.09,Elizabeth Hansen,Fastenal Company,Consumer Services,RETAIL: Building Materials,Montebello,90640,California,West,2009-01-04,2,Development - Python,Development,Business Model,Sophia Dixon,Bob Turner,71,1,0.35
6225,44069,2009-01-02,2009-01-01,43,4730,0.08,110,4312.18,Elizabeth Hansen,Fastenal Company,Consumer Services,RETAIL: Building Materials,Montebello,90640,California,West,2009-01-02,0,Development - Python,Development,SQL,Mia Moore,Bob Turner,51,1,0.54
1074,7909,2009-01-03,2009-01-01,29,3480,0.03,120,3345.1,Alex Grayson,C.H. Robinson Worldwide Inc.,Transportation,Oil Refining/Marketing,Lake Oswego,97035,Oregon,West,2009-01-04,1,Development - Business Logic,Development,Market Research,Abigail Young,Bob Turner,50,1,0.58
1315,9637,2009-01-03,2009-01-01,12,1800,0.08,150,1641.04,Andy Willingham,DIRECTV,Consumer Services,Telecommunications Equipment,Baton Rouge,70802,Louisiana,South,2009-01-05,2,Consulting - Business Model,Consulting,Java,Madison Hill,Frank Mitchell,58,1,0.61


In [0]:
# Get monthly sales totals
summary = data.select("OrderMonthYear", "SaleAmount").groupBy("OrderMonthYear").sum().orderBy("OrderMonthYear").toDF("OrderMonthYear","SaleAmount")

# Convert OrderMonthYear to integer type
results = summary.rdd.map(lambda r: (int(r.OrderMonthYear.replace('-','')), r.SaleAmount)).toDF(["OrderMonthYear","SaleAmount"])


In [0]:
display(results)

OrderMonthYear,SaleAmount
20090101,734559.3599999996
20090201,539887.7999999998
20090301,559449.9600000002
20090401,614983.31
20090501,637481.3899999998
20090601,555516.2199999999
20090701,670807.13
20090801,660353.6399999995
20090901,648992.75
20091001,570492.5099999999


In [0]:
# Convert DataFrame to Features and Labels

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols = ['OrderMonthYear'], outputCol = 'features')
df = vectorAssembler.transform(results)
df = df.select(['features', 'SaleAmount'])

df.show(2)

In [0]:
# Import LinearRegression class
from pyspark.ml.regression import LinearRegression

# # Define LinearRegression algorithm
lr = LinearRegression(featuresCol='features', labelCol='SaleAmount')
 
# # Fit 2 models, using different regularization parameters
modelA = lr.fit(df, {lr.regParam:0.0})
modelB = lr.fit(df, {lr.regParam:100.0})

# # Make predictions
predictionsA = modelA.transform(df)
predictionsB = modelB.transform(df)

In [0]:
display(predictionsA)

features,SaleAmount,prediction
"Map(vectorType -> dense, length -> 1, values -> List(2.0090101E7))",734559.3599999996,604138.9286303297
"Map(vectorType -> dense, length -> 1, values -> List(2.0090201E7))",539887.7999999998,604174.6104237037
"Map(vectorType -> dense, length -> 1, values -> List(2.0090301E7))",559449.9600000002,604210.2922170786
"Map(vectorType -> dense, length -> 1, values -> List(2.0090401E7))",614983.31,604245.9740104526
"Map(vectorType -> dense, length -> 1, values -> List(2.0090501E7))",637481.3899999998,604281.6558038266
"Map(vectorType -> dense, length -> 1, values -> List(2.0090601E7))",555516.2199999999,604317.3375972006
"Map(vectorType -> dense, length -> 1, values -> List(2.0090701E7))",670807.13,604353.0193905747
"Map(vectorType -> dense, length -> 1, values -> List(2.0090801E7))",660353.6399999995,604388.7011839487
"Map(vectorType -> dense, length -> 1, values -> List(2.0090901E7))",648992.75,604424.3829773227
"Map(vectorType -> dense, length -> 1, values -> List(2.0091001E7))",570492.5099999999,604460.0647706967


In [0]:
display(predictionsB)

features,SaleAmount,prediction
"Map(vectorType -> dense, length -> 1, values -> List(2.0090101E7))",734559.3599999996,604146.8266577646
"Map(vectorType -> dense, length -> 1, values -> List(2.0090201E7))",539887.7999999998,604182.4576599654
"Map(vectorType -> dense, length -> 1, values -> List(2.0090301E7))",559449.9600000002,604218.0886621661
"Map(vectorType -> dense, length -> 1, values -> List(2.0090401E7))",614983.31,604253.7196643669
"Map(vectorType -> dense, length -> 1, values -> List(2.0090501E7))",637481.3899999998,604289.3506665677
"Map(vectorType -> dense, length -> 1, values -> List(2.0090601E7))",555516.2199999999,604324.9816687685
"Map(vectorType -> dense, length -> 1, values -> List(2.0090701E7))",670807.13,604360.6126709701
"Map(vectorType -> dense, length -> 1, values -> List(2.0090801E7))",660353.6399999995,604396.2436731709
"Map(vectorType -> dense, length -> 1, values -> List(2.0090901E7))",648992.75,604431.8746753717
"Map(vectorType -> dense, length -> 1, values -> List(2.0091001E7))",570492.5099999999,604467.5056775725


In [0]:
# Check Models for Accuracy

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse', labelCol="SaleAmount")

RMSE = evaluator.evaluate(predictionsA)
print('ModelA: Root Mean Squared Error = {}'.format(RMSE))

RMSE = evaluator.evaluate(predictionsB)
print('ModelB: Root Mean Squared Error = {}'.format(RMSE))

In [0]:
# Create Tables with Predictions

# define column names
cols = ['OrderMonthYear', 'SaleAmount', 'Prediction']

# use parallelize to create RDD
# use map() with lambda to parse features
tableA = sc.parallelize(\
                       predictionsA.rdd.map(lambda r: (float(r.features[0]), r.SaleAmount, r.prediction)).collect()\
                       ).toDF(cols)

                                                
tableB = sc.parallelize(\
                       predictionsB.rdd.map(lambda r: (float(r.features[0]), r.SaleAmount, r.prediction)).collect()\
                       ).toDF(cols)
                        
# save results as Tables
tableA.write.saveAsTable('predictionsA', mode='overwrite')
print('predictionsA table is created!')

tableB.write.saveAsTable('predictionsB', mode='overwrite')
print('predictionsB table is created!')

In [0]:
%sql
SELECT a.OrderMonthYear, a.SaleAmount, a.Prediction AS model_A, b.Prediction AS model_B
FROM predictionsa a
JOIN predictionsb b ON a.OrderMonthYear = b.OrderMonthYear;

OrderMonthYear,SaleAmount,model_A,model_B
20090701.0,670807.13,604353.0193905747,604360.6126709701
20090801.0,660353.6399999995,604388.7011839487,604396.2436731709
20090901.0,648992.75,604424.3829773227,604431.8746753717
20091001.0,570492.5099999999,604460.0647706967,604467.5056775725
20091101.0,566171.2399999998,604495.7465640707,604503.1366797732
20091201.0,560232.9200000002,604531.4283574447,604538.767681974
20110101.0,495309.26000000007,611275.2873051474,611273.0270979404
20110201.0,580321.9899999998,611310.9690985214,611308.6581001412
20110301.0,568968.4600000002,611346.6508918954,611344.289102342
20110401.0,634585.7599999999,611382.3326852694,611379.9201045427
