In [1]:
%sh curl --fail --location -O 'https://raw.githubusercontent.com/bsullins/bensullins.com-freebies/master/CogsleyServices-SalesData-US.csv'
# saves file to file:/databricks/driver/CogsleyServices-SalesData-US.csv
# --fail: exit non-zero on an HTTP error (e.g. 404) instead of writing the
#         server's error page into the .csv that the next cell loads.
# --location: follow redirects in case the file is ever moved.

In [2]:
# CSV written to the driver's local disk by the download cell.
# Alternative sample dataset:
#   "/databricks-datasets/samples/population-vs-price/data_geo.csv"
path = 'file:/databricks/driver/CogsleyServices-SalesData-US.csv'

# Read the file through the Spark CSV datasource:
#   header="true"      -> the first line of the file holds the column names
#   inferSchema="true" -> let Spark work out each column's type from the data
data = (
    sqlContext.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load(path)
)

# Mark the loaded frame as cached for faster reuse, then keep only the rows
# that have no missing values.
data = data.cache().dropna()

# To make the frame queryable through SQL on Apache Spark >= 2.0,
# register it as a temporary view:
# data.createOrReplaceTempView("data_geo")

display(data)

RowID,OrderID,OrderDate,OrderMonthYear,Quantity,Quote,DiscountPct,Rate,SaleAmount,CustomerName,CompanyName,Sector,Industry,City,ZipCode,State,Region,ProjectCompleteDate,DaystoComplete,ProductKey,ProductCategory,ProductSubCategory,Consultant,Manager,HourlyWage,RowCount,WageMargin
1914,13729,2009-01-01T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,9,1800,0.08,200,1640.96,Matt Bertelsons,The Priceline Group Inc.,Miscellaneous,Business Services,Bowie,20715,Maryland,East,2009-01-03T00:00:00.000+0000,2,Development - Big Data,Development,Python,Noah Smith,Allen Young,59,1,0.71
4031,28774,2009-01-01T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,32,6400,0.1,200,5707.67,Jessica Thornton,Garmin Ltd.,Capital Goods,Industrial Machinery/Components,McKeesport,15131,Pennsylvania,East,2009-01-02T00:00:00.000+0000,1,Development - Big Data,Development,Market Research,Daniel Tusk,Allen Young,45,1,0.78
1279,9285,2009-01-02T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,3,480,0.06,160,447.11,David O'Rourke,Wynn Resorts Limited,Consumer Services,Hotels/Resorts,Prior Lake,55372,Minnesota,Central,2009-01-04T00:00:00.000+0000,2,Development - Java,Development,Python,Mason Gibson,Josh Martinez,71,1,0.56
5272,37537,2009-01-02T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,4,500,0.0,125,495.47,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-02T00:00:00.000+0000,0,Training - Development,Training,Java,William Bufont,Bob Turner,62,1,0.5
5273,37537,2009-01-02T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,43,5375,0.07,125,4953.46,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-04T00:00:00.000+0000,2,Training - Development,Training,Strategy,Liam Franklin,Bob Turner,52,1,0.58
5274,37537,2009-01-02T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,32,6400,0.05,200,6024.92,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-09T00:00:00.000+0000,7,Development - Big Data,Development,.Net,Emma Watson,Bob Turner,67,1,0.67
6224,44069,2009-01-02T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,16,1760,0.09,110,1587.09,Elizabeth Hansen,Fastenal Company,Consumer Services,RETAIL: Building Materials,Montebello,90640,California,West,2009-01-04T00:00:00.000+0000,2,Development - Python,Development,Business Model,Sophia Dixon,Bob Turner,71,1,0.35
6225,44069,2009-01-02T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,43,4730,0.08,110,4312.18,Elizabeth Hansen,Fastenal Company,Consumer Services,RETAIL: Building Materials,Montebello,90640,California,West,2009-01-02T00:00:00.000+0000,0,Development - Python,Development,SQL,Mia Moore,Bob Turner,51,1,0.54
1074,7909,2009-01-03T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,29,3480,0.03,120,3345.1,Alex Grayson,C.H. Robinson Worldwide Inc.,Transportation,Oil Refining/Marketing,Lake Oswego,97035,Oregon,West,2009-01-04T00:00:00.000+0000,1,Development - Business Logic,Development,Market Research,Abigail Young,Bob Turner,50,1,0.58
1315,9637,2009-01-03T00:00:00.000+0000,2009-01-01T00:00:00.000+0000,12,1800,0.08,150,1641.04,Andy Willingham,DIRECTV,Consumer Services,Telecommunications Equipment,Baton Rouge,70802,Louisiana,South,2009-01-05T00:00:00.000+0000,2,Consulting - Business Model,Consulting,Java,Madison Hill,Frank Mitchell,58,1,0.61


In [3]:
# Get monthly sales totals: one row per OrderMonthYear, ordered by month.
# Selecting the two columns first means the argument-less sum() has only
# SaleAmount to aggregate; toDF() then restores the plain column names.
summary = data.select("OrderMonthYear", "SaleAmount").groupBy("OrderMonthYear").sum().orderBy("OrderMonthYear").toDF("OrderMonthYear","SaleAmount")

# Convert OrderMonthYear to an integer of the form YYYYMMDD (e.g. 20090101).
# With inferSchema the column may arrive as a timestamp (datetime) rather than
# a 'YYYY-MM-DD' string, and datetime.replace('-', '') raises TypeError.
# Going through str() and keeping the first 10 characters ('YYYY-MM-DD')
# yields the same integer for string, date and datetime values alike.
results = summary.rdd.map(lambda r: (int(str(r.OrderMonthYear)[:10].replace('-','')), r.SaleAmount)).toDF(["OrderMonthYear","SaleAmount"])


In [4]:
# convenience for specifying schema
from pyspark.mllib.regression import LabeledPoint

# Turn each (OrderMonthYear, SaleAmount) row into a LabeledPoint whose label
# is SaleAmount (r[1]) and whose single feature is OrderMonthYear (r[0]).
# The map goes through .rdd because DataFrame itself has no map() method on
# Spark 2.0+; this also matches how `results` is built in the previous cell.
# NOTE(review): this rebinds `data`, shadowing the raw sales frame loaded
# earlier, so the monthly-summary cell cannot be re-run after this one without
# reloading. The name is kept because later cells may depend on it.
data = results.select("OrderMonthYear", "SaleAmount")\
  .rdd.map(lambda r: LabeledPoint(r[1], [r[0]]))\
  .toDF()

display(data)