
## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/tips.csv"
file_type = "csv"

# Read CSV
df = spark.read.csv(file_location, header=True, inferSchema=True)

display(df)

total_bill,tip,sex,smoker,day,time,size
16.99,1.01,Female,No,Sun,Dinner,2
10.34,1.66,Male,No,Sun,Dinner,3
21.01,3.5,Male,No,Sun,Dinner,3
23.68,3.31,Male,No,Sun,Dinner,2
24.59,3.61,Female,No,Sun,Dinner,4
25.29,4.71,Male,No,Sun,Dinner,4
8.77,2.0,Male,No,Sun,Dinner,2
26.88,3.12,Male,No,Sun,Dinner,4
15.04,1.96,Male,No,Sun,Dinner,2
14.78,3.23,Male,No,Sun,Dinner,2


In [0]:
# Checking Schema
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [0]:
# Handling categorical features
from pyspark.ml.feature import StringIndexer
inputCols = ['sex', 'smoker', 'day', 'time']
outputCols = ['{}_indexed'.format(c) for c in inputCols]
indexer = StringIndexer(inputCols=inputCols, outputCols=outputCols)

df_indexed = indexer.fit(df).transform(df)

display(df_indexed)

total_bill,tip,sex,smoker,day,time,size,sex_indexed,smoker_indexed,day_indexed,time_indexed
16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0
10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
21.01,3.5,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0
25.29,4.71,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
8.77,2.0,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
26.88,3.12,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
15.04,1.96,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
14.78,3.23,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0


In [0]:
from pyspark.ml.feature import VectorAssembler

inputCols = ['tip', 'sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed', 'size']
feature_assembler = VectorAssembler(inputCols=inputCols, outputCol='features')

df_indexed = feature_assembler.transform(df_indexed)

display(df_indexed)

total_bill,tip,sex,smoker,day,time,size,sex_indexed,smoker_indexed,day_indexed,time_indexed,features
16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(1.01, 1.0, 0.0, 1.0, 0.0, 2.0))"
10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(1.66, 0.0, 0.0, 1.0, 0.0, 3.0))"
21.01,3.5,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(3.5, 0.0, 0.0, 1.0, 0.0, 3.0))"
23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(3.31, 0.0, 0.0, 1.0, 0.0, 2.0))"
24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(3.61, 1.0, 0.0, 1.0, 0.0, 4.0))"
25.29,4.71,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(4.71, 0.0, 0.0, 1.0, 0.0, 4.0))"
8.77,2.0,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(2.0, 0.0, 0.0, 1.0, 0.0, 2.0))"
26.88,3.12,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(3.12, 0.0, 0.0, 1.0, 0.0, 4.0))"
15.04,1.96,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(1.96, 0.0, 0.0, 1.0, 0.0, 2.0))"
14.78,3.23,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(3.23, 0.0, 0.0, 1.0, 0.0, 2.0))"


In [0]:
df_indexed = df_indexed.select(['features', 'total_bill'])
display(df_indexed)

features,total_bill
"Map(vectorType -> dense, length -> 6, values -> List(1.01, 1.0, 0.0, 1.0, 0.0, 2.0))",16.99
"Map(vectorType -> dense, length -> 6, values -> List(1.66, 0.0, 0.0, 1.0, 0.0, 3.0))",10.34
"Map(vectorType -> dense, length -> 6, values -> List(3.5, 0.0, 0.0, 1.0, 0.0, 3.0))",21.01
"Map(vectorType -> dense, length -> 6, values -> List(3.31, 0.0, 0.0, 1.0, 0.0, 2.0))",23.68
"Map(vectorType -> dense, length -> 6, values -> List(3.61, 1.0, 0.0, 1.0, 0.0, 4.0))",24.59
"Map(vectorType -> dense, length -> 6, values -> List(4.71, 0.0, 0.0, 1.0, 0.0, 4.0))",25.29
"Map(vectorType -> dense, length -> 6, values -> List(2.0, 0.0, 0.0, 1.0, 0.0, 2.0))",8.77
"Map(vectorType -> dense, length -> 6, values -> List(3.12, 0.0, 0.0, 1.0, 0.0, 4.0))",26.88
"Map(vectorType -> dense, length -> 6, values -> List(1.96, 0.0, 0.0, 1.0, 0.0, 2.0))",15.04
"Map(vectorType -> dense, length -> 6, values -> List(3.23, 0.0, 0.0, 1.0, 0.0, 2.0))",14.78


In [0]:
from pyspark.ml.regression import LinearRegression
# train/test split
train_data, test_data = df_indexed.randomSplit([0.75, 0.25])
regressor = LinearRegression(featuresCol='features', labelCol='total_bill')
regressor = regressor.fit(df_indexed)

In [0]:
# Check coefficients
print(regressor.coefficients)
# Check intercept
print(regressor.intercept)

[3.1040236261853535,-1.0320642528968365,2.3573303460833297,-0.25002164959082745,-0.9833660413554288,3.435299406711025]
1.6585163529540743


In [0]:
# Predict
predictions = regressor.evaluate(test_data)

In [0]:
predictions.predictions.show()

+--------------------+----------+------------------+
|            features|total_bill|        prediction|
+--------------------+----------+------------------+
| (6,[0,5],[2.0,2.0])|     13.37| 14.73716241874683|
|(6,[0,5],[2.01,2.0])|     20.23|14.768202655008682|
|(6,[0,5],[2.34,4.0])|     17.81|22.663129265071902|
| (6,[0,5],[2.5,4.0])|     18.35| 23.15977304526156|
| (6,[0,5],[3.0,4.0])|     20.45|24.711784858354235|
|(6,[0,5],[3.18,2.0])|     19.82|18.399910297645548|
|(6,[0,5],[3.76,2.0])|     18.24|20.200244000833052|
|(6,[0,5],[4.08,2.0])|     17.92|21.193531561212367|
| (6,[0,5],[5.0,3.0])|     31.27|27.484532704013915|
|[1.25,1.0,0.0,2.0...|      8.51| 9.893671105673896|
|[1.32,0.0,0.0,1.0...|      9.68|12.376404703349962|
|[1.5,0.0,1.0,3.0,...|     12.03|14.792416002965002|
|[1.56,0.0,0.0,1.0...|      9.94|13.121370373634448|
|[1.68,1.0,0.0,2.0...|     13.42|11.228401264933597|
|[1.73,0.0,0.0,2.0...|      9.78|12.415666699139702|
|[1.98,0.0,1.0,0.0...|     11.02|17.0324122923

In [0]:
# Performance Metrics
predictions.r2, predictions.meanAbsoluteError, predictions.meanSquaredError

(0.41848207919802727, 4.198161596277415, 30.71130423889035)

In [0]:
# Saving Model
filename = '/FileStore/tables/regressor.model'
regressor.save(filename)

In [0]:
# Loading Model
from pyspark.ml.regression import LinearRegressionModel

loaded_model = LinearRegressionModel.load(filename)

In [0]:
predictions_of_loaded_model = loaded_model.evaluate(test_data)

In [0]:
predictions_of_loaded_model.predictions.show()

+--------------------+----------+------------------+
|            features|total_bill|        prediction|
+--------------------+----------+------------------+
| (6,[0,5],[2.0,2.0])|     13.37| 14.73716241874683|
|(6,[0,5],[2.01,2.0])|     20.23|14.768202655008682|
|(6,[0,5],[2.34,4.0])|     17.81|22.663129265071902|
| (6,[0,5],[2.5,4.0])|     18.35| 23.15977304526156|
| (6,[0,5],[3.0,4.0])|     20.45|24.711784858354235|
|(6,[0,5],[3.18,2.0])|     19.82|18.399910297645548|
|(6,[0,5],[3.76,2.0])|     18.24|20.200244000833052|
|(6,[0,5],[4.08,2.0])|     17.92|21.193531561212367|
| (6,[0,5],[5.0,3.0])|     31.27|27.484532704013915|
|[1.25,1.0,0.0,2.0...|      8.51| 9.893671105673896|
|[1.32,0.0,0.0,1.0...|      9.68|12.376404703349962|
|[1.5,0.0,1.0,3.0,...|     12.03|14.792416002965002|
|[1.56,0.0,0.0,1.0...|      9.94|13.121370373634448|
|[1.68,1.0,0.0,2.0...|     13.42|11.228401264933597|
|[1.73,0.0,0.0,2.0...|      9.78|12.415666699139702|
|[1.98,0.0,1.0,0.0...|     11.02|17.0324122923