In [0]:
from pyspark.sql import SparkSession
import pandas as pd

# File location and type
file_location = "/FileStore/tables/Advertising.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df) #displaying the dataframe and its associated data
df.describe() #describing that data types of the columns

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
59.99,23,59761.56,226.74,Sharable client-driven software,Jamieberg,1,Norway,2016-05-19 14:30:17,0
88.91,33,53852.85,208.36,Enhanced dedicated support,Brandonstad,0,Myanmar,2016-01-28 20:59:32,0
66.0,48,24593.33,131.76,Reactive local challenge,Port Jefferybury,1,Australia,2016-03-07 01:40:15,1
74.53,30,68862.0,221.51,Configurable coherent function,West Colin,1,Grenada,2016-04-18 09:33:42,0
69.88,20,55642.32,183.82,Mandatory homogeneous architecture,Ramirezton,1,Ghana,2016-07-11 01:42:51,0


In [0]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

columns_to_drop = ['Country','Ad Topic Line','Timestamp','City'] #Dropping uneccessary columns.
df = df.drop(*columns_to_drop)

In [0]:
df.head()
display(df) #Checking new dataframe without the dropped columns

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
68.95,35,61833.9,256.09,0,0
80.23,31,68441.85,193.77,1,0
69.47,26,59785.94,236.5,0,0
74.15,29,54806.18,245.89,1,0
68.37,35,73889.99,225.58,0,0
59.99,23,59761.56,226.74,1,0
88.91,33,53852.85,208.36,0,0
66.0,48,24593.33,131.76,1,1
74.53,30,68862.0,221.51,1,0
69.88,20,55642.32,183.82,1,0


In [0]:
df = df.select('Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Clicked on Ad')
cols = df.columns
df.printSchema() #A schema is the description of the structure of the data

In [0]:
pd.DataFrame(df.take(5), columns=df.columns).transpose() #The transpose of a Dataframe is a new DataFrame whose rows are the columns of the original DataFrame

Unnamed: 0,0,1,2,3,4
Daily Time Spent on Site,68.95,80.23,69.47,74.15,68.37
Age,35.0,31.0,26.0,29.0,35.0
Area Income,61833.9,68441.85,59785.94,54806.18,73889.99
Daily Internet Usage,256.09,193.77,236.5,245.89,225.58
Clicked on Ad,0.0,0.0,0.0,0.0,0.0


In [0]:
#Summary statistics for numeric variables
numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
df.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Age,1000,36.009,8.785562310125924,19,61
Clicked on Ad,1000,0.5,0.5002501876563867,0,1


In [0]:
display(df.select("Age", "Clicked on Ad"))

Age,Clicked on Ad
35,0
31,0
26,0
29,0
35,0
23,0
33,0
48,1
30,0
20,0


In [0]:
# Split our dataset between training and test datasets
(train, test) = df.randomSplit([0.8, 0.2], seed=12345)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

va = VectorAssembler(inputCols = ["Age", "Daily Time Spent on Site", "Area Income"], outputCol = "features") #assembling the features
dt = DecisionTreeClassifier(labelCol = "Clicked on Ad", featuresCol = "features", seed = 54321, maxDepth = 5)
dt.setMaxBins(817)
pipeline = Pipeline(stages=[va, dt])
dt_model = pipeline.fit(train)


In [0]:
display(dt_model.stages[1])

treeNode
"{""index"":15,""featureType"":""continuous"",""prediction"":null,""threshold"":64.975,""categories"":null,""feature"":1,""overflow"":false}"
"{""index"":7,""featureType"":""continuous"",""prediction"":null,""threshold"":22.5,""categories"":null,""feature"":0,""overflow"":false}"
"{""index"":1,""featureType"":""continuous"",""prediction"":null,""threshold"":54.41,""categories"":null,""feature"":1,""overflow"":false}"
"{""index"":0,""featureType"":null,""prediction"":1.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":3,""featureType"":""continuous"",""prediction"":null,""threshold"":60.87,""categories"":null,""feature"":1,""overflow"":false}"
"{""index"":2,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":5,""featureType"":""continuous"",""prediction"":null,""threshold"":60.974999999999994,""categories"":null,""feature"":1,""overflow"":false}"
"{""index"":4,""featureType"":null,""prediction"":1.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":6,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":13,""featureType"":""continuous"",""prediction"":null,""threshold"":76772.4,""categories"":null,""feature"":2,""overflow"":false}"


In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(test) # this update doesn't support

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))
