<b>Dataset location: </b>https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data

The data is in the format value1, value2... <br />
The leading whitespace for each value needs to be removed

In [1]:
rawData = sqlContext.read.csv('../datasets/adult.csv', 
                              ignoreLeadingWhiteSpace=True)
rawData.take(2)

[Row(_c0='39', _c1='State-gov', _c2='77516', _c3='Bachelors', _c4='13', _c5='Never-married', _c6='Adm-clerical', _c7='Not-in-family', _c8='White', _c9='Male', _c10='2174', _c11='0', _c12='40', _c13='United-States', _c14='<=50K'),
 Row(_c0='50', _c1='Self-emp-not-inc', _c2='83311', _c3='Bachelors', _c4='13', _c5='Married-civ-spouse', _c6='Exec-managerial', _c7='Husband', _c8='White', _c9='Male', _c10='0', _c11='0', _c12='13', _c13='United-States', _c14='<=50K')]

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('Predicting whether a person\'s income is greater than $50K') \
    .getOrCreate()

rawData = spark.read\
            .format('csv')\
            .option('header', 'false')\
            .option('ignoreLeadingWhiteSpace', 'true')\
            .load('../datasets/adult.csv')

#### Specify column headers for data set

In [3]:
dataset = rawData.toDF('Age',
               'WorkClass',
               'FnlWgt',
               'Education',
               'EducationNum',
               'MaritalStatus',
               'Occupation',
               'Relationship',
               'Race',
               'Gender',
               'CapitalGain',
               'CapitalLoss',
               'HoursPerWeek',
               'NativeCountry',
               'Label'
                )

In [5]:
dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Drop FnlWgt column which does not appear meaningful

In [6]:
dataset = dataset.drop('FnlWgt')

#### Examine the dataset
* The FnlWgt column has been dropped
* There are missing values in the data represented by '?' (e.g. line 32541 for column WorkClass)

In [9]:
dataset.toPandas()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


#### Count rows in dataset

In [10]:
dataset.count()

32561

#### Convert missing values to null
Missing values in this dataset are represented by ?

In [11]:
dataset = dataset.replace('?', None)

#### Drop all rows which contain even a single missing value
The value 'any' for parameter how specifies that even a single missing value in a row should result in it being dropped (as opposed to 'all' where all values need to be missing)

In [14]:
dataset = dataset.dropna(how='any')

#### Number of rows has reduced now

In [16]:
dataset.count()

30162

#### Confirm missing value rows are not there
Row 32541 for example

In [15]:
dataset.toPandas()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


#### View the data types for all the columns
Since they have all been loaded as Strings, we need to convert the numeric fields to Float

In [19]:
dataset.describe()

DataFrame[summary: string, Age: string, WorkClass: string, Education: string, EducationNum: string, MaritalStatus: string, Occupation: string, Relationship: string, Race: string, Gender: string, CapitalGain: string, CapitalLoss: string, HoursPerWeek: string, NativeCountry: string, Label: string]

In [18]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col

dataset = dataset.withColumn('Age', 
                             dataset['Age'].cast(FloatType()))
dataset = dataset.withColumn('EducationNum', 
                             dataset['EducationNum'].cast(FloatType()))
dataset = dataset.withColumn('CapitalGain', 
                             dataset['CapitalGain'].cast(FloatType()))
dataset = dataset.withColumn('CapitalLoss', 
                             dataset['CapitalLoss'].cast(FloatType()))
dataset = dataset.withColumn('HoursPerWeek', 
                             dataset['HoursPerWeek'].cast(FloatType()))

dataset.toPandas().head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


#### Transform categorical fields
First use StringIndexer to convert categorical values to indices

In [20]:
from pyspark.ml.feature import StringIndexer

indexedDF = StringIndexer(
    inputCol='WorkClass', outputCol='WorkClass_index').fit(df).transform(df)

#### A new column called WorkClass_index is created
This stores the indexed values of WorkClass

In [21]:
indexedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,4.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0.0


#### OneHotEncoding
Use the new indexed field to obtain a one-hot-encoded field

In [22]:
from pyspark.ml.feature import OneHotEncoder

encodedDF = OneHotEncoder(
    inputCol="WorkClass_index", 
    outputCol="WorkClass_encoded").transform(indexedDF)

#### A WorkClass_encoded field is created 
* This contains the one-hot-encoding for WorkClass
* This cannot operate directly on a column with string values - values need to be numeric. Hence we use the WorkClass_index as input

In [23]:
encodedDF.toPandas().head()

Unnamed: 0,Age,WorkClass,FnlWgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Label,WorkClass_index,WorkClass_encoded
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)"
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


#### View the original and transformed fields together

In [24]:
encodedDF.toPandas()[[
    'WorkClass', 'WorkClass_index', 'WorkClass_encoded']].head()

Unnamed: 0,WorkClass,WorkClass_index,WorkClass_encoded
0,State-gov,4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)"
1,Self-emp-not-inc,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
2,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
3,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
4,Private,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"


### Transform the entire dataset
* So far we have only transformed a single column
* We need to perform this transformation for every categorical and non-numeric column
* This will be simplified by using a Pipeline (a feature of Spark ML)

####  First, split the data into training and test sets

In [25]:
(trainingData, testData) = dataset.randomSplit([0.8,0.2])

#### Encode all the categorical fields in the dataset
We begin by listing all the categorical fields

In [26]:
categoricalFeatures = [
               'WorkClass',
               'Education',
               'MaritalStatus',
               'Occupation',
               'Relationship',
               'Race',
               'Gender',
               'NativeCountry'
]

#### Create an array of StringIndexers to convert the categorical values to indices

In [27]:
indexers = [StringIndexer(
    inputCol=column, 
    outputCol=column + '_index', 
    handleInvalid='keep') for column in categoricalFeatures]

#### Create an array of OneHotEncoders to encode the categorical values

In [28]:
encoders = [OneHotEncoder(
    inputCol=column + '_index', 
    outputCol= column + '_encoded') for column in categoricalFeatures]

#### Index the Label field

In [29]:
labelIndexer = [StringIndexer(
    inputCol='Label', outputCol='Label_index')]

#### Create a pipeline
The pipeline contains the array of StringIndexers and OneHotEncoders

In [30]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=indexers + encoders + labelIndexer)

#### View the result of the transformations performed by this pipeline
This pipeline can transform our dataset into a format which can be used by our model

In [31]:
transformedDF = pipeline.fit(trainingData).transform(trainingData)
transformedDF.toPandas().tail()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,NativeCountry_index,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index
24208,90.0,Private,Some-college,10.0,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0.0,...,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24209,90.0,Private,Some-college,10.0,Separated,Adm-clerical,Own-child,White,Female,0.0,...,4.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24210,90.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,10566.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24211,90.0,Self-emp-not-inc,HS-grad,9.0,Never-married,Exec-managerial,Not-in-family,White,Male,2964.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
24212,90.0,Self-emp-not-inc,Some-college,10.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,...,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


#### Select the required features
At this point the dataset contains a lot of additional columns. We select the features needed by our model

In [32]:
requiredFeatures = [
    'Age',
    'EducationNum',
    'CapitalGain',
    'CapitalLoss',
    'HoursPerWeek',
    'WorkClass_encoded',
    'Education_encoded',
    'MaritalStatus_encoded',
    'Occupation_encoded',
    'Relationship_encoded',
    'Race_encoded',
    'Gender_encoded',
    'NativeCountry_encoded'
]

#### VectorAssembler
VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector
* We had previously written our own function to create such a vector

In [33]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=requiredFeatures, outputCol='features')

In [34]:
transformedDF = assembler.transform(transformedDF)
transformedDF.toPandas().tail()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,WorkClass_encoded,Education_encoded,MaritalStatus_encoded,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features
24208,90.0,Private,Some-college,10.0,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 10.0, 0.0, 0.0, 35.0, 1.0, 0.0, 0.0, 0...."
24209,90.0,Private,Some-college,10.0,Separated,Adm-clerical,Own-child,White,Female,0.0,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 10.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0...."
24210,90.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,10566.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 13.0, 10566.0, 0.0, 50.0, 0.0, 1.0, 0.0..."
24211,90.0,Self-emp-not-inc,HS-grad,9.0,Never-married,Exec-managerial,Not-in-family,White,Male,2964.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 9.0, 2964.0, 0.0, 12.0, 0.0, 1.0, 0.0, ..."
24212,90.0,Self-emp-not-inc,Some-college,10.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(90.0, 10.0, 0.0, 0.0, 40.0, 0.0, 1.0, 0.0, 0...."


In [37]:
transformedDF.select('features').toPandas().tail()

Unnamed: 0,features
24208,"(90.0, 10.0, 0.0, 0.0, 35.0, 1.0, 0.0, 0.0, 0...."
24209,"(90.0, 10.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0...."
24210,"(90.0, 13.0, 10566.0, 0.0, 50.0, 0.0, 1.0, 0.0..."
24211,"(90.0, 9.0, 2964.0, 0.0, 12.0, 0.0, 1.0, 0.0, ..."
24212,"(90.0, 10.0, 0.0, 0.0, 40.0, 0.0, 1.0, 0.0, 0...."


#### Specify our estimator

In [39]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='Label_index', 
                            featuresCol='features',
                            maxDepth=5)

#### Final Pipeline
* The pipeline we built previously only transformed the feature columns
* We re-create the pipeline to include the VectorAssembler and the estimator

The pipeline to be used to build the model contains all the transformers and ends with the estimator

In [40]:
pipeline = Pipeline(
    stages=indexers + encoders + labelIndexer + [assembler, rf]
)

#### Train the model

In [41]:
model = pipeline.fit(trainingData)

#### Use the test data for predictions

In [43]:
predictions = model.transform(testData)
predictionsDF = predictions.toPandas()
predictionsDF.head()

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features,rawPrediction,probability,prediction
0,17.0,Local-gov,10th,6.0,Never-married,Protective-serv,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 1602.0, 40.0, 0.0, 0.0, 1.0, ...","[19.546779064877576, 0.4532209351224264]","[0.9773389532438786, 0.022661046756121316]",0.0
1,17.0,Local-gov,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 7.0, 0.0, 0.0, 15.0, 0.0, 0.0, 1.0, 0.0...","[19.571982942520286, 0.42801705747971697]","[0.9785991471260141, 0.021400852873985843]",0.0
2,17.0,Local-gov,9th,5.0,Never-married,Other-service,Own-child,Black,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 5.0, 0.0, 0.0, 9.0, 0.0, 0.0, 1.0, 0.0,...","[19.6405508548784, 0.3594491451216071]","[0.9820275427439196, 0.01797245725608035]",0.0
3,17.0,Private,10th,6.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 20.0, 1.0, 0.0, 0.0, 0.0...","[19.655545353445284, 0.3444546465547181]","[0.982777267672264, 0.017222732327735902]",0.0
4,17.0,Private,10th,6.0,Never-married,Handlers-cleaners,Other-relative,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",0.0,"(17.0, 6.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 0.0...","[19.255418126540178, 0.7445818734598221]","[0.9627709063270089, 0.037229093672991105]",0.0


#### Select the correct label and predictions to evaluate the model

In [44]:
predictions = predictions.select(
    col('Label_index'),
    col('prediction')
)

#### Create an evaluator for our model

In [45]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol='Label_index', 
    predictionCol='prediction', 
    metricName='accuracy')

#### Check the accuracy

In [47]:
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.8234997478567827


#### Examine incorrect predictions

In [48]:
predictionsDF.loc[
    predictionsDF['Label_index'] != predictionsDF['prediction']
]

Unnamed: 0,Age,WorkClass,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,...,Occupation_encoded,Relationship_encoded,Race_encoded,Gender_encoded,NativeCountry_encoded,Label_index,features,rawPrediction,probability,prediction
432,21.0,Private,Assoc-acdm,12.0,Married-civ-spouse,Adm-clerical,Wife,Amer-Indian-Eskimo,Female,0.0,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(21.0, 12.0, 0.0, 0.0, 46.0, 1.0, 0.0, 0.0, 0....","[15.318326324749767, 4.681673675250232]","[0.7659163162374883, 0.2340836837625116]",0.0
584,22.0,Private,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(22.0, 9.0, 0.0, 0.0, 50.0, 1.0, 0.0, 0.0, 0.0...","[13.475997775472218, 6.524002224527781]","[0.6737998887736109, 0.32620011122638903]",0.0
636,22.0,Private,Some-college,10.0,Never-married,Other-service,Own-child,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(22.0, 10.0, 0.0, 0.0, 32.0, 1.0, 0.0, 0.0, 0....","[19.728409548695897, 0.2715904513041057]","[0.9864204774347947, 0.013579522565205282]",0.0
658,22.0,State-gov,12th,8.0,Never-married,Exec-managerial,Not-in-family,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(22.0, 8.0, 0.0, 0.0, 50.0, 0.0, 0.0, 0.0, 1.0...","[17.731107627271527, 2.268892372728471]","[0.8865553813635765, 0.11344461863642356]",0.0
677,23.0,Local-gov,Some-college,10.0,Never-married,Protective-serv,Own-child,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(23.0, 10.0, 0.0, 0.0, 40.0, 0.0, 0.0, 1.0, 0....","[19.565887730557492, 0.4341122694425129]","[0.9782943865278744, 0.021705613472125643]",0.0
702,23.0,Private,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"(23.0, 13.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0....","[9.187600408105176, 10.81239959189482]","[0.4593800204052589, 0.5406199795947411]",1.0
738,23.0,Private,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Wife,White,Female,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(23.0, 9.0, 0.0, 0.0, 40.0, 1.0, 0.0, 0.0, 0.0...","[15.37435451334589, 4.625645486654108]","[0.7687177256672946, 0.23128227433270537]",0.0
785,23.0,Private,Some-college,10.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(23.0, 10.0, 0.0, 0.0, 60.0, 1.0, 0.0, 0.0, 0....","[13.159427811035085, 6.840572188964915]","[0.6579713905517542, 0.34202860944824576]",0.0
852,24.0,Local-gov,Some-college,10.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 10.0, 0.0, 0.0, 72.0, 0.0, 0.0, 1.0, 0....","[13.15615278801283, 6.8438472119871685]","[0.6578076394006416, 0.3421923605993584]",0.0
871,24.0,Private,Assoc-acdm,12.0,Separated,Craft-repair,Unmarried,Asian-Pac-Islander,Male,8614.0,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,"(24.0, 12.0, 8614.0, 0.0, 40.0, 1.0, 0.0, 0.0,...","[11.89771400180987, 8.102285998190132]","[0.5948857000904935, 0.4051142999095066]",0.0
