In [1]:
# Data Source: https://archive.ics.uci.edu/ml/datasets/Adult
# Read the CSV, taking column names from its header row and letting Spark infer column types.
csv_reader = sqlContext.read.format('csv').options(header=True, inferSchema=True)
df = csv_reader.load('adult.csv')

In [2]:
# Print each column's name, inferred type and nullability
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- maritial-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- category: string (nullable = true)



In [3]:
# Preview the first 20 rows as a text table, truncating long cell values
df.show(n=20, truncate=True)

+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+--------+
|age|       workclass|fnlwgt|   education|education-num|     maritial-status|       occupation| relationship|              race|   sex|capital-gain|capital-loss|hours-per-week|native-country|category|
+---+----------------+------+------------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+--------+
| 39|       State-gov| 77516|   Bachelors|           13|       Never-married|     Adm-clerical|Not-in-family|             White|  Male|        2174|           0|            40| United-States|   <=50K|
| 50|Self-emp-not-inc| 83311|   Bachelors|           13|  Married-civ-spouse|  Exec-managerial|      Husband|             White|  Male|           0|           0|            13| United-States|   <=

In [4]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString
from pyspark.ml import Pipeline

In [5]:
# Combines the numeric input columns into a single feature vector column
assembler = VectorAssembler(inputCols=["age", "education-num"], outputCol="features")

# String indexer converts each distinct "category" string into a double index.
# It is fitted on the full DataFrame here, so `indexer` is already a fitted model
# whose labels can be inspected directly.
indexer = StringIndexer(inputCol="category", outputCol="category-index").fit(df)

# Specify model: a decision tree that predicts the indexed label from the feature vector
dt = DecisionTreeClassifier(labelCol="category-index", featuresCol="features")

# Combine the components defined above into one pipeline. Order matters: the
# assembler and the indexer must run before the classifier, because the classifier
# reads the "features" and "category-index" columns they produce.
pipeline = Pipeline(stages=[assembler, indexer, dt])

In [6]:
# Category strings learned by the fitted indexer; a label's position in this list
# is the numeric index written to "category-index"
indexer.labels

[u'<=50K', u'>50K']

In [7]:
# Total number of rows in the full dataset, before any train/test split
df.count()

32561

In [8]:
# Hold out roughly 30% of the rows for testing. randomSplit samples rows at random,
# so the weights are approximate proportions rather than exact counts. The fixed
# seed makes the split, and every metric computed from it, repeatable across runs.
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Fit the pipeline using only the training rows
model = pipeline.fit(trainingData)

# Apply the fitted pipeline to both splits:
#   training    -> output for the rows the model was fit on (for evaluation purposes)
#   predictions -> output for the held-out test rows
training, predictions = (model.transform(split) for split in (trainingData, testData))

In [10]:
# Bare expression: displays only the DataFrame's column names and types, not its rows
predictions

DataFrame[age: int, workclass: string, fnlwgt: int, education: string, education-num: int, maritial-status: string, occupation: string, relationship: string, race: string, sex: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, category: string, features: vector, category-index: double, rawPrediction: vector, probability: vector, prediction: double]

In [11]:
# Show the feature vector next to the true label, its numeric index and the predicted index
comparison_columns = ['features', 'category', 'category-index', 'prediction']
predictions.select(*comparison_columns).show()

+----------+--------+--------------+----------+
|  features|category|category-index|prediction|
+----------+--------+--------------+----------+
|[17.0,8.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,9.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,7.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,8.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|       0.0|
|[17.0,6.0]|   <=50K|           0.0|    

In [12]:
# Translate the numeric "prediction" column back into the original category strings.
# The labels are read from the fitted `indexer` itself rather than by position in
# model.stages, so the lookup does not depend on how the pipeline stages are ordered.
labels = list(indexer.labels)
inverter = IndexToString(inputCol="prediction", outputCol="prediction-label", labels=labels)

# Reuse the test-set predictions already computed instead of transforming testData again
inverter.transform(predictions).select("prediction-label", "category").take(5)

[Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K'),
 Row(prediction-label=u'<=50K', category=u'<=50K')]

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compares the predicted index with the true label index and reports accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="category-index",
                                              predictionCol="prediction",
                                              metricName="accuracy")

# Error on the rows the model was fit on. This is an optimistic figure and is
# reported under its own label so it cannot be mistaken for the test error.
train_accuracy = evaluator.evaluate(training)
print("Training Error = %g" % (1.0 - train_accuracy))

# Error on the held-out test rows: the number that estimates generalisation
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
