In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=150c9510048e2f70916202bf8527313b66627bc87177efec33f2d52edafa7570
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
!wget https://archive.ics.uci.edu/static/public/2/adult.zip

--2024-04-11 11:04:43--  https://archive.ics.uci.edu/static/public/2/adult.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘adult.zip’

adult.zip               [  <=>               ] 605.70K  1.86MB/s    in 0.3s    

2024-04-11 11:04:44 (1.86 MB/s) - ‘adult.zip’ saved [620237]



In [None]:
!unzip -q adult.zip

## Import Libraries

In [None]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

## Create SparkSession

In [None]:
# Obtain the SparkSession for this notebook under the given application name
# (getOrCreate returns an existing session when there is one).
spark = (
    SparkSession.builder
    .appName('Decision Trees UCI')
    .getOrCreate()
)

## Read The Dataset

In [None]:
# adult.data has no header row, so the columns come back as _c0 .. _c14;
# inferSchema asks Spark to derive each column's type from its values.
# The options are real booleans rather than the strings 'False' / 'True'.
df = spark.read.csv('adult.data', header=False, inferSchema=True)

df.show(5)

+---+-----------------+--------+----------+----+-------------------+------------------+--------------+------+-------+------+----+----+--------------+------+
|_c0|              _c1|     _c2|       _c3| _c4|                _c5|               _c6|           _c7|   _c8|    _c9|  _c10|_c11|_c12|          _c13|  _c14|
+---+-----------------+--------+----------+----+-------------------+------------------+--------------+------+-------+------+----+----+--------------+------+
| 39|        State-gov| 77516.0| Bachelors|13.0|      Never-married|      Adm-clerical| Not-in-family| White|   Male|2174.0| 0.0|40.0| United-States| <=50K|
| 50| Self-emp-not-inc| 83311.0| Bachelors|13.0| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|   0.0| 0.0|13.0| United-States| <=50K|
| 38|          Private|215646.0|   HS-grad| 9.0|           Divorced| Handlers-cleaners| Not-in-family| White|   Male|   0.0| 0.0|40.0| United-States| <=50K|
| 53|          Private|234721.0|      11th| 7.0| Married-c

In [None]:
# Total number of rows loaded from adult.data
n_rows = df.count()
print('Number of data points: ', n_rows)

Number of data points:  32561


In [None]:
# Names for the 15 positional columns, in file order.
# The last one ('income') is what the models below predict.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Replace the generated _c0 .. _c14 names with the ones above
df = df.toDF(*column_names)

df.show(5)

+---+-----------------+--------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|  fnlwgt| education|education-num|     marital-status|        occupation|  relationship|  race|    sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+-----------------+--------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516.0| Bachelors|         13.0|      Never-married|      Adm-clerical| Not-in-family| White|   Male|      2174.0|         0.0|          40.0| United-States| <=50K|
| 50| Self-emp-not-inc| 83311.0| Bachelors|         13.0| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|         0.0|         0.0|          13.0| United-States| <=50K|
| 38|          Private|215646.0|   HS-grad|       

## Convert String Features to Numeric Index Features

In [None]:
# Inspect the inferred column types to see which columns are strings
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [None]:
from pyspark.sql.types import StringType

# Schema fields of the current DataFrame (name + data type per column)
all_cols = df.schema.fields

# Names of the columns whose type is string; these need indexing before modelling
string_cols = [
    field.name
    for field in all_cols
    if field.dataType == StringType()
]

print(string_cols)

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# One StringIndexer stage per string column, each writing to '<column>_index'
indexers = []
for name in string_cols:
    indexers.append(StringIndexer(inputCol=name, outputCol=name + '_index'))

# Chain all indexers into one pipeline and fit it.
# NOTE(review): the indexers are fitted on the full DataFrame, i.e. before the
# train/test split that happens further down.
pipeline = Pipeline(stages=indexers)
pipeline_model = pipeline.fit(df)

In [None]:
# Apply the fitted indexers; this appends one '<column>_index' column per string column
df = pipeline_model.transform(df)

df.show(5)

+---+-----------------+--------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+---------------+---------------+--------------------+----------------+------------------+----------+---------+--------------------+------------+
|age|        workclass|  fnlwgt| education|education-num|     marital-status|        occupation|  relationship|  race|    sex|capital-gain|capital-loss|hours-per-week|native-country|income|workclass_index|education_index|marital-status_index|occupation_index|relationship_index|race_index|sex_index|native-country_index|income_index|
+---+-----------------+--------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+---------------+---------------+--------------------+----------------+------------------+----------+---------+--------------------+------------

In [None]:
# Same order as column_names, but every string column is replaced by the name
# of its '_index' counterpart
columns = [
    name + '_index' if name in string_cols else name
    for name in column_names
]

print(columns)

['age', 'workclass_index', 'fnlwgt', 'education_index', 'education-num', 'marital-status_index', 'occupation_index', 'relationship_index', 'race_index', 'sex_index', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country_index', 'income_index']


In [None]:
# Keep only the columns listed in `columns`; the original string columns are dropped
df = df.select(*columns)

df.show(5)

+---+---------------+--------+---------------+-------------+--------------------+----------------+------------------+----------+---------+------------+------------+--------------+--------------------+------------+
|age|workclass_index|  fnlwgt|education_index|education-num|marital-status_index|occupation_index|relationship_index|race_index|sex_index|capital-gain|capital-loss|hours-per-week|native-country_index|income_index|
+---+---------------+--------+---------------+-------------+--------------------+----------------+------------------+----------+---------+------------+------------+--------------+--------------------+------------+
| 39|            4.0| 77516.0|            2.0|         13.0|                 1.0|             3.0|               1.0|       0.0|      0.0|      2174.0|         0.0|          40.0|                 0.0|         0.0|
| 50|            1.0| 83311.0|            2.0|         13.0|                 0.0|             2.0|               0.0|       0.0|      0.0|      

## Create a single Feature Vector

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the last one ('income_index', the label) is a model input
input_cols = columns[:-1]

# Pack the input columns into a single vector column named 'featureVector'
vector_assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol='featureVector',
)
df = vector_assembler.transform(df)

df.select('featureVector').show(5)

+--------------------+
|       featureVector|
+--------------------+
|[39.0,4.0,77516.0...|
|(14,[0,1,2,3,4,6,...|
|(14,[0,2,4,5,6,7,...|
|(14,[0,2,3,4,6,8,...|
|[28.0,0.0,338409....|
+--------------------+
only showing top 5 rows



## Split to Training and Testing

In [None]:
# 80/20 train/test split. The seed is fixed so the split, and therefore every
# metric reported below, is repeatable across runs instead of changing each time.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

## Decision Trees

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Baseline decision tree: label, feature and prediction columns plus the bin limit
tree_params = dict(
    labelCol='income_index',
    featuresCol='featureVector',
    predictionCol='prediction',
    maxBins=50,
)
classifier = DecisionTreeClassifier(**tree_params)

# Train on the training split only
model = classifier.fit(train_df)

In [None]:
# Print the learned split rules; toDebugString is read as an attribute here, not called.
# "feature N" in the output refers to position N in input_cols.
print(model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b69c48c4a496, depth=5, numNodes=31, numClasses=2, numFeatures=14
  If (feature 5 in {1.0,2.0,3.0,4.0,5.0,6.0})
   If (feature 10 <= 7073.5)
    If (feature 4 <= 12.5)
     If (feature 11 <= 2100.0)
      Predict: 0.0
     Else (feature 11 > 2100.0)
      If (feature 9 in {1.0})
       Predict: 0.0
      Else (feature 9 not in {1.0})
       Predict: 1.0
    Else (feature 4 > 12.5)
     Predict: 0.0
   Else (feature 10 > 7073.5)
    If (feature 0 <= 20.5)
     Predict: 0.0
    Else (feature 0 > 20.5)
     If (feature 6 in {9.0})
      Predict: 0.0
     Else (feature 6 not in {9.0})
      Predict: 1.0
  Else (feature 5 not in {1.0,2.0,3.0,4.0,5.0,6.0})
   If (feature 4 <= 12.5)
    If (feature 10 <= 7073.5)
     If (feature 3 in {5.0,7.0,8.0,10.0,11.0,13.0,14.0,15.0})
      Predict: 0.0
     Else (feature 3 not in {5.0,7.0,8.0,10.0,11.0,13.0,14.0,15.0})
      If (feature 11 <= 1740.5)
       Predict: 0.0
      Else (feature 11 > 

In [None]:
import pandas as pd

# Pair each importance score with the input column at the same vector position,
# then list the features from most to least important
importance_df = pd.DataFrame(
    {'importance': model.featureImportances.toArray()},
    index=input_cols,
)
importance_df.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
marital-status_index,0.493159
education-num,0.225426
capital-gain,0.213097
education_index,0.032795
capital-loss,0.031665
age,0.001484
sex_index,0.001344
occupation_index,0.000588
workclass_index,0.000442
fnlwgt,0.0


In [None]:
# Score the held-out split and peek at label, predicted class and class probabilities
predictions = model.transform(test_df)
(
    predictions
    .select('income_index', 'prediction', 'probability')
    .show(5)
)

+------------+----------+--------------------+
|income_index|prediction|         probability|
+------------+----------+--------------------+
|         0.0|       0.0|[0.97700009019572...|
|         0.0|       0.0|[0.97700009019572...|
|         0.0|       0.0|[0.97700009019572...|
|         0.0|       0.0|[0.97700009019572...|
|         0.0|       0.0|[0.97700009019572...|
+------------+----------+--------------------+
only showing top 5 rows



### Evaluate Outputs

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator is reused for both metrics by switching its metric name
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='income_index',
)

evaluator.setMetricName('accuracy')
acc = evaluator.evaluate(predictions)

evaluator.setMetricName('f1')
f1 = evaluator.evaluate(predictions)

# Report both scores as percentages rounded to two decimals
print('Accuracy: ', round(acc * 100, 2))
print('F1 Score: ', round(f1 * 100, 2))

Accuracy:  84.38
F1 Score:  83.38


### Hyperparameter Tuning

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Fresh, untuned estimator; the grid below supplies the hyperparameters
classifier = DecisionTreeClassifier(
    predictionCol='prediction',
    featuresCol='featureVector',
    labelCol='income_index',
)

# 2 impurities x 4 depths x 3 bin counts x 4 gain thresholds = 96 combinations
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.impurity, ['gini', 'entropy'])
    .addGrid(classifier.maxDepth, [1, 5, 10, 20])
    .addGrid(classifier.maxBins, [50, 100, 200])
    .addGrid(classifier.minInfoGain, [0.0, 0.05, 0.1, 0.5])
    .build()
)

# Candidates are ranked by accuracy
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    predictionCol='prediction',
    labelCol='income_index',
)

In [None]:
from pyspark.ml.tuning import TrainValidationSplit

# TrainValidationSplit randomly divides train_df into a fitting part and a
# validation part (default ratio) and scores every grid entry on the latter.
# The seed is fixed so the same rows land in each part on every run, which
# keeps the selected hyperparameters repeatable.
validator = TrainValidationSplit(
    estimator=classifier,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    seed=42
)

# The test split is not touched during tuning
validator_model = validator.fit(train_df)

In [None]:
# Model fitted with the best-scoring grid entry
best_model = validator_model.bestModel

# Hyperparameters of the winning model, keyed by the label printed in front of them
best_params = {
    '\tImpurity: ': best_model.getImpurity(),
    '\tMax Depth: ': best_model.getMaxDepth(),
    '\tMax Bins: ': best_model.getMaxBins(),
    '\tMin Info Gain: ': best_model.getMinInfoGain(),
}

print('Best Parameters: ')
for label, value in best_params.items():
    print(label, value)

Best Parameters: 
	Impurity:  gini
	Max Depth:  10
	Max Bins:  100
	Min Info Gain:  0.0


In [None]:
# Score the held-out split with the tuned tree
predictions = best_model.transform(test_df)

preview_cols = ['income_index', 'prediction', 'probability']
predictions.select(*preview_cols).show(5)

+------------+----------+--------------------+
|income_index|prediction|         probability|
+------------+----------+--------------------+
|         0.0|       0.0|           [1.0,0.0]|
|         0.0|       0.0|[0.99834539813857...|
|         0.0|       0.0|[0.99834539813857...|
|         0.0|       0.0|[0.99834539813857...|
|         0.0|       0.0|[0.99834539813857...|
+------------+----------+--------------------+
only showing top 5 rows



In [None]:
# Reuse the tuning evaluator, switching its metric for each score
evaluator.setMetricName('accuracy')
acc = evaluator.evaluate(predictions)

evaluator.setMetricName('f1')
f1 = evaluator.evaluate(predictions)

print('Accuracy: ', round(acc * 100, 2))
print('F1 Score: ', round(f1 * 100, 2))

Accuracy:  85.39
F1 Score:  85.04


## Random Forests

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Baseline random forest with the same columns and bin limit as the baseline tree.
# Forest training is randomized, so the seed is pinned to make the fitted model
# (and the importances and metrics derived from it) repeatable for a given
# training set.
classifier = RandomForestClassifier(
    labelCol='income_index',
    featuresCol='featureVector',
    predictionCol='prediction',
    maxBins=50,
    seed=42
)

model = classifier.fit(train_df)

In [None]:
# Importance of each input column in the fitted forest, highest first
forest_importance = pd.DataFrame(
    {'importance': model.featureImportances.toArray()},
    index=input_cols,
)
forest_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
relationship_index,0.24277
capital-gain,0.223732
marital-status_index,0.220985
education-num,0.117237
occupation_index,0.069872
age,0.044498
capital-loss,0.032781
education_index,0.016782
hours-per-week,0.015819
sex_index,0.010804


In [None]:
# Score the held-out split with the baseline forest
predictions = model.transform(test_df)
(
    predictions
    .select('income_index', 'prediction', 'probability')
    .show(5)
)

+------------+----------+--------------------+
|income_index|prediction|         probability|
+------------+----------+--------------------+
|         0.0|       0.0|[0.96777520731335...|
|         0.0|       0.0|[0.96556166484809...|
|         0.0|       0.0|[0.96556166484809...|
|         0.0|       0.0|[0.96556166484809...|
|         0.0|       0.0|[0.96556166484809...|
+------------+----------+--------------------+
only showing top 5 rows



### Evaluation

In [None]:
# Fresh evaluator for the forest predictions; the metric name is set per score below
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='income_index',
)

evaluator.setMetricName('accuracy')
acc = evaluator.evaluate(predictions)

evaluator.setMetricName('f1')
f1 = evaluator.evaluate(predictions)

print('Accuracy: ', round(acc * 100, 2))
print('F1 Score: ', round(f1 * 100, 2))

Accuracy:  84.59
F1 Score:  83.18


### Hyperparameter Tuning

In [None]:
# Untuned forest estimator; the grid below supplies the hyperparameters.
# The seed is pinned so that every candidate forest is trained repeatably and
# the comparison between grid entries does not depend on random draws.
classifier = RandomForestClassifier(
    labelCol='income_index',
    featuresCol='featureVector',
    predictionCol='prediction',
    seed=42
)

# 2 impurities x 4 depths x 3 bin counts x 3 forest sizes x 4 gain thresholds
# = 288 combinations, each of which is fitted during tuning.
paramGrid = ParamGridBuilder().\
    addGrid(classifier.impurity, ['gini', 'entropy']).\
    addGrid(classifier.maxDepth, [1, 5, 10, 20]).\
    addGrid(classifier.maxBins, [50, 100, 200]).\
    addGrid(classifier.numTrees, [20, 50, 100]).\
    addGrid(classifier.minInfoGain, [0.0, 0.05, 0.1, 0.5]).\
    build()

# Candidates are ranked by accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol='income_index',
    predictionCol='prediction',
    metricName='accuracy'
)

In [None]:
# Tune the forest with a single random train/validation split of train_df.
# The seed is fixed so the same rows are used for validation on every run,
# which keeps the selected hyperparameters repeatable.
validator = TrainValidationSplit(
    estimator=classifier,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    seed=42
)

# The test split is not touched during tuning
validator_model = validator.fit(train_df)

In [None]:
# Forest fitted with the best-scoring grid entry
best_model = validator_model.bestModel

# Hyperparameters of the winning forest, keyed by the label printed in front of them.
# getNumTrees is read without parentheses: on the fitted model it yields the
# tree count directly, as the recorded output shows.
best_params = {
    '\tImpurity: ': best_model.getImpurity(),
    '\tMax Depth: ': best_model.getMaxDepth(),
    '\tMax Bins: ': best_model.getMaxBins(),
    '\tNum Trees: ': best_model.getNumTrees,
    '\tMin Info Gain: ': best_model.getMinInfoGain(),
}

print('Best Parameters: ')
for label, value in best_params.items():
    print(label, value)

Best Parameters: 
	Impurity:  gini
	Max Depth:  20
	Max Bins:  200
	Num Trees:  100
	Min Info Gain:  0.0


In [None]:
# Score the held-out split with the tuned forest
predictions = best_model.transform(test_df)

preview_cols = ['income_index', 'prediction', 'probability']
predictions.select(*preview_cols).show(5)

+------------+----------+--------------------+
|income_index|prediction|         probability|
+------------+----------+--------------------+
|         0.0|       0.0|[0.96898822914188...|
|         0.0|       0.0|[0.99993648554836...|
|         0.0|       0.0|[0.99991175294138...|
|         0.0|       0.0|[0.99990319065643...|
|         0.0|       0.0|[0.99990018790770...|
+------------+----------+--------------------+
only showing top 5 rows



In [None]:
# Reuse the tuning evaluator, switching its metric for each score
evaluator.setMetricName('accuracy')
acc = evaluator.evaluate(predictions)

evaluator.setMetricName('f1')
f1 = evaluator.evaluate(predictions)

print('Accuracy: ', round(acc * 100, 2))
print('F1 Score: ', round(f1 * 100, 2))

Accuracy:  86.66
F1 Score:  86.13
