### Este trabalho busca identificar o melhor algoritmo dentre os seguintes:###  
**(LogisticRegression, DecisionTreeClassifier, RandomForestClassifier)** 

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('adult_logReg').getOrCreate()
dframe = spark.read.csv('adult.csv', inferSchema = True, header=True)
dframe.show(3)

+---+-----------------+------+----------+---------------+-------------------+------------------+--------------+------+------+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt| education|educational-num|     marital-status|        occupation|  relationship|  race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+-----------------+------+----------+---------------+-------------------+------------------+--------------+------+------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516| Bachelors|             13|      Never-married|      Adm-clerical| Not-in-family| White|  Male|        2174|           0|            40| United-States| <=50K|
| 50| Self-emp-not-inc| 83311| Bachelors|             13| Married-civ-spouse|   Exec-managerial|       Husband| White|  Male|           0|           0|            13| United-States| <=50K|
| 38|          Private|215646|   HS-grad|              

In [3]:
cols = dframe.columns

In [4]:
dframe.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]
stages = []

for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

In [6]:
label_stringIdx = StringIndexer(inputCol = 'income', outputCol = 'label')
stages += [label_stringIdx]

In [7]:
numericCols = ["age", "fnlwgt", "educational-num", "capital-gain", "capital-loss", "hours-per-week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [8]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dframe)
dframe = pipelineModel.transform(dframe)
selectedcols = ["label", "features"] + cols
dframe = dframe.select(selectedcols)
dframe.show(3)

+-----+--------------------+---+-----------------+------+----------+---------------+-------------------+------------------+--------------+------+------+------------+------------+--------------+--------------+------+
|label|            features|age|        workclass|fnlwgt| education|educational-num|     marital-status|        occupation|  relationship|  race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+-----+--------------------+---+-----------------+------+----------+---------------+-------------------+------------------+--------------+------+------+------------+------------+--------------+--------------+------+
|  0.0|(100,[4,10,24,32,...| 39|        State-gov| 77516| Bachelors|             13|      Never-married|      Adm-clerical| Not-in-family| White|  Male|        2174|           0|            40| United-States| <=50K|
|  0.0|(100,[1,10,23,31,...| 50| Self-emp-not-inc| 83311| Bachelors|             13| Married-civ-spouse|   Exec-managerial|       Husban

In [9]:
display(dframe)

DataFrame[label: double, features: vector, age: int, workclass: string, fnlwgt: int, education: string, educational-num: int, marital-status: string, occupation: string, relationship: string, race: string, gender: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, income: string]

In [10]:
train, test = dframe.randomSplit([0.7, 0.3], seed=100)
print(train.count())
print(test.count())

22838
9723


### Regressão Logistica ( Logistic Regression )

In [11]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol = 'label', featuresCol = 'features', maxIter=10)
lrModel = lr.fit(train)

In [12]:
predictions = lrModel.transform(test)
predictions.take(1)

[Row(label=0.0, features=SparseVector(100, {0: 1.0, 8: 1.0, 23: 1.0, 29: 1.0, 43: 1.0, 48: 1.0, 52: 1.0, 53: 1.0, 94: 26.0, 95: 58426.0, 96: 9.0, 99: 50.0}), age=26, workclass=' Private', fnlwgt=58426, education=' HS-grad', educational-num=9, marital-status=' Married-civ-spouse', occupation=' Prof-specialty', relationship=' Husband', race=' White', gender=' Male', capital-gain=0, capital-loss=0, hours-per-week=50, native-country=' United-States', income=' <=50K', rawPrediction=DenseVector([0.8111, -0.8111]), probability=DenseVector([0.6923, 0.3077]), prediction=0.0)]

In [13]:
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [14]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [15]:
selected.show(4)

+-----+----------+--------------------+---+---------------+
|label|prediction|         probability|age|     occupation|
+-----+----------+--------------------+---+---------------+
|  0.0|       0.0|[0.69234532795194...| 26| Prof-specialty|
|  0.0|       0.0|[0.62115531452964...| 30| Prof-specialty|
|  0.0|       0.0|[0.65845294177529...| 31| Prof-specialty|
|  0.0|       0.0|[0.65826620022842...| 32| Prof-specialty|
+-----+----------+--------------------+---+---------------+
only showing top 4 rows



In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area Under ROC', evaluator.evaluate(predictions))

Area Under ROC 0.9014206228690932


In [17]:
evaluator.getMetricName()

'areaUnderROC'

In [18]:
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [19]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [20]:
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

cvModel = cv.fit(train)

## Árvore de Decisão ( Decision Trees )

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)

In [22]:
print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

numNodes =  11
depth =  3


In [23]:
predictions = dtModel.transform(test)
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [24]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.7608600693350143

In [26]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

In [27]:
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

cvModel = cv.fit(train)

In [28]:
print("numNodes = ", cvModel.bestModel.numNodes)
print("depth = ", cvModel.bestModel.depth)

numNodes =  449
depth =  10


In [29]:
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

0.7691700713405724

In [30]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [31]:
selected.show(3)

+-----+----------+-----------+---+---------------+
|label|prediction|probability|age|     occupation|
+-----+----------+-----------+---+---------------+
|  0.0|       1.0|  [0.4,0.6]| 26| Prof-specialty|
|  0.0|       0.0|[0.75,0.25]| 30| Prof-specialty|
|  0.0|       0.0|[0.75,0.25]| 31| Prof-specialty|
+-----+----------+-----------+---+---------------+
only showing top 3 rows



### Floresta Aleatória ( Random Forest )

In [32]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [33]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [34]:
selected.show(3)

+-----+----------+--------------------+---+---------------+
|label|prediction|         probability|age|     occupation|
+-----+----------+--------------------+---+---------------+
|  0.0|       0.0|[0.73857356689059...| 26| Prof-specialty|
|  0.0|       0.0|[0.66308056728239...| 30| Prof-specialty|
|  0.0|       0.0|[0.66308056728239...| 31| Prof-specialty|
+-----+----------+--------------------+---+---------------+
only showing top 3 rows



In [35]:
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.8879119846431538

In [36]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20])
             .build())

cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

cvModel = cv.fit(train)

In [37]:
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

0.893426381686389

In [38]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [39]:
selected.show(3)

+-----+----------+--------------------+---+---------------+
|label|prediction|         probability|age|     occupation|
+-----+----------+--------------------+---+---------------+
|  0.0|       0.0|[0.67815793662316...| 26| Prof-specialty|
|  0.0|       0.0|[0.65536739741828...| 30| Prof-specialty|
|  0.0|       0.0|[0.65536739741828...| 31| Prof-specialty|
+-----+----------+--------------------+---+---------------+
only showing top 3 rows



###                                 Conclusão  
Podemos  observar que  mesmo após os  ajustes feitos  em cada  
algoritmo, a acurácia não  produziu uma melhora significativa  
nos mesmos. Entretanto,  ao executar esse código percebemos que  
o algoritmo (Regression Logistic) performou melhor no processamento  
e na acurácia em relação aos demais algorítmos.