# Leitura do arquivo e seleção de colunas

In [1]:
from pyspark.sql.functions import col

# Read the Titanic CSV, taking column names from the first line of the file.
# `spark` is not created in this notebook; it is expected to be provided by
# the environment (e.g. a pyspark shell/kernel).
df = (
    spark.read
    .format("csv")
    .option('header', 'true')
    .load("arquivos/titanic.csv")
)

# Columns kept for modelling. No schema is supplied to the reader above, so the
# label and the numeric predictors are cast to float explicitly; Sex and
# Embarked are left as-is because they are indexed in a later step.
selected_columns = [
    col('Survived').cast('float'),
    col('Pclass').cast('float'),
    col('Sex'),
    col('Age').cast('float'),
    col('Fare').cast('float'),
    col('Embarked'),
]

# Turn literal 'null' strings into real nulls, then discard every row that has
# a null in any of the selected columns.
dataset = (
    df.select(*selected_columns)
    .replace('null', None)
    .dropna(how='any')
)

# Indexando colunas string 

In [2]:
from pyspark.ml.feature import StringIndexer

# Each (source, target) pair is indexed in turn: the indexer is fitted on the
# current dataset and immediately applied to it, adding the numeric target
# column next to the original string column.
for source_col, indexed_col in (('Sex', 'Gender'), ('Embarked', 'Boarded')):
    indexer = StringIndexer(
        inputCol=source_col,
        outputCol=indexed_col,
        handleInvalid='keep',
    )
    dataset = indexer.fit(dataset).transform(dataset)

# The string originals are no longer needed once their indexed versions exist.
dataset = dataset.drop('Sex', 'Embarked')
dataset.show(3)

+--------+------+----+-------+------+-------+
|Survived|Pclass| Age|   Fare|Gender|Boarded|
+--------+------+----+-------+------+-------+
|     0.0|   3.0|22.0|   7.25|   0.0|    0.0|
|     1.0|   1.0|38.0|71.2833|   1.0|    1.0|
|     1.0|   3.0|26.0|  7.925|   1.0|    0.0|
+--------+------+----+-------+------+-------+
only showing top 3 rows



# Criação do vetor de features

In [3]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns, in the order they will appear inside the feature vector.
required_features = ['Pclass', 'Age', 'Fare', 'Gender', 'Boarded']

# Pack the predictor columns into a single vector column named 'features',
# which is the column the classifier is later pointed at.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=required_features,
)
transformed_data = assembler.transform(dataset)

# truncate=False so the full vector contents are visible in the preview.
transformed_data.show(3, truncate=False)

+--------+------+----+-------+------+-------+------------------------------------+
|Survived|Pclass|Age |Fare   |Gender|Boarded|features                            |
+--------+------+----+-------+------+-------+------------------------------------+
|0.0     |3.0   |22.0|7.25   |0.0   |0.0    |[3.0,22.0,7.25,0.0,0.0]             |
|1.0     |1.0   |38.0|71.2833|1.0   |1.0    |[1.0,38.0,71.2833023071289,1.0,1.0] |
|1.0     |3.0   |26.0|7.925  |1.0   |0.0    |[3.0,26.0,7.925000190734863,1.0,0.0]|
+--------+------+----+-------+------+-------+------------------------------------+
only showing top 3 rows



# Split dos dados em treino e teste (80:20)

In [4]:
# Split the assembled rows into roughly 80% training / 20% test.
# A fixed seed is passed so that the same rows land in each partition on every
# run; without it the split (and therefore the reported accuracy) changes each
# time the notebook is executed.
# NOTE(review): reproducibility also assumes the input is read with the same
# partitioning/row order on each run — confirm for this environment.
(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2], seed=42)

# Fit do modelo RandomForestClassifier

In [7]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the assembled 'features' vector, predicting 'Survived'.
# Trees are limited to depth 3. The seed is fixed because a random forest
# samples rows and features while training; leaving it unset makes the fitted
# model, and the accuracy computed from it, vary between runs.
rf = RandomForestClassifier(
    labelCol='Survived',
    featuresCol='features',
    maxDepth=3,
    seed=42,
)

# Train on the training partition only; the test partition is kept for scoring.
model = rf.fit(training_data)

# Predição dos dados de teste

In [8]:
# Apply the fitted model to the held-out test rows; the resulting DataFrame is
# what the evaluator reads its 'prediction' column from.
predictions = model.transform(dataset=test_data)

# Avaliação do modelo (acurácia)

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the 'prediction' column against the true 'Survived' label and report
# plain accuracy on the test predictions.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Survived',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)

print('Test Accuracy = ', accuracy)

Test Accuracy =  0.8102189781021898
