In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
data = spark.read.csv('../data/titanic.csv', inferSchema=True, header=True)

In [4]:
data = data.select(['Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare'])

In [5]:
from pyspark.ml.feature import Imputer
imputer = Imputer(strategy='mean', inputCols=['Age'], outputCols=['AgeImputed'])
imputer_model = imputer.fit(data)
data = imputer_model.transform(data)

In [6]:
from pyspark.ml.feature import StringIndexer
gender_indexer = StringIndexer(inputCol='Gender', outputCol='GenderIndexed')
gender_indexer_model = gender_indexer.fit(data)
data = gender_indexer_model.transform(data)


In [7]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeImputed', 'GenderIndexed'], outputCol='features')
data = assembler.transform(data)


In [8]:
from pyspark.ml.classification import RandomForestClassifier
algo = RandomForestClassifier(featuresCol='features', labelCol='Survived')
model = algo.fit(data)


In [9]:
predictions = model.transform(data)


In [10]:
predictions.select(['Survived','prediction', 'probability']).show()


+--------+----------+--------------------+
|Survived|prediction|         probability|
+--------+----------+--------------------+
|       0|       0.0|[0.90333751780077...|
|       1|       1.0|[0.04997447519435...|
|       1|       1.0|[0.39005686970077...|
|       1|       1.0|[0.05711733233721...|
|       0|       0.0|[0.89554039794208...|
|       0|       0.0|[0.89399617279588...|
|       0|       0.0|[0.72310976694332...|
|       0|       0.0|[0.79435580534121...|
|       1|       1.0|[0.39841112301479...|
|       1|       1.0|[0.12417054023073...|
|       1|       1.0|[0.34283406765706...|
|       1|       1.0|[0.12804789229216...|
|       0|       0.0|[0.89294457093724...|
|       0|       0.0|[0.80536032807842...|
|       0|       1.0|[0.32997886229495...|
|       1|       1.0|[0.24335386510225...|
|       0|       0.0|[0.83763643775228...|
|       1|       0.0|[0.87314522473429...|
|       0|       1.0|[0.42846227415375...|
|       1|       1.0|[0.30282274792971...|
+--------+-

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='Survived', metricName='areaUnderROC')

In [12]:
evaluator.evaluate(predictions)


0.8961908414022306

In [13]:
y_true = predictions.select(['Survived']).collect()
y_pred = predictions.select(['prediction']).collect()

In [15]:
from sklearn.metrics import classification_report, confusion_matrix


In [16]:
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.91      0.87       549
           1       0.84      0.72      0.77       342

    accuracy                           0.84       891
   macro avg       0.84      0.82      0.82       891
weighted avg       0.84      0.84      0.84       891



In [17]:
print(confusion_matrix(y_true, y_pred))


[[501  48]
 [ 96 246]]
