In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
data = spark.read.csv('titanic.csv', inferSchema=True, header=True)

In [3]:
data = data.select(['Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare'])

In [4]:
from pyspark.ml.feature import Imputer
imputer = Imputer(strategy='mean', inputCols=['Age'], outputCols=['AgeImputed'])
imputer_model = imputer.fit(data)
data = imputer_model.transform(data)

In [5]:
from pyspark.ml.feature import StringIndexer
gender_indexer = StringIndexer(inputCol='Gender', outputCol='GenderIndexed')
gender_indexer_model = gender_indexer.fit(data)
data = gender_indexer_model.transform(data)


In [6]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeImputed', 'GenderIndexed'], outputCol='features')
data = assembler.transform(data)


In [7]:
from pyspark.ml.classification import RandomForestClassifier
algo = RandomForestClassifier(featuresCol='features', labelCol='Survived')
model = algo.fit(data)


In [8]:
predictions = model.transform(data)


In [9]:
predictions.select(['Survived','prediction', 'probability']).show()


+--------+----------+--------------------+
|Survived|prediction|         probability|
+--------+----------+--------------------+
|       0|       0.0|[0.89516011407504...|
|       1|       1.0|[0.02629659675228...|
|       1|       1.0|[0.35888624544936...|
|       1|       1.0|[0.02725813521381...|
|       0|       0.0|[0.88667880367498...|
|       0|       0.0|[0.88667880367498...|
|       0|       0.0|[0.69352672925803...|
|       0|       0.0|[0.78859753881588...|
|       1|       1.0|[0.41908778162468...|
|       1|       1.0|[0.09387279224457...|
|       1|       1.0|[0.31661653809006...|
|       1|       1.0|[0.06019703990179...|
|       0|       0.0|[0.88667880367498...|
|       0|       0.0|[0.84420509784497...|
|       0|       1.0|[0.35909429272511...|
|       1|       1.0|[0.13143343428667...|
|       0|       0.0|[0.84152546674381...|
|       1|       0.0|[0.87601189505102...|
|       0|       0.0|[0.52841529862740...|
|       1|       1.0|[0.32088816228999...|
+--------+-

In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='Survived', metricName='areaUnderROC')

In [11]:
evaluator.evaluate(predictions)


0.8916717263711798

In [12]:
y_true = predictions.select(['Survived']).collect()
y_pred = predictions.select(['prediction']).collect()

In [13]:
from sklearn.metrics import classification_report, confusion_matrix


In [14]:
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.93      0.89       549
           1       0.87      0.72      0.79       342

    accuracy                           0.85       891
   macro avg       0.86      0.83      0.84       891
weighted avg       0.85      0.85      0.85       891



In [15]:
print(confusion_matrix(y_true, y_pred))


[[513  36]
 [ 97 245]]
