In [1]:
#spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName ("Titanic Logistic Regression").getOrCreate ()

In [2]:
#loading the dataset
data = spark.read.csv ("titanic.csv", inferSchema=True, header=True)

data = data.select (['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
data.count ()

891

In [3]:
#dropping missing values
data = data.na.drop ()
data.count ()

712

In [4]:
#handling categorical values
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorIndexer, VectorAssembler

gender_indexer = StringIndexer (inputCol="Sex", outputCol="SexIndex")
embarked_indexer = StringIndexer (inputCol="Embarked", outputCol="EmbarkedIndex")

encoder = OneHotEncoderEstimator (inputCols=["SexIndex", "EmbarkedIndex"], outputCols=["SexVec", "EmbarkedVec"])

In [5]:
#create assemble for machine learning
assembler = VectorAssembler (inputCols=["Pclass", "SexVec", "EmbarkedVec", "Age", "SibSp", "Parch", "Fare"], outputCol="features")

In [23]:
#without pipelines
from pyspark.ml.classification import LogisticRegression

data1 = gender_indexer.fit (data).transform (data)
data1 = embarked_indexer.fit (data1).transform (data1)
data1 = encoder.fit (data1).transform (data1)
data1 = assembler.transform (data1)
data1.printSchema ()

#train-test spliting
(train1, test1) = data1.randomSplit ([0.7, 0.3])

log_reg = LogisticRegression (featuresCol="features", labelCol="Survived")

log_reg = log_reg.fit (train1)
prediction = log_reg.transform (test1)
prediction.printSchema ()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkedIndex: double (nullable = false)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkedVec: vector (nullable = true)
 |-- features: vector (nullable = true)

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkedIndex: double (nullable = false)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkedVec: vector (nullable = true)
 |-- feat

In [24]:
#create pipeline and fit the model to data
from pyspark.ml import Pipeline

log_reg = LogisticRegression (featuresCol="features", labelCol="Survived")

(train, test) = data.randomSplit ([0.7, 0.3])

pipeline = Pipeline (stages=[gender_indexer, embarked_indexer, encoder, assembler, log_reg])
model = pipeline.fit (train)

In [25]:
#predicting results
prediction = model.transform (test)
prediction.printSchema ()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- EmbarkedIndex: double (nullable = false)
 |-- SexVec: vector (nullable = true)
 |-- EmbarkedVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [26]:
prediction.select ("Survived", "prediction").show ()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
+--------+----------+
only showing top 20 rows



In [27]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator (rawPredictionCol="prediction", labelCol="Survived")

area_under_curve = evaluator.evaluate (prediction)
print (area_under_curve)

0.75
