# Logistic regression - Titanic dataset

In [1]:
import os

import findspark

# Locate the Spark installation through the SPARK_HOME environment variable
# when it is set, so the notebook is not tied to a single machine's layout.
# The original local path is kept as the fallback for backward compatibility.
findspark.init(os.environ.get("SPARK_HOME", "/home/rodolfo/spark-3.3.1-bin-hadoop3"))
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session for this notebook under the app name "log_reg".
spark = (
    SparkSession.builder
    .appName("log_reg")
    .getOrCreate()
)

22/12/03 15:49:24 WARN Utils: Your hostname, rodolfo-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 192.168.15.11 instead (on interface wlp3s0)
22/12/03 15:49:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/03 15:49:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/03 15:49:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Load the Titanic CSV: the first row is treated as the header and the
# column types are inferred from the data.
df = spark.read.csv(
    path="../../data/titanic.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [8]:
# Keep only the label ("Survived") and the columns used as model inputs.
df_selected = df.select(
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
)

In [10]:
# Discard every row that has a null in any of the selected columns.
df_cleaned = df_selected.dropna()

In [11]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [12]:
# "Sex": string column -> numeric index ("SexIndex") -> one-hot vector ("SexVec").
gender_indexer = StringIndexer(
    inputCol="Sex",
    outputCol="SexIndex",
)
gender_encoder = OneHotEncoder(
    inputCol="SexIndex",
    outputCol="SexVec",
)

# "Embarked": string column -> numeric index ("EmbarkIndex") -> one-hot vector ("EmbarkVec").
embark_indexer = StringIndexer(
    inputCol="Embarked",
    outputCol="EmbarkIndex",
)
embark_encoder = OneHotEncoder(
    inputCol="EmbarkIndex",
    outputCol="EmbarkVec",
)

In [23]:
# Columns combined, in this order, into the single "features" vector
# consumed by the classifier.
feature_columns = [
    "Pclass",
    "SexVec",
    "EmbarkVec",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [24]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [25]:
# Logistic regression that reads the assembled "features" vector and
# uses "Survived" as the label column.
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="Survived",
)

In [26]:
# Stages run in list order: index and encode each categorical column,
# assemble the feature vector, then fit the classifier.
pipeline_stages = [
    gender_indexer,
    gender_encoder,
    embark_indexer,
    embark_encoder,
    assembler,
    log_reg,
]
pipeline = Pipeline(stages=pipeline_stages)

In [27]:
# Split the cleaned data roughly 70% / 30% into training and test sets.
# A fixed seed makes the split, and therefore the fitted model and the
# reported metric, repeatable across kernel restarts.
train, test = df_cleaned.randomSplit([0.7, 0.3], seed=42)

In [28]:
# Fit every pipeline stage (indexers, encoders, assembler, classifier) on the training split.
log_reg_model = pipeline.fit(train)

In [29]:
# Apply the fitted pipeline to the held-out test split to obtain predictions.
results = log_reg_model.transform(test)

In [30]:
# Preview the first 5 rows, including the intermediate and prediction columns added by the pipeline.
results.show(5)

+--------+------+------+----+-----+-----+--------+--------+--------+-------------+-----------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|SexIndex|       SexVec|EmbarkIndex|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+--------+--------+--------+-------------+-----------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|50.0|    0|    0| 28.7125|       C|     1.0|    (1,[],[])|        1.0|(2,[1],[1.0])|(8,[0,3,4,7],[1.0...|[-2.6443904754602...|[0.06633559238801...|       1.0|
|       0|     1|  male|18.0|    1|    0|   108.9|       C|     0.0|(1,[0],[1.0])|        1.0|(2,[1],[1.0])|[1.0,1.0,0.0,1.0,...|[-1.2121717674389...|[0.22931700732258...|       1.0|
|       0|     1|  male|22.0|    0|    0|135.6333|       C|     0.0|(1,[0],[1.0])|   

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [32]:
# Score the test-set predictions against the "Survived" label. No metric is
# specified, so the evaluator's default is used (area under ROC per the
# PySpark documentation).
auc_evaluator = BinaryClassificationEvaluator(labelCol="Survived")
test_auc = auc_evaluator.evaluate(results)

In [33]:
# Display the evaluation metric computed on the test split.
test_auc

0.8033453887884263