In [1]:
import warnings
# Ignore every warning raised through Python's `warnings` module from here on.
# NOTE: this only affects Python-side warnings; Spark's JVM log output is not
# controlled by this filter.
warnings.filterwarnings("ignore")

In [2]:
import os
import findspark
# Locate the Spark installation and add it to sys.path.
# Prefer the SPARK_HOME environment variable so the notebook runs on any
# machine; fall back to the original local install path when it is not set.
findspark.init(os.environ.get("SPARK_HOME", "/home/rajdeep/spark-3.5.0-bin-hadoop3"))

In [3]:
from pyspark.sql import SparkSession

In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer,VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("titanic")
    .getOrCreate()
)

23/11/25 21:16:19 WARN Utils: Your hostname, DESKTOP-CSFBOLK resolves to a loopback address: 127.0.1.1; using 172.19.12.103 instead (on interface eth0)
23/11/25 21:16:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/25 21:16:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Load the Titanic CSV into a DataFrame: the first line is the header and
# column types are inferred from the data
df = spark.read.csv(
    "data/titanic.csv",
    header=True,
    inferSchema=True,
)

                                                                                

In [7]:
# Preview the first rows of the freshly loaded DataFrame
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [8]:
# Print the column names, data types and nullability inferred from the CSV
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [9]:
# List the available column names
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [10]:
# Keep only the label ('Survived') and the columns used as model features
df = df.select(
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
)

In [11]:
# Discard every row that has a null in any of the remaining columns
df = df.dropna()

In [12]:
# Map each distinct 'Sex' string to a numeric category index
genderIndexer = StringIndexer(
    inputCol='Sex',
    outputCol='sexIndex',
)
# One-hot encode the index so that no category is given a higher weight
# than another merely because of its index value
genderEncoder = OneHotEncoder(
    inputCol='sexIndex',
    outputCol='sexVec',
)

In [13]:
# Map each distinct 'Embarked' string to a numeric category index
embarkedIndexer = StringIndexer(
    inputCol='Embarked',
    outputCol='embarkedIndex',
)
# One-hot encode the index so that no category is given a higher weight
# than another merely because of its index value
embarkedEncoder = OneHotEncoder(
    inputCol='embarkedIndex',
    outputCol='embarkedVec',
)

In [14]:
# Combine the numeric columns and the one-hot vectors into a single
# 'features' vector column for the ML model
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
        'sexVec',
        'embarkedVec',
    ],
    outputCol='features',
)

In [15]:
# Configure the logistic-regression estimator: it reads the 'features'
# vector and learns to predict the 'Survived' label
lr_model = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [16]:
# Chain the preprocessing stages and the classifier into one pipeline so the
# same steps are performed on both the train and the test data
pipeline = Pipeline(
    stages=[
        genderIndexer,
        genderEncoder,
        embarkedIndexer,
        embarkedEncoder,
        assembler,
        lr_model,
    ]
)

In [17]:
# splitting the available data into train (70%) and test (30%) DataFrames
# A fixed seed is passed so the split -- and therefore the fitted model and the
# reported metric -- is reproducible across kernel restarts.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Fit every pipeline stage (indexers, encoders, classifier) on the training split
model = pipeline.fit(dataset=train_df)

23/11/25 21:16:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [19]:
# Run the fitted pipeline on the held-out test split to obtain predictions
result_df = model.transform(dataset=test_df)

In [20]:
#selecting the necessary columns
result_df = result_df.select(['Survived','prediction'])

In [21]:
# evaluating the result
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [22]:
# Compute the evaluator's metric (area under the ROC curve) on the scored test rows
auc = evaluator.evaluate(dataset=result_df)

In [23]:
# Display the metric value returned by the evaluator
auc

0.805089219723366

In [24]:
# Preview the first rows of the results: true label alongside the model output
result_df.show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows

