<a href="https://colab.research.google.com/github/sk-monirul-islam-1/Titanic_survival-prediction_using_pipelines/blob/main/Pipeline_with_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from datetime import datetime
import pytz
print('Time to do something good',datetime.now(pytz.timezone('Asia/Calcutta')))

Time to do something good 2024-01-02 21:27:30.389737+05:30


#Installation of spark

In [2]:
!pip3 -q install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Praxis').master("local[*]").getOrCreate()
sc = spark.sparkContext
#sc

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


## Importing necessary libraries

In [3]:
from pyspark.sql.functions import mean
from pyspark.ml.feature import (VectorAssembler,OneHotEncoder, StringIndexer)
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Data

## Data Collection

In [4]:
!wget -O titanic_dataset.csv -q https://github.com/datasciencedojo/datasets/raw/master/titanic.csv

In [5]:
#loading dataset into spark dataframe
DF = spark.read.csv('titanic_dataset.csv', inferSchema=True,header=True)
DF.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

## Data Understanding


In [6]:
print((DF.count(),len(DF.columns)))

(891, 12)


In [7]:
DF.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [8]:
#descriptive analysis
DF.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                NULL|  NULL| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [9]:
#count of different fares
DF.groupBy('Fare').count().show()

+-------+-----+
|   Fare|count|
+-------+-----+
| 8.5167|    1|
|   15.5|    8|
| 133.65|    2|
| 29.125|    5|
|10.4625|    2|
| 7.0458|    1|
|  9.475|    1|
|11.1333|    3|
|    0.0|   15|
| 7.7333|    4|
|   73.5|    5|
|77.2875|    2|
|   15.9|    2|
|   11.5|    4|
| 8.6833|    1|
|41.5792|    3|
|    9.5|    9|
| 8.4042|    1|
|14.4542|    7|
|14.4583|    3|
+-------+-----+
only showing top 20 rows



## Using Spark - SQL

In [10]:
# temporary view of the data
DF.createOrReplaceTempView('DF_T')

In [11]:
#group by gender
spark.sql(\
          "SELECT \
           Sex, count(Sex) as count_gender, \
           count(Sex)*100/sum(count(Sex)) over() as percent  \
           FROM DF_T GROUP BY Sex" \
           ).show()

+------+------------+-----------------+
|   Sex|count_gender|          percent|
+------+------------+-----------------+
|female|         314|35.24130190796858|
|  male|         577|64.75869809203142|
+------+------------+-----------------+



In [12]:
#group by gender having fare more than 100
spark.sql(\
          "SELECT Sex, count(Sex), \
          round((COUNT(Sex) * 100.0) /(SELECT count(Sex) FROM DF_T ),2) as percentage \
          FROM DF_T WHERE Fare > 100  GROUP BY Sex"\
          ).show()

+------+----------+----------+
|   Sex|count(Sex)|percentage|
+------+----------+----------+
|female|        34|      3.82|
|  male|        19|      2.13|
+------+----------+----------+



In [13]:
#group by gender having age exactly 20
spark.sql(\
          "SELECT Sex, count(Sex), \
          round((COUNT(Sex) * 100.0) /(SELECT count(Sex) FROM DF_T ),2) as percentage \
          FROM DF_T WHERE Age = 20  GROUP BY Sex"\
          ).show()

+------+----------+----------+
|   Sex|count(Sex)|percentage|
+------+----------+----------+
|female|         2|      0.22|
|  male|        13|      1.46|
+------+----------+----------+



In [14]:
#group by gender with Pclass exactly 1
spark.sql(\
          "SELECT Sex, count(Sex), \
          round((COUNT(Sex) * 100.0) /(SELECT count(Sex) FROM DF_T ),2) as percentage \
          FROM DF_T WHERE  Pclass = 1  GROUP BY Sex"\
          ).show()

+------+----------+----------+
|   Sex|count(Sex)|percentage|
+------+----------+----------+
|female|        94|     10.55|
|  male|       122|     13.69|
+------+----------+----------+



In [15]:
#count of different types of pclass
DF.groupBy('Pclass').count().show()

+------+-----+
|Pclass|count|
+------+-----+
|     1|  216|
|     3|  491|
|     2|  184|
+------+-----+



In [16]:
#ticket vs count of fare amount
spark.sql("SELECT Ticket, count(Fare) as fare_amt_count FROM DF_T WHERE Pclass == 2 GROUP BY Ticket ORDER BY fare_amt_count DESC").show()

+-------------+--------------+
|       Ticket|fare_amt_count|
+-------------+--------------+
| S.O.C. 14879|             5|
|       239853|             3|
|   C.A. 34651|             3|
|       230080|             3|
| F.C.C. 13529|             3|
|        29106|             3|
|SC/Paris 2123|             3|
|   C.A. 31921|             3|
|       248727|             3|
|        11668|             2|
|        31027|             2|
|       250647|             2|
|       244252|             2|
|   C.A. 33112|             2|
|        28403|             2|
|       239865|             2|
|         2908|             2|
|       230433|             2|
|       248738|             2|
|       231919|             2|
+-------------+--------------+
only showing top 20 rows



## Data Preprocessing

In [17]:
#checking null values
DF.toPandas().isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [18]:
#droping unwanted columns
DF = DF.drop('Cabin')
DF = DF.dropna(subset=['Embarked'])
DF = DF.drop('PassengerId')
DF = DF.drop('Name')
DF = DF.drop('Ticket')

In [19]:
#filling null values of age column
cmean = DF.select(mean(DF['Age'])).collect()
meanage = cmean[0][0]
DF = DF.na.fill(meanage,['Age'])

In [20]:
#checking null values again after filling null values
DF.toPandas().isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [21]:
DF.describe().show()
# note, mean BMI has not changed, but std of BMI has reduced, as expected

+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|             Fare|Embarked|
+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|  count|                889|               889|   889|               889|               889|                889|              889|     889|
|   mean|0.38245219347581555|2.3115860517435323|  NULL|29.642092696629106|0.5241844769403825|0.38245219347581555|32.09668087739029|    NULL|
| stddev|0.48625968831477334|0.8346997785705753|  NULL|12.968346294351782| 1.103704875596923| 0.8067607445174785|49.69750431670795|    NULL|
|    min|                  0|                 1|female|              0.42|                 0|                  0|              0.0|       C|
|    max|    

In [22]:
DF.dtypes

[('Survived', 'int'),
 ('Pclass', 'int'),
 ('Sex', 'string'),
 ('Age', 'double'),
 ('SibSp', 'int'),
 ('Parch', 'int'),
 ('Fare', 'double'),
 ('Embarked', 'string')]

## String Indexer

In [23]:
# There are two categorical varibales sex and smoking Embarked
# indexing all categorical columns in the dataset

SexIndexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
EmbarkedIndexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkedIndex")

In [24]:
DF.show(5)

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
+--------+------+------+----+-----+-----+-------+--------+
only showing top 5 rows



In [25]:
# Using any one String Indexer
EmbarkedIndexer.fit(DF).transform(DF).show(5)

+--------+------+------+----+-----+-----+-------+--------+-------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|EmbarkedIndex|
+--------+------+------+----+-----+-----+-------+--------+-------------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|          0.0|
|       1|     1|female|38.0|    1|    0|71.2833|       C|          1.0|
|       1|     3|female|26.0|    0|    0|  7.925|       S|          0.0|
|       1|     1|female|35.0|    1|    0|   53.1|       S|          0.0|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|          0.0|
+--------+------+------+----+-----+-----+-------+--------+-------------+
only showing top 5 rows



In [26]:
# Putting TWO indexers(Pipeline construction)
SexIndexer.fit(EmbarkedIndexer.fit(DF).transform(DF)).transform(EmbarkedIndexer.fit(DF).transform(DF)).show(5)

+--------+------+------+----+-----+-----+-------+--------+-------------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|EmbarkedIndex|SexIndex|
+--------+------+------+----+-----+-----+-------+--------+-------------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|          0.0|     0.0|
|       1|     1|female|38.0|    1|    0|71.2833|       C|          1.0|     1.0|
|       1|     3|female|26.0|    0|    0|  7.925|       S|          0.0|     1.0|
|       1|     1|female|35.0|    1|    0|   53.1|       S|          0.0|     1.0|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|          0.0|     0.0|
+--------+------+------+----+-----+-----+-------+--------+-------------+--------+
only showing top 5 rows



## Using One Hot Encoder for categorical variables

In [27]:
OHE_Sex = OneHotEncoder(inputCols=["SexIndex"], outputCols=["SexVec"])
OHE_Sex.fit(SexIndexer.fit(DF).transform(DF)).transform(SexIndexer.fit(DF).transform(DF)).show(5)

+--------+------+------+----+-----+-----+-------+--------+--------+-------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|SexIndex|       SexVec|
+--------+------+------+----+-----+-----+-------+--------+--------+-------------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|     0.0|(1,[0],[1.0])|
|       1|     1|female|38.0|    1|    0|71.2833|       C|     1.0|    (1,[],[])|
|       1|     3|female|26.0|    0|    0|  7.925|       S|     1.0|    (1,[],[])|
|       1|     1|female|35.0|    1|    0|   53.1|       S|     1.0|    (1,[],[])|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|     0.0|(1,[0],[1.0])|
+--------+------+------+----+-----+-----+-------+--------+--------+-------------+
only showing top 5 rows



In [28]:
OHE_Embarked = OneHotEncoder(inputCols=["EmbarkedIndex"], outputCols=["EmbarkedVec"])
OHE_Embarked.fit(EmbarkedIndexer.fit(DF).transform(DF)).transform(EmbarkedIndexer.fit(DF).transform(DF)).show(5)

+--------+------+------+----+-----+-----+-------+--------+-------------+-------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|EmbarkedIndex|  EmbarkedVec|
+--------+------+------+----+-----+-----+-------+--------+-------------+-------------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|          0.0|(2,[0],[1.0])|
|       1|     1|female|38.0|    1|    0|71.2833|       C|          1.0|(2,[1],[1.0])|
|       1|     3|female|26.0|    0|    0|  7.925|       S|          0.0|(2,[0],[1.0])|
|       1|     1|female|35.0|    1|    0|   53.1|       S|          0.0|(2,[0],[1.0])|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|          0.0|(2,[0],[1.0])|
+--------+------+------+----+-----+-----+-------+--------+-------------+-------------+
only showing top 5 rows



In [29]:
# Simultaneous Encoding of Both
OHE_Sex_Embarked = OneHotEncoder(inputCols=["SexIndex","EmbarkedIndex"],
                                 outputCols=["SexVec","EmbarkedVec"])

In [30]:
OHE_Sex_Embarked.fit(SexIndexer.fit(EmbarkedIndexer.fit(DF).transform(DF)).transform(EmbarkedIndexer.fit(DF).transform(DF))).transform(SexIndexer.fit(EmbarkedIndexer.fit(DF).transform(DF)).transform(EmbarkedIndexer.fit(DF).transform(DF))).show(5)

+--------+------+------+----+-----+-----+-------+--------+-------------+--------+-------------+-------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|EmbarkedIndex|SexIndex|       SexVec|  EmbarkedVec|
+--------+------+------+----+-----+-----+-------+--------+-------------+--------+-------------+-------------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|          0.0|     0.0|(1,[0],[1.0])|(2,[0],[1.0])|
|       1|     1|female|38.0|    1|    0|71.2833|       C|          1.0|     1.0|    (1,[],[])|(2,[1],[1.0])|
|       1|     3|female|26.0|    0|    0|  7.925|       S|          0.0|     1.0|    (1,[],[])|(2,[0],[1.0])|
|       1|     1|female|35.0|    1|    0|   53.1|       S|          0.0|     1.0|    (1,[],[])|(2,[0],[1.0])|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|          0.0|     0.0|(1,[0],[1.0])|(2,[0],[1.0])|
+--------+------+------+----+-----+-----+-------+--------+-------------+--------+-------------+-------------+
only showi

## Assembler

In [31]:
#taking all the columns except Survived as features
F_assembler = VectorAssembler(inputCols=['SexVec',
 'SibSp',
 'Parch',
 'Fare',
 'Pclass',
 'Age',
 'EmbarkedVec'],outputCol='features')

In [32]:
#transformation
F_assembler.transform(OHE_Sex_Embarked.fit(SexIndexer.fit(EmbarkedIndexer.fit(DF).transform(DF)).transform(EmbarkedIndexer.fit(DF).transform(DF))).transform(SexIndexer.fit(EmbarkedIndexer.fit(DF).transform(DF)).transform(EmbarkedIndexer.fit(DF).transform(DF)))).show(5)

+--------+------+------+----+-----+-----+-------+--------+-------------+--------+-------------+-------------+--------------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|EmbarkedIndex|SexIndex|       SexVec|  EmbarkedVec|            features|
+--------+------+------+----+-----+-----+-------+--------+-------------+--------+-------------+-------------+--------------------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|          0.0|     0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,0.0,7.25...|
|       1|     1|female|38.0|    1|    0|71.2833|       C|          1.0|     1.0|    (1,[],[])|(2,[1],[1.0])|[0.0,1.0,0.0,71.2...|
|       1|     3|female|26.0|    0|    0|  7.925|       S|          0.0|     1.0|    (1,[],[])|(2,[0],[1.0])|(8,[3,4,5,6],[7.9...|
|       1|     1|female|35.0|    1|    0|   53.1|       S|          0.0|     1.0|    (1,[],[])|(2,[0],[1.0])|[0.0,1.0,0.0,53.1...|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|          0.0|     0.0|(1

# Data Partitioning for Training and Testing

In [33]:
# splitting training and validation data
train_df,val_df = DF.randomSplit([0.7,0.3])
print(train_df.count())
print(val_df.count())

617
272


#Apply Different Techniques

In [34]:
basePipe = Pipeline(stages=[SexIndexer, EmbarkedIndexer, OHE_Sex_Embarked, F_assembler])
basePipe.fit(DF).transform(DF).show(5)

+--------+------+------+----+-----+-----+-------+--------+--------+-------------+-------------+-------------+--------------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|SexIndex|EmbarkedIndex|       SexVec|  EmbarkedVec|            features|
+--------+------+------+----+-----+-----+-------+--------+--------+-------------+-------------+-------------+--------------------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|     0.0|          0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,0.0,7.25...|
|       1|     1|female|38.0|    1|    0|71.2833|       C|     1.0|          1.0|    (1,[],[])|(2,[1],[1.0])|[0.0,1.0,0.0,71.2...|
|       1|     3|female|26.0|    0|    0|  7.925|       S|     1.0|          0.0|    (1,[],[])|(2,[0],[1.0])|(8,[3,4,5,6],[7.9...|
|       1|     1|female|35.0|    1|    0|   53.1|       S|     1.0|          0.0|    (1,[],[])|(2,[0],[1.0])|[0.0,1.0,0.0,53.1...|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|     0.0|          0.0|(1

## Logistic Regression Model Pipeline

In [35]:
lr = LogisticRegression(labelCol='Survived',featuresCol='features',maxIter=5)
lr_pipeline = Pipeline(stages=[basePipe,lr])
lr_model = lr_pipeline.fit(train_df)
lr_predictions=lr_model.transform(val_df)

In [36]:
lr_predictions.show(5)

+--------+------+----+-----------------+-----+-----+-------+--------+--------+-------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass| Sex|              Age|SibSp|Parch|   Fare|Embarked|SexIndex|EmbarkedIndex|       SexVec|  EmbarkedVec|            features|       rawPrediction|         probability|prediction|
+--------+------+----+-----------------+-----+-----+-------+--------+--------+-------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|male|             18.0|    1|    0|  108.9|       C|     0.0|          1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,0.0,108....|[-0.8418351196440...|[0.30114842741239...|       1.0|
|       0|     1|male|             19.0|    3|    2|  263.0|       S|     0.0|          0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,3.0,2.0,263....|[0.38857473921299...|[0.59593954974641...|       0.0|
|       0|     1|male|   

In [37]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
evaluator = BinaryClassificationEvaluator(labelCol='Survived')

In [38]:
lr_acc=acc_evaluator.evaluate(lr_predictions)
print(round(lr_acc,3), 'is the accuray of the LR pipeline')
lr_auroc = evaluator.evaluate(lr_predictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(lr_auroc))



0.816 is the accuray of the LR pipeline
Area under ROC Curve: 0.8348


## Random Forest Pipeline

In [39]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'Survived')
rf_pipeline = Pipeline(stages=[basePipe, rf])
rf_model = rf_pipeline.fit(train_df)
rf_predictions=rf_model.transform(val_df)

In [40]:
rf_acc=acc_evaluator.evaluate(rf_predictions)
print('A Random Forest algorithm had an accuracy of: {0:2.2f}%'.format(rf_acc*100))
rf_auroc = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(rf_auroc))

A Random Forest algorithm had an accuracy of: 83.46%
Area under ROC Curve: 0.8426


## Gradient Boost Model Pipeline

In [41]:
gbt = GBTClassifier(labelCol='Survived',featuresCol='features')
gbt_pipeline = Pipeline(stages=[basePipe, gbt])
gbt_model = gbt_pipeline.fit(train_df)
gbt_predictions = gbt_model.transform(val_df)

In [42]:
gbt_acc = acc_evaluator.evaluate(gbt_predictions)
print('Gradient Boost algorithm had an accuracy of: {0:2.2f}%'.format(gbt_acc*100))
gbt_auroc = evaluator.evaluate(gbt_predictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(gbt_auroc))

Gradient Boost algorithm had an accuracy of: 82.72%
Area under ROC Curve: 0.8433


# Comparison of the Models


In [43]:
print(round(lr_acc,3), 'is the accuray of the LR pipeline')
print(round(rf_acc,3), 'is the accuray of the RF pipeline')
print(round(gbt_acc,3), 'is the accuray of the GBT pipeline')

print(round(lr_auroc,3), 'is area under ROC curve of the LR pipeline')
print(round(rf_auroc,3), 'is area under ROC curve of the RF pipeline')
print(round(gbt_auroc,3), 'is area under ROC curve of the GBT pipeline')

0.816 is the accuray of the LR pipeline
0.835 is the accuray of the RF pipeline
0.827 is the accuray of the GBT pipeline
0.835 is area under ROC curve of the LR pipeline
0.843 is area under ROC curve of the RF pipeline
0.843 is area under ROC curve of the GBT pipeline


# Transform Once, Employ Various Models



In [44]:
commonEstimator = Pipeline(stages=[SexIndexer, EmbarkedIndexer, OHE_Sex_Embarked, F_assembler])
commonModel = commonEstimator.fit(train_df)

commonTrainHeart = commonModel.transform(train_df)
commonTrainHeart.show(5)

+--------+------+------+----+-----+-----+-------+--------+--------+-------------+-------------+-------------+--------------------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|SexIndex|EmbarkedIndex|       SexVec|  EmbarkedVec|            features|
+--------+------+------+----+-----+-----+-------+--------+--------+-------------+-------------+-------------+--------------------+
|       0|     1|female| 2.0|    1|    2| 151.55|       S|     1.0|          0.0|    (1,[],[])|(2,[0],[1.0])|[0.0,1.0,2.0,151....|
|       0|     1|female|25.0|    1|    2| 151.55|       S|     1.0|          0.0|    (1,[],[])|(2,[0],[1.0])|[0.0,1.0,2.0,151....|
|       0|     1|female|50.0|    0|    0|28.7125|       C|     1.0|          1.0|    (1,[],[])|(2,[1],[1.0])|(8,[3,4,5,7],[28....|
|       0|     1|  male|19.0|    1|    0|   53.1|       S|     0.0|          0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,0.0,53.1...|
|       0|     1|  male|21.0|    0|    1|77.2875|       S|     0.0|          0.0|(1

##Logistic Regression

In [45]:
lr0 = LogisticRegression(labelCol='Survived',featuresCol='features',maxIter=5)
lr0Estimator = Pipeline(stages=[lr0])
lr0model = lr0Estimator.fit(commonTrainHeart)

lr1model = PipelineModel(stages = [commonModel , lr0model])         # Adding two models together
lr1model_predictions = lr1model.transform(val_df)

In [46]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
lr1_acc=acc_evaluator.evaluate(lr1model_predictions)

print(round(lr1_acc,3), 'is the accuray of the new LR pipeline')
evaluator = BinaryClassificationEvaluator(labelCol='Survived')
lr1_auroc = evaluator.evaluate(lr1model_predictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(lr1_auroc))


0.816 is the accuray of the new LR pipeline
Area under ROC Curve: 0.8348


## Random Forest

In [47]:
rf0 = RandomForestClassifier(featuresCol = 'features', labelCol = 'Survived')
rf0Estimator = Pipeline(stages=[rf0])
rf0model = rf0Estimator.fit(commonTrainHeart)

rf1model = PipelineModel(stages = [commonModel , rf0model])         # Adding two models together
rf1model_predictions = rf1model.transform(val_df)


In [48]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
rf1_acc=acc_evaluator.evaluate(rf1model_predictions)

print(round(rf1_acc,3), 'is the accuray of the new RF pipeline')
evaluator = BinaryClassificationEvaluator(labelCol='Survived')
rf1_auroc = evaluator.evaluate(rf1model_predictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(rf1_auroc))

0.835 is the accuray of the new RF pipeline
Area under ROC Curve: 0.8426


##Gradient Boost

In [49]:
gbt0 = GBTClassifier(featuresCol = 'features', labelCol = 'Survived')
gbt0Estimator = Pipeline(stages=[gbt0])
gbt0model = gbt0Estimator.fit(commonTrainHeart)

gbt1model = PipelineModel(stages = [commonModel , gbt0model])         # Adding two models together
gbt1model_predictions = gbt1model.transform(val_df)

In [50]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
gbt1_acc=acc_evaluator.evaluate(gbt1model_predictions)

print(round(gbt1_acc,3), 'is the accuray of the new GBT pipeline')
evaluator = BinaryClassificationEvaluator(labelCol='Survived')
gbt1_auroc = evaluator.evaluate(gbt1model_predictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(gbt1_auroc))

0.827 is the accuray of the new GBT pipeline
Area under ROC Curve: 0.8433


In [51]:
!date
from datetime import datetime
import pytz
print('Time to disconnect',datetime.now(pytz.timezone('Asia/Calcutta')))

Tue Jan  2 03:59:27 PM UTC 2024
Time to disconnect 2024-01-02 21:29:27.746446+05:30
