<a href="https://colab.research.google.com/github/prithvikavoori/PySparkMllib/blob/main/TitanicSurvivedClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this model we will be using a titanic data set to see if the passengers survived or not using logistic regression Mllib. the data can be found in with name titanic.csv

In [3]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [4]:
# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

In [5]:
# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

In [6]:
# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


In [7]:
# install findspark using pip
!pip install -q findspark

In [9]:

import findspark
findspark.init()

In [10]:
from pyspark.sql import SparkSession

In [11]:
spark = SparkSession.builder.appName('myproj').getOrCreate()

In [13]:
data = spark.read.csv('/content/titanic.csv',inferSchema=True,header=True)

In [14]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [15]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [16]:
my_cols = data.select(['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [17]:
# dropping the missing data 

my_final_data = my_cols.na.drop()

In [18]:
# need to import this libraries to convert string in to number or categorical values 
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [19]:
# StringIndexer is used to convert string in to categorical value 
#oneHotEncoder is used to convert categorical value in to a vector 
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [20]:
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [21]:
# we only want these columns for our classification
assembler = VectorAssembler(inputCols=['Pclass',
 'SexVec',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'EmbarkVec'],outputCol='features')

In [22]:
# importing logistic regresion
from pyspark.ml.classification import LogisticRegression

In [23]:
log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')

In [26]:
from pyspark.ml import Pipeline

In [27]:
# creating a pipeline (the steps at which the model needs to perform)
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,log_reg_titanic])

In [28]:
#spliting the data 
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7,.3])

In [29]:
train_titanic_data.show()

+--------+------+------+----+-----+-----+--------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|
+--------+------+------+----+-----+-----+--------+--------+
|       0|     1|female| 2.0|    1|    2|  151.55|       S|
|       0|     1|female|25.0|    1|    2|  151.55|       S|
|       0|     1|  male|19.0|    1|    0|    53.1|       S|
|       0|     1|  male|19.0|    3|    2|   263.0|       S|
|       0|     1|  male|21.0|    0|    1| 77.2875|       S|
|       0|     1|  male|22.0|    0|    0|135.6333|       C|
|       0|     1|  male|24.0|    0|    0|    79.2|       C|
|       0|     1|  male|24.0|    0|    1|247.5208|       C|
|       0|     1|  male|27.0|    0|    2|   211.5|       C|
|       0|     1|  male|28.0|    1|    0| 82.1708|       C|
|       0|     1|  male|29.0|    0|    0|    30.0|       S|
|       0|     1|  male|31.0|    0|    0| 50.4958|       S|
|       0|     1|  male|31.0|    1|    0|    52.0|       S|
|       0|     1|  male|33.0|    0|    0

In [30]:
train_titanic_data.describe().show()

+-------+-------------------+------------------+------+------------------+------------------+------------------+-----------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             SibSp|             Parch|             Fare|Embarked|
+-------+-------------------+------------------+------+------------------+------------------+------------------+-----------------+--------+
|  count|                520|               520|   520|               520|               520|               520|              520|     520|
|   mean|0.40576923076923077|              2.25|  null|          29.31475|0.5346153846153846|0.4326923076923077|36.81181019230768|    null|
| stddev|0.49151312185443347|0.8276298075020596|  null|14.701287045193146|0.9670654845462235|0.8324398648850077|58.52184258134458|    null|
|    min|                  0|                 1|female|              0.42|                 0|                 0|              0.0|       C|
|    max|           

In [32]:
test_titanic_data.describe().show()

+-------+------------------+------------------+------+------------------+------------------+------------------+------------------+--------+
|summary|          Survived|            Pclass|   Sex|               Age|             SibSp|             Parch|              Fare|Embarked|
+-------+------------------+------------------+------+------------------+------------------+------------------+------------------+--------+
|  count|               192|               192|   192|               192|               192|               192|               192|     192|
|   mean|0.4010416666666667|2.2135416666666665|  null|30.528645833333332|0.4583333333333333|0.4322916666666667|28.488238020833332|    null|
| stddev|0.4913907588019448|0.8629815346488312|  null|13.911668319584058|0.8239436094498386|0.9127364978377743| 32.79225912825016|    null|
|    min|                 0|                 1|female|               1.0|                 0|                 0|               0.0|       C|
|    max|           

In [33]:
#fitting the model on training data 
fit_model = pipeline.fit(train_titanic_data)

In [34]:
results = fit_model.transform(test_titanic_data)

In [36]:
results.show()

+--------+------+------+----+-----+-----+--------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+--------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|50.0|    0|    0| 28.7125|       C|     1.0|        1.0|    (1,[],[])|(2,[1],[1.0])|(8,[0,2,5,7],[1.0...|[-2.3460484847928...|[0.08738037283475...|       1.0|
|       0|     1|  male|18.0|    1|    0|   108.9|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,18.0,1.0...|[-0.9291927716263...|[0.28308851233696...|       1.0|
|       0|     1|  male|28.0|    0|    0|    47.1|       S|     0.0|        0.0|(1,[0

In [37]:
# Evaluating a Model 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [38]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Survived')

In [39]:
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [40]:
#area under curve 
AUC = my_eval.evaluate(results)

In [41]:
AUC

0.8352343308865049