In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook runs on machines other than the original author's; fall back to
# the original hard-coded install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/siddharth/spark-2.4.1-bin-hadoop2.7/'))

from pyspark.sql import SparkSession

In [2]:
# Start a Spark session named 'log_reg_titanic', or reuse one that already exists.
session_builder = SparkSession.builder.appName('log_reg_titanic')
spark = session_builder.getOrCreate()

In [3]:
# Load the Titanic CSV: the first row holds the column names and Spark infers
# each column's type from the file contents.
titanic_csv_path = './Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/titanic.csv'
data = spark.read.csv(titanic_csv_path, header=True, inferSchema=True)

In [4]:
# Show the column names and the types Spark inferred while reading the CSV.
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
# List the available column names (used to choose the modelling subset).
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [6]:
# Keep the label ('Survived') plus the columns used as predictors; the
# remaining columns (PassengerId, Name, Ticket, Cabin) are left out.
selected_columns = [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
final_data = data.select(selected_columns)

In [7]:
# Summary statistics per column. The 'count' row exposes missing values:
# a column whose count is below the row total contains nulls.
final_data.describe().show()

+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|summary|           Survived|            Pclass|   Sex|               Age|             SibSp|              Parch|             Fare|Embarked|
+-------+-------------------+------------------+------+------------------+------------------+-------------------+-----------------+--------+
|  count|                891|               891|   891|               714|               891|                891|              891|     889|
|   mean| 0.3838383838383838| 2.308641975308642|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824| 32.2042079685746|    null|
| stddev|0.48659245426485753|0.8360712409770491|  null|14.526497332334035|1.1027434322934315| 0.8060572211299488|49.69342859718089|    null|
|    min|                  0|                 1|female|              0.42|                 0|                  0|              0.0|       C|
|    max|    

In [8]:
# Discard every row that has a null in at least one of the selected columns.
final_data = final_data.na.drop(how='any')

## Working with categorical data

In [9]:
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder

In [10]:
# 'Sex' is a string column: map it to a numeric index, then one-hot encode it.
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='sex_index',
)
gender_encoder = OneHotEncoder(
    inputCol='sex_index',
    outputCol='sex_ohe',
)

In [11]:
# Same two-step encoding for the 'Embarked' string column.
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='embarked_index',
)
embark_encoder = OneHotEncoder(
    inputCol='embarked_index',
    outputCol='embarked_ohe',
)

In [12]:
# Combine the numeric columns and the one-hot encoded categorical columns into
# the single 'features' vector column.
feature_columns = [
    'Pclass',
    'sex_ohe',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'embarked_ohe',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [13]:
from pyspark.ml.classification import LogisticRegression

## Working with pipelines

In [14]:
from pyspark.ml import Pipeline

In [15]:
# Logistic regression on the assembled 'features' vector, with 'Survived' as the label.
log_reg = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

In [16]:
# Chain the preprocessing steps and the classifier; stages run in list order,
# so each encoder follows its indexer and the assembler follows both encoders.
pipeline_stages = [
    gender_indexer,
    gender_encoder,
    embark_indexer,
    embark_encoder,
    assembler,
    log_reg,
]
pipline = Pipeline(stages=pipeline_stages)

In [17]:
# 70/30 train/test split. A fixed seed makes the split, and therefore the
# fitted model and the reported metric, reproducible across runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Fit every pipeline stage (indexers, encoders, assembler, logistic regression)
# on the training split only.
fit_model = pipline.fit(train_data)

In [19]:
# Apply the fitted pipeline to the held-out split; the output keeps the input
# columns and adds the intermediate and prediction columns.
result = fit_model.transform(test_data)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

In [22]:
result

DataFrame[Survived: int, Pclass: int, Sex: string, Age: double, SibSp: int, Parch: int, Fare: double, Embarked: string, sex_index: double, sex_ohe: vector, embarked_index: double, embarked_ohe: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [23]:
auc = my_eval.evaluate(result.select('Survived','prediction'))

In [24]:
# Display the metric value returned by my_eval.evaluate for the test split.
auc

0.7957517470421366