In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.3.2-bin-hadoop2.7')
import pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('logr_example').getOrCreate()

In [2]:
from pyspark.ml.classification import LogisticRegression

In [7]:
data = spark.read.csv('../Python-and-Spark-for-Big-Data/Spark_for_Machine_Learning/Logistic_Regression/titanic.csv', 
                      inferSchema=True, header=True)

In [8]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [9]:
cols = data.select([
 'Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [10]:
final_data = cols.na.drop()

In [11]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [19]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [20]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')

In [21]:
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'EmbarkedVec', 'Age', 'SibSp', 'Parch', 'Fare'], 
                            outputCol='features')

In [22]:
from pyspark.ml import Pipeline

In [23]:
logr_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [24]:
pipeline = Pipeline(stages=[
        gender_indexer, embark_indexer,
        gender_encoder, embark_encoder,
        assembler, logr_titanic
    ])

In [25]:
train, test = final_data.randomSplit([.7, .3])

In [26]:
fitted_model = pipeline.fit(train)

In [27]:
results = fitted_model.transform(test)

In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [29]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [30]:
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [31]:
area_under_curve = evaluator.evaluate(results)

In [32]:
area_under_curve

0.7702324133637685