In [1]:
from pyspark.sql import SparkSession

In [2]:
# Configure a session builder with the application name 'logReg', then obtain
# the session from it via getOrCreate().
session_builder = SparkSession.builder.appName('logReg')
spark = session_builder.getOrCreate()

In [3]:
from pyspark.ml.classification import LogisticRegression

In [4]:
import os

In [5]:
# Path to the LIBSVM sample file, relative to the current working directory.
libsvm_path_parts = (os.curdir, 'data', 'sample_libsvm_data.txt')
data_file = os.path.join(*libsvm_path_parts)

In [8]:
# Read the LIBSVM-formatted sample file into a DataFrame.
data = spark.read.load(data_file, format='libsvm')

In [9]:
# Print the schema of the loaded DataFrame (a `label` and a `features` column).
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# Logistic regression estimator built with library defaults (no column names
# or hyperparameters overridden).
lgr = LogisticRegression()

In [11]:
# Fit on the full dataset; no rows are held out at this stage.
logreg_model = lgr.fit(data)

In [12]:
# Summary object exposed by the fitted model; its `.predictions` DataFrame is
# inspected in the following cells.
lgr_summary = logreg_model.summary

In [13]:
# Schema of the summary's predictions DataFrame: the input columns plus
# `rawPrediction`, `probability` and `prediction`.
lgr_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [14]:
# Preview the per-row predictions next to the true labels.
lgr_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[19.8534775947478...|[0.99999999761359...|       0.0|
|  1.0|(692,[158,159,160...|[-20.377398194908...|[1.41321555111056...|       1.0|
|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865126979...|       1.0|
|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170303...|       1.0|
|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200604...|       1.0|
|  0.0|(692,[129,130,131...|[19.8506078990277...|[0.99999999760673...|       0.0|
|  1.0|(692,[158,159,160...|[-20.337256674833...|[1.47109814695581...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.595579753418...|[3.08850168102631...|       1.0|
|  0.0|(692,[154,155,156...|[19.2708803215613...|[0.99999999572670...|       0.0|
|  0.0|(692,[127

In [15]:
# Hold out roughly 25% of the rows for evaluation. The weights are sampling
# proportions, so the resulting sizes are approximate. A fixed seed keeps the
# split, and every metric computed from it, repeatable across kernel restarts
# for the same input data.
training_data, testing_data = data.randomSplit([0.75, 0.25], seed=42)

In [16]:
# Fresh, unfitted estimator for the train/test experiment.
# NOTE: despite its name, `final_model` is the estimator; the fitted model is
# bound to `fit_final` in the next cell.
final_model = LogisticRegression()

In [17]:
# Train on the training split only, leaving `testing_data` unseen.
fit_final = final_model.fit(training_data)

In [18]:
# Evaluate the fitted model on the held-out split. The returned object exposes
# a `.predictions` DataFrame (used in the following cells), not just labels as
# the variable name might suggest.
predictions_labels = fit_final.evaluate(testing_data)

In [21]:
# Preview the predictions made for the held-out rows.
predictions_labels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[122,123,124...|[16.8271508546675...|[0.99999995078911...|       0.0|
|  0.0|(692,[123,124,125...|[21.3462640255793...|[0.99999999946366...|       0.0|
|  0.0|(692,[124,125,126...|[22.8077895070705...|[0.99999999987563...|       0.0|
|  0.0|(692,[126,127,128...|[32.9015723202867...|[0.99999999999999...|       0.0|
|  0.0|(692,[126,127,128...|[16.1392027962179...|[0.99999990208860...|       0.0|
|  0.0|(692,[127,128,129...|[22.1647752653192...|[0.99999999976342...|       0.0|
|  0.0|(692,[150,151,152...|[24.6271202198179...|[0.99999999997983...|       0.0|
|  0.0|(692,[153,154,155...|[27.2317158267952...|[0.99999999999850...|       0.0|
|  0.0|(692,[154,155,156...|[10.6382487871988...|[0.99997601957697...|       0.0|
|  0.0|(692,[181

In [22]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                  MulticlassClassificationEvaluator)

In [23]:
# Evaluator constructed with library defaults.
# NOTE(review): the defaults presumably read the `rawPrediction` and `label`
# columns and report area under the ROC curve - confirm against the pyspark docs.
my_eval = BinaryClassificationEvaluator()

In [24]:
# Score the held-out predictions with the evaluator; the metric value is kept
# in `my_final_roc`.
my_final_roc = my_eval.evaluate(predictions_labels.predictions)

In [25]:
# A value of 1.0 means every held-out row was predicted correctly, i.e. the two
# classes in this sample dataset are highly separable.
my_final_roc

1.0

In [27]:
# Path to the Titanic CSV, relative to the current working directory.
titanic_path_parts = (os.curdir, 'data', 'titanic.csv')
titanic_data_file = os.path.join(*titanic_path_parts)

In [30]:
# Load the Titanic CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
titanic_reader = spark.read.options(header=True, inferSchema=True)
df = titanic_reader.csv(titanic_data_file)

In [31]:
# Print the column names and types of the Titanic DataFrame.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [32]:
# List the available column names before choosing which ones to keep.
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [33]:
# Keep the label (`Survived`) and the columns used as model inputs;
# PassengerId, Name, Ticket and Cabin are left out.
titanic_model_columns = [
    'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
]
my_cols = df.select(titanic_model_columns)

In [34]:
# Drop rows with missing values in the selected columns.
my_final_data = my_cols.dropna()

In [35]:
from pyspark.ml.feature import (
    VectorAssembler, VectorIndexer,
    OneHotEncoder, StringIndexer
)

In [37]:
# Two-step categorical encoding of `Sex`:
#   1. StringIndexer assigns each distinct string a numeric index,
#      e.g. categories A, B, C -> 0, 1, 2, written to `SexIndex`.
#   2. OneHotEncoder turns that index into an indicator vector in `SexVec`;
#      conceptually, category A becomes [1, 0, 0] (A=1, B=0, C=0).
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [38]:
# Same index-then-encode treatment for the `Embarked` column:
# string -> `EmbarkIndex` (numeric index) -> `EmbarkVec` (indicator vector).
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

In [39]:
# Columns combined, in this order, into the single `features` vector column.
# The categorical inputs enter through their encoded `*Vec` columns.
assembler_input_columns = [
    'Pclass', 'SexVec', 'EmbarkVec',
    'Age', 'SibSp', 'Parch', 'Fare',
]
assembler = VectorAssembler(inputCols=assembler_input_columns,
                            outputCol='features')

In [40]:
from pyspark.ml import Pipeline

In [41]:
# Classifier for the Titanic data: reads the assembled `features` vector and
# learns to predict the `Survived` label.
lgr_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [43]:
# Stage order matters: index the string columns, one-hot encode the indices,
# assemble the feature vector, then fit the classifier.
titanic_stages = [
    gender_indexer, embark_indexer,
    gender_encoder, embark_encoder,
    assembler,
    lgr_titanic,
]
pipeline = Pipeline(stages=titanic_stages)

In [44]:
# Approximate 70/30 train/test split of the cleaned Titanic rows. A fixed seed
# keeps the split, and the metrics derived from it, repeatable across kernel
# restarts for the same input data.
# NOTE: this rebinds `training_data`, which earlier held the libsvm training
# split; cells of that experiment must not be re-run after this point without
# re-creating their own split first.
training_data, test_data = my_final_data.randomSplit(
    [0.7, 0.3],
    seed=42
)

In [45]:
# Fit every pipeline stage (indexers, encoders, assembler, classifier) on the
# training split only.
fit_model = pipeline.fit(training_data)

In [56]:
# Apply the fitted pipeline to the held-out rows; the output includes the
# `prediction` column selected in the next cell.
result = fit_model.transform(test_data)

In [57]:
# Show the true label next to the predicted class for the first rows.
result.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [58]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [61]:
# Evaluator for the Titanic predictions, comparing against the `Survived` label.
# It ranks on the model's continuous `rawPrediction` scores rather than the
# thresholded 0/1 `prediction` column: hard class labels collapse the ROC curve
# to a single operating point, so the reported area would not reflect how well
# the model actually ranks passengers.
# NOTE: this rebinds `my_eval`, which earlier held the evaluator used for the
# libsvm experiment.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived'
)
