In [1]:
import findspark

In [2]:
findspark.init('/home/shashank/spark-2.3.2-bin-hadoop2.7')

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('logR').getOrCreate()

In [6]:
from pyspark.ml.classification import LogisticRegression

In [7]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [9]:
data = spark.read.csv('titanic.csv', inferSchema=True, header=True)

In [10]:
data.createOrReplaceTempView('data')

In [11]:
spark.sql("FROM data SELECT *").show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [12]:
data.printSchema()
# string variables: Sex, Name, Ticket, Cabin, Embarked

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [13]:
spark.sql("FROM data SELECT Sex, COUNT(Name) GROUP BY Sex").show() #convert to numeric

+------+-----------+
|   Sex|count(Name)|
+------+-----------+
|female|        314|
|  male|        577|
+------+-----------+



In [14]:
spark.sql("FROM data SELECT Ticket, COUNT(Name) GROUP BY Ticket").show() #not worth converting

+----------------+-----------+
|          Ticket|count(Name)|
+----------------+-----------+
|          367230|          2|
|       P/PP 3381|          2|
|          244270|          1|
|          363291|          3|
|SOTON/OQ 3101317|          1|
|           31418|          1|
|           26360|          2|
|            2700|          1|
|           14313|          1|
|          345763|          1|
|       A/5 21172|          1|
|          364500|          1|
|           19877|          2|
|          350029|          1|
|          113800|          1|
|      A/4. 39886|          1|
|       C.A. 2673|          2|
|          113807|          1|
|        PC 17604|          2|
|       C.A. 2315|          2|
+----------------+-----------+
only showing top 20 rows



In [15]:
spark.sql("FROM data SELECT Embarked, COUNT(Name) GROUP BY Embarked").show() #convert to numeric # 2 nulls - drop

+--------+-----------+
|Embarked|count(Name)|
+--------+-----------+
|       Q|         77|
|    null|          2|
|       C|        168|
|       S|        644|
+--------+-----------+



In [16]:
spark.sql("FROM data SELECT Cabin, COUNT(Name) GROUP BY Cabin").show() #not worth converting

+-------+-----------+
|  Cabin|count(Name)|
+-------+-----------+
|    A23|          1|
|    B79|          1|
|    E44|          2|
|  F E69|          1|
|    D28|          1|
|    C78|          2|
|    C95|          1|
|  F G73|          2|
|B58 B60|          2|
|     D7|          1|
|   C128|          1|
|    B39|          1|
|    B22|          2|
|   C110|          1|
|    D21|          1|
|     F2|          3|
|    B30|          1|
|   C104|          1|
|    B50|          1|
|     A6|          1|
+-------+-----------+
only showing top 20 rows



In [17]:
spark.sql("FROM data SELECT Pclass, COUNT(Name) GROUP BY Pclass").show()

+------+-----------+
|Pclass|count(Name)|
+------+-----------+
|     1|        216|
|     3|        491|
|     2|        184|
+------+-----------+



In [18]:
data.columns
# Features: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
#Label: Survived

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [19]:
cols = data.select(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

In [20]:
#drop na
final_data = cols.na.drop()

In [21]:
#Index String
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer

In [22]:
#create gender and embarked indexing objects for pipeline
gender_indexer = StringIndexer(inputCol='Sex', outputCol='Sex_index')
embarked_indexer = StringIndexer(inputCol='Embarked', outputCol='Embarked_index')

In [23]:
##create gender and embarked encoding objects for pipeline
#convert to binaries
gender_encoder = OneHotEncoder(inputCol='Sex_index', outputCol='SexVec')
embarked_encoder = OneHotEncoder(inputCol='Embarked_index', outputCol='EmbarkedVec')

In [24]:
#create assemble object for pipeline
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec'], \
                            outputCol='features')

What a pipieline does is it sets up a series of tasks as stages, especially when you have complicated data steps. <br>
If you have complex machine learning tasks, you will have to set up pipelines.

In [25]:
#Pipeline
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[gender_indexer,embarked_indexer, \
                           gender_encoder,embarked_encoder, \
                           assembler,log_reg_titanic])

In [26]:
train, test = final_data.randomSplit([0.7,0.3])

In [27]:
fit_model = pipeline.fit(train)

In [28]:
results = fit_model.transform(test)

In [29]:
results.show()

+--------+------+------+----+-----+-----+--------+--------+---------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|Sex_index|Embarked_index|       SexVec|  EmbarkedVec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+--------+--------+---------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|  male|19.0|    3|    2|   263.0|       S|      0.0|           0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,19.0,3.0...|[-0.1793953450574...|[0.455271057582,0...|       1.0|
|       0|     1|  male|22.0|    0|    0|135.6333|       C|      0.0|           1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,22.0,0.0...|[-0.7627622857367...|[0.31804684474246...|       1.0|
|       0|     1|  male|24.0|    0|    1|247.5208|       C|      

In [30]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [31]:
#create evaluation instance
eval_results = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [32]:
eval_results.evaluate(results)

0.7264621033307106

Great Example: https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html