# Logistic Regression Code
This is a code-along using the famous Titanic dataset. It's always nice to start with this dataset because it is an example you will find in pretty much every data analysis language.

In [1]:
from pyspark.sql import *
from pyspark.sql import SQLContext, Row
# Wrap the SparkContext `sc` in a SQLContext; it is used below to read the CSV.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably it is
# injected by the PySpark shell/kernel; confirm before running on a fresh kernel.
sqlContext = SQLContext(sc)

In [2]:
# Load the Titanic CSV: first row is the header, column types are inferred.
data = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header='true', inferSchema='true')
    .load("file:/titanic.csv")
)


In [3]:
# Print the column names and inferred types to sanity-check the load.
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
# List the column names available for selection.
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [15]:
# Keep the label and every passenger attribute; only PassengerId is left out.
my_cols = data.select([
    'Survived', 'Pclass', 'Name', 'Sex',
    'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked',
])

In [16]:
from pyspark.sql.types import DoubleType

# Only require non-null values in the label and in the columns that are later
# indexed/assembled into the feature vector. A bare na.drop() would also throw
# away rows whose Name, Ticket or Cabin is missing, even though the model never
# reads those columns.
model_input_cols = ['Survived', 'Pclass', 'Sex', 'Age',
                    'SibSp', 'Parch', 'Fare', 'Embarked']
my_final_data_drop = my_cols.na.drop(subset=model_input_cols)

# Cast the integer label to double (the schema printed below confirms the change).
my_final_data = my_final_data_drop.withColumn(
    "Survived", my_final_data_drop.Survived.cast(DoubleType()))
my_final_data.printSchema()

root
 |-- Survived: double (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Working with Categorical Columns

Let's break this down into multiple steps to make it all clear.

In [17]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [18]:
# Sex: map each string category to a numeric index, then one-hot encode that index.
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

In [19]:
# Embarked: same two-step treatment — string -> index -> one-hot vector.
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkIndex',
)
embark_encoder = OneHotEncoder(
    inputCol='EmbarkIndex',
    outputCol='EmbarkVec',
)

In [20]:
# Combine the numeric columns and the encoded Sex/Embarked vectors into a
# single 'features' vector column.
assembler = VectorAssembler(
    inputCols=[
        'Pclass', 'SexVec', 'Age', 'SibSp',
        'Parch', 'Fare', 'EmbarkVec',
    ],
    outputCol='features',
)

In [21]:
from pyspark.ml.classification import LogisticRegression

In [22]:
from pyspark.ml import Pipeline

## Pipelines 

Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)

In [23]:
# Classifier that reads the assembled 'features' column and uses 'Survived' as the label.
log_reg_titanic = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

In [24]:
# Stages run in list order: both indexers, then both encoders (which consume the
# index columns), then the assembler, and finally the classifier.
pipeline = Pipeline(stages=[
    gender_indexer,
    embark_indexer,
    gender_encoder,
    embark_encoder,
    assembler,
    log_reg_titanic,
])

In [25]:
# Roughly 70/30 train/test split. The seed is fixed so that re-running the
# notebook yields the same split; without it every run trains and evaluates
# on different rows and the results below cannot be reproduced.
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.7, 0.3], seed=42)
train_titanic_data.show()

+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|    Fare|      Cabin|Embarked|
+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|     0.0|     1|Baxter, Mr. Quigg...|  male|24.0|    0|    1|   PC 17558|247.5208|    B58 B60|       C|
|     0.0|     1|Blackwell, Mr. St...|  male|45.0|    0|    0|     113784|    35.5|          T|       S|
|     0.0|     1|Carrau, Mr. Franc...|  male|28.0|    0|    0|     113059|    47.1|           |       S|
|     0.0|     1|Chaffee, Mr. Herb...|  male|46.0|    1|    0|W.E.P. 5734|  61.175|        E31|       S|
|     0.0|     1|   Fortune, Mr. Mark|  male|64.0|    1|    4|      19950|   263.0|C23 C25 C27|       S|
|     0.0|     1|Futrelle, Mr. Jac...|  male|37.0|    1|    0|     113803|    53.1|       C123|       S|
|     0.0|     1|  Giglio, Mr. Victor|  male|24.0|    0

In [26]:
# Fit all pipeline stages (indexers, encoders, assembler, classifier) on the training split.
fit_model = pipeline.fit(train_titanic_data)

In [27]:
# Run the fitted pipeline on the held-out test split; the result includes a 'prediction' column.
results = fit_model.transform(test_titanic_data)



In [28]:
# Show the true label next to the predicted label (show() prints only the first 20 rows).
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|     0.0|       1.0|
|     0.0|       1.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
+--------+----------+
only showing top 20 rows



In [29]:
# NOTE(review): this repeats the In [28] cell verbatim and adds no new information;
# consider replacing it with an aggregate metric computed over `results`.
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|     0.0|       1.0|
|     0.0|       1.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
|     0.0|       0.0|
+--------+----------+
only showing top 20 rows



## Great Job!