### Using PySpark to build a logistic regression model that predicts whether a Titanic passenger survived

By: Matt Purvis

In [0]:
# Import SparkSession
from pyspark.sql import SparkSession

In [0]:
# Build the SparkSession for this notebook, or reuse the one that is already running
spark = (
    SparkSession.builder
    .appName('Titanic')
    .getOrCreate()
)

In [0]:
# Load every column of the Titanic passenger data with Spark SQL.
# NOTE(review): assumes a table or view named `titanic_csv` is already registered in
# this Spark session's catalog — confirm how and where it gets created.
df = spark.sql('select * from titanic_csv')

In [0]:
# Print the schema to check each column's name and data type before modelling
df.printSchema()

In [0]:
# Preview the first rows of the dataset (20 rows, long values truncated — the defaults, spelled out)
df.show(n=20, truncate=True)

In [0]:
# List the column names so the ones to keep can be picked out in the next step
df.columns

In [0]:
# Keep only the label (Survived) and the columns that feed the model's feature vector
my_cols = df.select(
    'Survived',
    'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch',
    'Fare', 'Embarked',
)

In [0]:
# Handle missing data the simple way for now: discard every row that has a null in
# any of the selected columns. Other strategies (e.g. imputation) could replace this later.
my_final_data = my_cols.dropna()

In [0]:
# Import classes we will need to encode categorical variables and transform features
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [0]:
# Two-step encoding of the categorical `Sex` column, used as pipeline stages later:
#   1. StringIndexer: string category -> numeric index   (Sex -> SexIndex)
#   2. OneHotEncoder: numeric index   -> one-hot vector  (SexIndex -> SexVec)
gender_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
gender_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

In [0]:
# Two-step encoding of the categorical `Embarked` column, used as pipeline stages later:
#   1. StringIndexer: string category -> numeric index   (Embarked -> EmbarkIndex)
#   2. OneHotEncoder: numeric index   -> one-hot vector  (EmbarkIndex -> EmbarkVec)
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkIndex',
)
embark_encoder = OneHotEncoder(
    inputCol='EmbarkIndex',
    outputCol='EmbarkVec',
)

In [0]:
# Pipeline stage that packs all predictors — the numeric columns plus the two one-hot
# vectors — into the single `features` vector column that the classifier reads
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'SexVec',
        'EmbarkVec',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
    ],
    outputCol='features',
)

In [0]:
# import LogisticRegression class
from pyspark.ml.classification import LogisticRegression

In [0]:
# Import pipeline class
from pyspark.ml import Pipeline

In [0]:
# Logistic regression estimator (not fitted yet): reads the assembled `features`
# vector and is trained against the `Survived` label
log_reg_titanic = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [0]:
# Chain every step into one pipeline so identical preprocessing runs at fit and transform time:
# index the categoricals -> one-hot encode them -> assemble the feature vector -> logistic regression
pipeline = Pipeline(
    stages=[
        gender_indexer,
        embark_indexer,
        gender_encoder,
        embark_encoder,
        assembler,
        log_reg_titanic,
    ]
)

In [0]:
# Split the cleaned data roughly 70% / 30% into train and test sets.
# A fixed seed keeps the split — and therefore the fitted model and the reported
# AUC — repeatable when the notebook is re-run on the same input data.
# This could be taken further by carving out a separate validation set.
train_data, test_data = my_final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Fit the whole pipeline (indexers, encoders, assembler and logistic regression) on the training data only
fit_model = pipeline.fit(train_data)

In [0]:
# Run the fitted pipeline on the held-out test data: this applies the same transformations
# that were fitted on the training data and adds the model's output columns (e.g. `prediction`)
results = fit_model.transform(test_data)

In [0]:
# Import BinaryClassificationEvaluator to get the ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Create the evaluator that scores the model by area under the ROC curve.
# It must read `rawPrediction` (the model's continuous scores), not `prediction`:
# the hard 0/1 labels collapse the ROC curve to a single operating point, so the
# resulting AUC would misstate how well the model ranks passengers.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
    metricName='areaUnderROC',
)

In [0]:
# Put the true label next to the model's predicted label for the first rows of the scored test set
results.select(['Survived', 'prediction']).show(n=20, truncate=True)

In [0]:
# Score the transformed test set with the evaluator; the result is the area under the ROC curve (AUC)
AUC = my_eval.evaluate(results)

In [0]:
# Display the AUC value as this cell's output
AUC

A perfect classifier would score an AUC (area under the ROC curve) of 1.0, while random guessing scores about 0.5. This model is not terrible, but it could be better!