In [1]:
# Both CSV files are read with identical reader settings; only the path differs.
def _read_csv_table(path):
    """Load the headered CSV at `path` into a DataFrame, letting Spark infer column types."""
    return sqlContext.read.load(path,
                                format='com.databricks.spark.csv',
                                header='true',
                                inferSchema='true')

train_df = _read_csv_table('/FileStore/tables/train.csv')
test_df = _read_csv_table('/FileStore/tables/test.csv')

In [2]:
## Add Survived column to test
from pyspark.sql.functions import lit, col
# Tag every row with its origin so the two sets can be told apart after merging.
train_df = train_df.withColumn('Mark', lit('train'))
# The test frame gets a constant Survived=0 column, then is reordered to the
# training column layout so both frames share the same columns for the union.
test_df = test_df.withColumn('Survived', lit(0))
test_df = test_df.withColumn('Mark', lit('test'))
test_df = test_df.select(train_df.columns)
## Append Test data to Train data
df = train_df.unionAll(test_df)

In [3]:
###One-Hot Encoding
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["Sex", "Embarked"]
stages = []  # ordered Pipeline stages; later cells keep appending to this list
for cat_col in categoricalColumns:
    index_col = cat_col + "Index"
    # StringIndexer turns the category strings into numeric indices ...
    indexer = StringIndexer(inputCol=cat_col, outputCol=index_col)
    # ... and OneHotEncoder expands those indices into a binary vector column.
    one_hot = OneHotEncoder(inputCol=index_col, outputCol=cat_col + "classVec")
    # Nothing runs here; the stages execute when the Pipeline is fitted.
    stages.extend([indexer, one_hot])

In [4]:
# Index the "Survived" column into a numeric "label" column for the classifiers.
label_stringIdx = StringIndexer(inputCol="Survived", outputCol="label")
stages.append(label_stringIdx)

In [5]:
# Assemble every model input into a single "features" vector column.
numericCols = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
# A list comprehension yields a real list on both Python 2 and 3; `map(...)`
# returns an iterator on Python 3 and cannot be concatenated with `+`.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [6]:
# Remember the original column names before the pipeline adds derived ones.
cols = df.columns
# Chain all indexing / encoding / assembling stages into one Pipeline.
pipeline = Pipeline(stages=stages)
# fit() learns whatever each stage needs from df (e.g. category indices);
# transform() then applies the fitted stages to produce the new columns.
pipelineModel = pipeline.fit(df)
dataset = pipelineModel.transform(df)

# Keep the model inputs plus the original columns; the intermediate
# *Index / *classVec columns are dropped by this select.
selectedcols = ["label", "features"] + cols
dataset = dataset.select(selectedcols)
# Preview a handful of rows only: toPandas() on the full DataFrame would pull
# every row onto the driver just for display.
dataset.limit(10).toPandas()

In [7]:
# Only rows that came from train.csv carry real Survived values. Rows tagged
# Mark == 'test' were given a constant Survived=0 filler when the two files
# were merged, so fitting or scoring on them would use fabricated labels.
labeledData = dataset.filter(dataset["Mark"] == "train")

# Randomly split the labeled rows 70/30; the seed is fixed for reproducibility.
(trainingData, testData) = labeledData.randomSplit([0.7, 0.3], seed=100)
# Call-form print works on both Python 2 and 3 and matches the final cell.
print(trainingData.count())
print(testData.count())

In [8]:
from pyspark.ml.classification import LogisticRegression

# Configure a logistic-regression estimator capped at 10 iterations.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=10
)

# Fit the estimator on the training split to obtain the model.
lrModel = lr.fit(trainingData)

In [9]:
# Score the held-out split with the fitted logistic-regression model.
# LogisticRegression.transform() will only use the 'features' column.
# NOTE: `predictions` is reassigned by each later model's predict cell, so the
# evaluation cell for this model has to run before those do.
predictions = lrModel.transform(testData)

In [10]:
# Narrow the scored frame to the label, the model outputs and the original
# input columns for inspection. `selected` is only assigned here; nothing is
# displayed by this cell.
selected = predictions.select(
    "label", "prediction", "probability",
    "Age", "Fare", "Pclass", "Sex", "SibSp", "Parch", "Embarked"
)

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the logistic-regression predictions. No metricName is passed, so the
# evaluator's default metric applies (areaUnderROC per the PySpark docs).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
lr_value = evaluator.evaluate(predictions)
# Call-form print works on both Python 2 and 3 and matches the final cell.
print(lr_value)

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure a decision-tree estimator limited to depth 3.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=3
)

# Fit the estimator on the training split to obtain the model.
dtModel = dt.fit(trainingData)

In [13]:
# Score the held-out split with the fitted decision-tree model.
# NOTE: this overwrites the `predictions` frame left by the previous model.
predictions = dtModel.transform(testData)

In [14]:
# Narrow the decision-tree output to the label, the model outputs and the
# original input columns for inspection. `selected` is only assigned here;
# nothing is displayed by this cell.
selected = predictions.select(
    "label", "prediction", "probability",
    "Age", "Fare", "Pclass", "Sex", "SibSp", "Parch", "Embarked"
)

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the decision-tree predictions with the evaluator's default settings.
evaluator = BinaryClassificationEvaluator()
dt_value = evaluator.evaluate(predictions)
# Call-form print works on both Python 2 and 3 and matches the final cell.
print(dt_value)

In [16]:
from pyspark.ml.classification import RandomForestClassifier

# Configure a random-forest estimator; all other settings keep their defaults.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features"
)

# Fit the estimator on the training split to obtain the model.
rfModel = rf.fit(trainingData)

In [17]:
# Score the held-out split with the fitted random-forest model.
# NOTE: this overwrites the `predictions` frame left by the previous model.
predictions = rfModel.transform(testData)

In [18]:
# Narrow the random-forest output to the label, the model outputs and the
# original input columns for inspection. `selected` is only assigned here;
# nothing is displayed by this cell.
selected = predictions.select(
    "label", "prediction", "probability",
    "Age", "Fare", "Pclass", "Sex", "SibSp", "Parch", "Embarked"
)

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the random-forest predictions with the evaluator's default settings.
evaluator = BinaryClassificationEvaluator()
rf_value = evaluator.evaluate(predictions)
# Call-form print works on both Python 2 and 3 and matches the final cell.
print(rf_value)

In [20]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Create an initial multilayer-perceptron model.
# The first layer size has to equal the length of the "features" vector and the
# last has to equal the number of label classes. The previous hard-coded
# [4, 5, 4, 3] matched neither: the assembler alone takes seven input columns,
# and the label is scored as binary (BinaryClassificationEvaluator) -> 2 classes.
num_features = len(trainingData.select("features").first()["features"])
layers = [num_features, 5, 4, 2]
mp = MultilayerPerceptronClassifier(labelCol="label", featuresCol="features", maxIter=100, layers=layers, blockSize=128, seed=1234)

# Train model with Training Data
mpModel = mp.fit(trainingData)

In [21]:
# Score the held-out split with the fitted multilayer-perceptron model.
# (This previously called rfModel.transform(), so the "Multilayer Perceptron"
# metric reported below was really the random forest's.)
predictions = mpModel.transform(testData)

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
# NOTE(review): depending on the Spark release, the MLP model may emit only a
# `prediction` column (no `rawPrediction`) -- confirm for the cluster's version.
# Fall back to the 0/1 prediction in that case so the cell can still score the
# frame it is given; when `rawPrediction` exists this equals the default.
score_col = "rawPrediction" if "rawPrediction" in predictions.columns else "prediction"
evaluator = BinaryClassificationEvaluator(rawPredictionCol=score_col)
mp_value = evaluator.evaluate(predictions)
# Call-form print works on both Python 2 and 3 and matches the final cell.
print(mp_value)

In [23]:
# Comparing models: print each classifier's evaluator score side by side.
# The first label previously read "Linear Regresion"; the model fitted in this
# notebook is LogisticRegression.
print("Logistic Regression: {}".format(lr_value))
print("Decision Tree: {}".format(dt_value))
print("Random Forest: {}".format(rf_value))
print("Multilayer Perceptron: {}".format(mp_value))