In [1]:
import pandas as pd

# Read the training CSV through the spark-csv reader: the header row supplies
# the column names and the column types are inferred from the data.
training_data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('/FileStore/tables/train.csv')
)

# Discard rows that have no Age or Embarked value, then remove the Cabin column.
training_data = training_data.na.drop(subset=['Age', 'Embarked']).drop('Cabin')

# Three-row sample as a pandas frame.
# NOTE(review): this is not the last expression of the cell, so a notebook
# evaluates it without rendering it.
pd.DataFrame(training_data.take(3), columns=training_data.columns)

# Summary statistics, transposed so that each source column becomes one row.
training_data.describe().toPandas().transpose()



In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["Sex", "Embarked"]
stages = []  # pipeline stages, accumulated here and in the following cells

# Each categorical column contributes two stages: a StringIndexer that writes
# "<column>Index", then a OneHotEncoder that reads that column and writes
# "<column>classVec". Nothing is executed here; the stages are only collected
# and run later as part of the Pipeline.
for column in categoricalColumns:
    stringIndex = StringIndexer(inputCol=column, outputCol=column + "Index")
    encoder = OneHotEncoder(inputCol=stringIndex.getOutputCol(),
                            outputCol=column + "classVec")
    stages.extend([stringIndex, encoder])

  

In [3]:
# Index the "Survived" column into a "label" column and queue the indexer as
# one more pipeline stage.
label_stringIndex = StringIndexer(inputCol="Survived", outputCol="label")
stages.append(label_stringIndex)

In [4]:
numericCols = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
# Assemble the one-hot vectors ("<column>classVec") followed by the numeric
# columns into a single "features" vector column.
# A list comprehension is used instead of map(): on Python 3 map() returns an
# iterator, which cannot be concatenated to a list with "+". On Python 2 the
# resulting list is the same as before.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [5]:
# Remember the original column names before the pipeline adds its own.
cols = training_data.columns
# Create a Pipeline from the stages collected so far.
pipeline = Pipeline(stages=stages)
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
# NOTE(review): training_data is reassigned from itself below, so re-running
# this cell on its own operates on already-transformed data; re-run from the
# load cell instead.
pipelineModel = pipeline.fit(training_data)
training_data = pipelineModel.transform(training_data)

# Keep the label, the assembled features and the original columns; the
# intermediate "...Index" / "...classVec" columns are not selected.
selectedcols = ["label", "features"] + cols
training_data = training_data.select(selectedcols)

# Preview a handful of rows only: collecting the entire DataFrame to the
# driver just to look at it is unnecessary.
training_data.limit(5).toPandas()

In [6]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 70/30 train/test split with a fixed seed.
trainData, testData = training_data.randomSplit([0.7, 0.3], seed=100)

# Baseline logistic regression, capped at 10 iterations, fitted on the
# training split and then applied to the test split.
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
lrModel = lr.fit(trainData)
predictions = lrModel.transform(testData)

# Evaluate the test-set predictions from their "rawPrediction" column.
# evaluator.getMetricName() reports which metric is being computed.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

In [7]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation: 3 x 3 x 3 = 27 combinations.
# The regParam candidates must be distinct; repeating a single value makes
# the search refit identical models without ever varying the regularization
# strength.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

# Create 5-fold CrossValidator around the logistic regression estimator,
# scored with the evaluator defined earlier.
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations on the training split only.
cvModel = cv.fit(trainData)


In [8]:
# Apply the fitted cross-validation model `cvModel` to the held-out test split.
predictions = cvModel.transform(testData)


In [9]:
# Score the current test-set predictions with `evaluator`; as the last
# expression of the cell, the value is shown as the cell output.
evaluator.evaluate(predictions)

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision-tree baseline with its depth limited to 9, fitted on the training
# split.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=9,
)
dtModel = dt.fit(trainData)

In [11]:
# Apply the fitted decision tree `dtModel` to the held-out test split.
predictions = dtModel.transform(testData)

In [12]:
# Evaluate the decision tree's test-set predictions with a default-configured
# binary evaluator. The score is printed explicitly: a bare expression that is
# not the last statement of a cell is evaluated but never displayed.
evaluator = BinaryClassificationEvaluator()
print("%s =  %s" % (evaluator.getMetricName(), evaluator.evaluate(predictions)))
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print("numNodes =  %s" % dtModel.numNodes)
print("depth =  %s" % dtModel.depth)

In [13]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Decision-tree search grid: 4 depths x 3 bin counts = 12 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [1, 2, 6, 10])
    .addGrid(dt.maxBins, [20, 40, 80])
    .build()
)

In [14]:
# 5-fold cross-validation of the decision tree over paramGrid, scored with
# `evaluator` and fitted on the training split.
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel = cv.fit(trainData)


In [15]:
# Apply the fitted cross-validation model `cvModel` to the held-out test split.
predictions = cvModel.transform(testData)

In [16]:
# Score the current test-set predictions with `evaluator`; as the last
# expression of the cell, the value is shown as the cell output.
evaluator.evaluate(predictions)

In [17]:
# Report the size of the best tree selected by cross-validation.
# Single-argument print(...) is used because the print statement is a
# SyntaxError on Python 3; the text produced on Python 2 is unchanged.
print("numNodes =  %s" % cvModel.bestModel.numNodes)
print("depth =  %s" % cvModel.bestModel.depth)

In [18]:
from pyspark.ml.classification import RandomForestClassifier

# Random-forest baseline with library-default hyperparameters, fitted on the
# training split.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
)
rfModel = rf.fit(trainData)

In [19]:
# Apply the fitted random forest `rfModel` to the held-out test split.
predictions = rfModel.transform(testData)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model: rebuild a default-configured binary evaluator and score the
# current test-set predictions; the value is shown as the cell output.
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [21]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Random-forest search grid: 3 depths x 2 bin counts x 2 forest sizes = 12
# candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.maxBins, [20, 60])
    .addGrid(rf.numTrees, [5, 20])
    .build()
)

In [22]:
# 5-fold cross-validation of the random forest over paramGrid, scored with
# `evaluator`.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)

# Fit on the training split. Per the original author's note, this can take
# about 6 minutes because forests of up to 20 trees are trained.
cvModel = cv.fit(trainData)

In [23]:
# Apply the fitted cross-validation model `cvModel` to the held-out test split.
predictions = cvModel.transform(testData)

In [24]:
# Score the current test-set predictions with `evaluator`; as the last
# expression of the cell, the value is shown as the cell output.
evaluator.evaluate(predictions)

In [25]:
# Take the best model held by the most recently fitted CrossValidator model.
bestModel = cvModel.bestModel
# Generate predictions for entire dataset
# NOTE(review): training_data includes the rows in trainData that the model
# was fitted on, so the score below is not a held-out estimate.
finalPredictions = bestModel.transform(training_data)
# Evaluate best model
evaluator.evaluate(finalPredictions)

In [26]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Multilayer perceptron with layer sizes 8 -> 5 -> 4 -> 2.
# NOTE(review): the first entry presumably has to equal the length of the
# assembled "features" vector and the last the number of label classes --
# confirm against the pipeline output before changing the feature columns.
layers = [8, 5, 4, 2]
mp = MultilayerPerceptronClassifier(
    labelCol="label",
    featuresCol="features",
    maxIter=200,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Fit the network on the training split.
mpModel = mp.fit(trainData)

In [27]:
# Apply the fitted multilayer perceptron `mpModel` to the held-out test split.
predictions = mpModel.transform(testData)

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate model with a default-configured multiclass evaluator.
# NOTE(review): this rebinds `evaluator` to a different evaluator class than
# the BinaryClassificationEvaluator used for the earlier models, so the score
# is presumably on a different metric and not directly comparable -- confirm.
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions)