In [None]:
from pyspark import SparkContext
from pyspark import SQLContext
import os

# Request the Databricks spark-csv package through PYSPARK_SUBMIT_ARGS. The
# variable is set before the SparkContext below is created (presumably so the
# package is available when Spark starts). The package provides the
# 'com.databricks.spark.csv' data source, which can infer the schema.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.4.0 pyspark-shell'
# Spark entry point for this notebook.
sc = SparkContext(appName="CERN Spark ML tutorial")
# SQL/DataFrame entry point built on top of the SparkContext.
sqlContext = SQLContext(sc)

# Feature Engineering

In this part of the tutorial we will discuss the process of feature selection, normalization, and engineering using Apache Spark. In contrast to the previous parts, we will **not** be using a Decision Tree, or an ensemble of decision trees (like a Random Forest), since these require little to no data preparation (like normalization) because of the model intrinsics.

However, other models, such as a Neural Network, profit from feature normalization since it will (most of the time) reduce the training time, and will keep the hidden neurons from saturating (thus preventing numerical errors).

Of course, like before, we will apply the same basic steps:
1. Loading the dataset.
2. Vectorizing the features.
3. Transforming the string labels ("s" and "b") to indices.

In [None]:
# 1. Read the CSV file into a DataFrame. The first line is treated as the
#    header and the column types are inferred from the data.
dataset = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load("training.csv")
)
# Hold on to the untouched DataFrame for later use.
original_dataset = dataset

In [None]:
from pyspark.ml.feature import VectorAssembler

# 2. Assemble every column except the event id, weight and label columns
#    into a single vector column called "features".
features = dataset.columns
for excluded_column in ('EventId', 'Weight', 'Label'):
    # list.remove raises ValueError if the column is missing.
    features.remove(excluded_column)

assembler = VectorAssembler(inputCols=features, outputCol="features")
dataset = assembler.transform(dataset)

In [None]:
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.sql.functions import rowNumber

# 3. Fit a StringIndexer on the string "Label" column, then use the fitted
#    model to add the corresponding index column "label".
labelIndexer = StringIndexer(
    inputCol="Label",
    outputCol="label",
).fit(dataset)
dataset = labelIndexer.transform(dataset)

Split the dataset into a training and a test set for future model evaluation.

In [None]:
# 60% training / 40% test. The seed is fixed so that the split, and therefore
# the evaluation on the test set, does not change from run to run.
(trainingSet, testSet) = dataset.randomSplit([0.6, 0.4], seed=1234)

Now that we have transformed the original dataset into a format Apache Spark can understand, we will train a basic model which will act as a baseline. The metric we will apply to evaluate this model is the F1 score (https://en.wikipedia.org/wiki/F1_score). We would like to note that this metric is not the ideal metric for this dataset because of the high noise in the dataset. However, other metrics like AUC (Area Under ROC curve: https://en.wikipedia.org/wiki/Receiver_operating_characteristic) are also not very suitable, as can be derived from the documentation of the winning models.

**note**: Due to time limitations we will not be able to train a decent model. However, we will try to show that feature normalization and engineering can indeed improve the performance of your model. Please feel free to modify the model and training parameters.

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define the structure of the Neural Network: one input per feature, a single
# hidden layer of 10 neurons and one output per class.
numFeatures = len(features)
numClasses = 2  # s, b
# It is possible to supply multiple hidden layers, e.g.
# [numFeatures, 10, 10, 10, numClasses].
layers = [numFeatures, 10, numClasses]

# Define the learning algorithm (estimator).
# The seed is written as a plain integer literal: the Python 2-only long
# suffix ("L") is a SyntaxError on Python 3.
trainer = MultilayerPerceptronClassifier(maxIter=5, blockSize=100000, layers=layers, seed=1234)
# Train the Neural Network on the training set.
nnModel = trainer.fit(trainingSet)
# Add the model's predictions to the held-out test set.
nnResult = nnModel.transform(testSet)

# Evaluate the model performance using the F1 metric.
predictionAndLabels = nnResult.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1:" + str(evaluator.evaluate(predictionAndLabels)))

### Feature normalization



In [None]:
from pyspark.ml.feature import StandardScaler

# Rescale every feature to zero mean and unit standard deviation; the result
# is written to the "features_normalized" column.
standardScaler = StandardScaler(
    inputCol="features",
    outputCol="features_normalized",
    withStd=True,
    withMean=True,
)
# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# data is split into a training and a test set, so test rows contribute to
# the mean/std used for scaling.
standardScalerModel = standardScaler.fit(dataset)

normalized_dataset = standardScalerModel.transform(dataset)

Again, split the dataset into a training and a test set.

In [None]:
# 60% training / 40% test. The seed is fixed so that the split, and therefore
# the evaluation on the test set, does not change from run to run.
(trainingSet, testSet) = normalized_dataset.randomSplit([0.6, 0.4], seed=1234)

Let us check if the normalized features improve our model performance! Note that our feature column has now been renamed to **features_normalized**.

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define the structure of the Neural Network: one input per feature, a single
# hidden layer of 10 neurons and one output per class.
numFeatures = len(features)
numClasses = 2  # s, b
layers = [numFeatures, 10, numClasses]

# Define the learning algorithm (estimator); it reads its input from the
# "features_normalized" column.
# The seed is written as a plain integer literal: the Python 2-only long
# suffix ("L") is a SyntaxError on Python 3.
trainer = MultilayerPerceptronClassifier(
    maxIter=5,
    blockSize=100000,
    layers=layers,
    seed=1234,
    featuresCol="features_normalized",
)
# Train the Neural Network on the training set.
nnModel = trainer.fit(trainingSet)
# Add the model's predictions to the held-out test set.
nnResult = nnModel.transform(testSet)

# Evaluate the model performance using the F1 metric.
predictionAndLabels = nnResult.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1:" + str(evaluator.evaluate(predictionAndLabels)))

**Non-normalized features:** F1:0.638296770979

**Normalized features:** F1:0.705950405435

This result supports the hypothesis stated above.

In [None]:
# Convert the raw feature columns of the original dataset to a pandas
# DataFrame and display their correlation matrix.
feature_frame = original_dataset.select(features).toPandas()
feature_frame.corr()

From this we can deduce that some features are correlated (mostly derived features with primitive features). Our experiments show that about 8-11 features are correlated. Just as feature normalization might speed up the training process (getting closer to the global or a local minimum), so might reducing the dimensionality of the input features. One way to achieve this is to apply feature selection, i.e., the removal of features. However, other techniques aim at reducing said dimensionality while still representing the same amount of information.

One of the techniques which accomplishes this is called Principal Component Analysis or PCA (https://en.wikipedia.org/wiki/Principal_component_analysis). Intuitively, PCA finds the *n* orthogonal directions (the eigenvectors of the data's covariance matrix with the largest eigenvalues) along which the data varies the most. The projections of the data onto these directions are then used as features for your machine learning model. However, PCA can also be used as a dimensionality reduction method to visualize your data, but that is outside the scope of this tutorial. More on PCA: https://www.quora.com/What-is-an-intuitive-explanation-for-PCA

In [None]:
from pyspark.ml.feature import PCA

# Number of principal components to keep.
nPrincipalComponents = 19
# Build the PCA estimator on the normalized feature vectors and fit it to
# obtain the PCA model.
pca = PCA(
    k=nPrincipalComponents,
    inputCol="features_normalized",
    outputCol="pca_features",
)
pcaModel = pca.fit(normalized_dataset)

# Add the projected features as the "pca_features" column.
pca_dataset = pcaModel.transform(normalized_dataset)

In [None]:
# 60% training / 40% test. The seed is fixed so that the split, and therefore
# the evaluation on the test set, does not change from run to run.
(trainingSet, testSet) = pca_dataset.randomSplit([0.6, 0.4], seed=1234)

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define the structure of the Neural Network: one input per principal
# component, a single hidden layer of 10 neurons and one output per class.
numFeatures = nPrincipalComponents
numClasses = 2  # s, b
layers = [numFeatures, 10, numClasses]

# Define the learning algorithm (estimator); it reads its input from the
# "pca_features" column.
# The seed is written as a plain integer literal: the Python 2-only long
# suffix ("L") is a SyntaxError on Python 3.
trainer = MultilayerPerceptronClassifier(
    maxIter=5,
    blockSize=100000,
    layers=layers,
    seed=1234,
    featuresCol="pca_features",
)
# Train the Neural Network on the training set.
nnModel = trainer.fit(trainingSet)
# Add the model's predictions to the held-out test set.
nnResult = nnModel.transform(testSet)

# Evaluate the model performance using the F1 metric.
predictionAndLabels = nnResult.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1:" + str(evaluator.evaluate(predictionAndLabels)))

As you can see, there is only a very small increase in model performance. However, I would argue that this is mostly due to the small Neural Network model. The nice thing about these Neural Networks is that they will construct features in the hidden layer themselves, so any feature engineering attempts will not benefit the learning process a lot.