In [1]:
import os
import sys
#iPython Notebook and Apache Spark path appending
os.environ['SPARK_HOME']="S:/spark/spark-1.5.2-bin-hadoop2.6/spark-1.5.2-bin-hadoop2.6"
sys.path.append("S:/spark/spark-1.5.2-bin-hadoop2.6/spark-1.5.2-bin-hadoop2.6/bin")
sys.path.append("S:/spark/spark-1.5.2-bin-hadoop2.6/spark-1.5.2-bin-hadoop2.6/python")
sys.path.append("S:/spark/spark-1.5.2-bin-hadoop2.6/spark-1.5.2-bin-hadoop2.6/python/pyspark")
sys.path.append("S:/spark/spark-1.5.2-bin-hadoop2.6/spark-1.5.2-bin-hadoop2.6/python/lib")
sys.path.append("S:/spark/spark-1.5.2-bin-hadoop2.6/spark-1.5.2-bin-hadoop2.6/python/lib/pyspark.zip")
sys.path.append("S:/spark/spark-1.5.2-bin-hadoop2.6/spark-1.5.2-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip")
sys.path.append("C:/Program Files/Java/jre1.8.0_60/bin")

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import udf
import operator
SparkContext.setSystemProperty("hadoop.home.dir", "S:\\spark\\winutil\\") #For Windows OS only
conf = SparkConf().setAppName("AnomalyDetection")
sc = SparkContext(conf=conf)
sqlCt = SQLContext(sc)

#Goal is to detect an anomaly within the dataset

In [6]:
#Just a toy dataset
data = [(0, ["http", "udt", 0.4]), 
        (1, ["http", "udf", 0.5]), 
        (2, ["http", "tcp", 0.5]), 
        (3, ["ftp", "icmp", 0.1]), 
        (4, ["http", "tcp", 0.4])]
schema = ["id", "rawFeatures"]

In [7]:
df = sqlCt.createDataFrame(data, schema).cache()

In [8]:
df.show()

+---+----------------+
| id|     rawFeatures|
+---+----------------+
|  0|[http, udt, 0.4]|
|  1|[http, udf, 0.5]|
|  2|[http, tcp, 0.5]|
|  3|[ftp, icmp, 0.1]|
|  4|[http, tcp, 0.4]|
+---+----------------+



##First: Change the categorical features to one hot encoding

In [22]:
indices =[0,1]
dlist = []
for idx in indices:
    catUDF = udf(lambda x: x[idx], StringType())
    #for each column only collect the unique entries ex: Column 0 has http, ftp
    rows = df.select(catUDF(df.rawFeatures)).distinct().collect()
    d = {}
    for i, r in enumerate(rows):
        d[r[0]] = [0.0]*len(rows)  #list of size 2, creates [0.0, 0,0] for Column 0
        d[r[0]][i] = 1.0 #let's say http is the first value encountered then i is 0 at that point so it will be [1.0,0.0]
    dlist.append(d)

In [23]:
dlist

[{u'ftp': [1.0, 0.0], u'http': [0.0, 1.0]},
 {u'icmp': [0.0, 0.0, 1.0, 0.0],
  u'tcp': [0.0, 1.0, 0.0, 0.0],
  u'udf': [0.0, 0.0, 0.0, 1.0],
  u'udt': [1.0, 0.0, 0.0, 0.0]}]

##Second: Apply one hot encoding to each row

In [36]:
dlistBC = sc.broadcast(dlist)
def transform(features):
    newFeatures = []
    for i,idx in enumerate(indices):
        print i, idx
        newFeatures.extend(dlistBC.value[i][features[idx]])  #ex: dlistBC.value[0]['http'] add the one hot encoding in the list
    newFeatures.extend([float(x) for x in features[2:]]) #add the last numerical column
    return newFeatures
        
transformUDF = udf(transform, ArrayType(FloatType()))
df = df.withColumn("features", transformUDF(df.rawFeatures))

In [38]:
df.select("features").show()

+--------------------+
|            features|
+--------------------+
|[0.0, 1.0, 1.0, 0...|
|[0.0, 1.0, 0.0, 0...|
|[0.0, 1.0, 0.0, 1...|
|[1.0, 0.0, 0.0, 0...|
|[0.0, 1.0, 0.0, 1...|
+--------------------+



##Third: K-means clustering

In [41]:
features = df.select("features").map(lambda row: row[0]).cache()
model = KMeans.train(features, k=2, maxIterations=40, runs=10, initializationMode="random", seed=20)

In [42]:
modelBC = sc.broadcast(model)
clusterUDF = udf(lambda x: modelBC.value.predict(x), StringType())
df = df.withColumn("cluster", clusterUDF(df.features)).cache()

In [43]:
df.show()

+---+----------------+--------------------+-------+
| id|     rawFeatures|            features|cluster|
+---+----------------+--------------------+-------+
|  0|[http, udt, 0.4]|[0.0, 1.0, 1.0, 0...|      0|
|  1|[http, udf, 0.5]|[0.0, 1.0, 0.0, 0...|      0|
|  2|[http, tcp, 0.5]|[0.0, 1.0, 0.0, 1...|      0|
|  3|[ftp, icmp, 0.1]|[1.0, 0.0, 0.0, 0...|      1|
|  4|[http, tcp, 0.4]|[0.0, 1.0, 0.0, 1...|      0|
+---+----------------+--------------------+-------+



## For bigger dataset with more clusters scoring system is required, like for example:
### score (X) = (sizeOfMaxCluster - sizeOfClusterX) / (sizeOfMaxCluster - sizeOfMinCluster)