In [1]:
%pylab

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


In [2]:
import pyspark

In [3]:
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.feature import StandardScaler
print ("Successfully imported Spark Modules")

Successfully imported Spark Modules


In [4]:
from collections import OrderedDict
from numpy import array
from math import sqrt

In [5]:
def parse_interaction(line):
    """
    Parses one comma-separated network interaction record.

    Returns a (label, features) pair where label is the last field of the
    record and features is a numpy array built from field 0 followed by
    fields 4 up to (but excluding) the last one. Fields 1-3 are skipped.

    Raises ValueError if any of the kept fields cannot be converted to float.
    """
    # Strip surrounding whitespace first so that the label matches the one
    # produced by the label-counting step, which also strips each line.
    line_split = line.strip().split(",")
    clean_line_split = [line_split[0]] + line_split[4:-1]
    return (line_split[-1], array([float(x) for x in clean_line_split]))


In [6]:
def distance(a, b):
    """
    Euclidean distance between two numeric RDDs of equal length.

    The inputs are zipped element-wise, each pair is reduced to its squared
    difference, the squares are summed and the square root of that sum is
    returned.
    """
    paired = a.zip(b)
    differences = paired.map(lambda pair: (pair[0] - pair[1]))
    squares = differences.map(lambda diff: diff * diff)
    total = squares.reduce(lambda acc, value: acc + value)
    return sqrt(total)

In [7]:
def dist_to_centroid(datum, clusters):
    """
    Euclidean distance from a point to the centre of the cluster it is
    assigned to.

    clusters must offer predict(datum), returning an index, and centers,
    an indexable collection of centre vectors.
    """
    assigned = clusters.predict(datum)
    offset = clusters.centers[assigned] - datum
    # Keep this a list (not a generator) so the result is the same whether
    # `sum` is the builtin or a numpy-provided one.
    squared = [component ** 2 for component in offset]
    return sqrt(sum(squared))

In [8]:
def clustering_score(data, k):
    """
    Trains a k-means model with k clusters on data and scores it.

    The score is the mean, over all points in data, of the distance between
    a point and the centroid of the cluster it is assigned to.

    Returns a (k, model, score) tuple and prints the score as a side effect.
    """
    # NOTE: Spark 1.6.0 warns that `runs` is deprecated and will stop having
    # any effect in a later release; it is kept here so behaviour on older
    # Spark versions is unchanged.
    clusters = KMeans.train(data, k, maxIterations=10, runs=5, initializationMode="random")
    score = data.map(lambda datum: dist_to_centroid(datum, clusters)).mean()
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print("Clustering score for k=%(k)d is %(score)f" % {"k": k, "score": score})
    return (k, clusters, score)

In [9]:
# Largest number of clusters to evaluate.
max_k = 10
# NOTE: machine-specific absolute path; adjust to where the KDD Cup data lives.
data_file = '/Users/v33/Dropbox/DataMiningCOSC526/2016/Datasets/KDD-NetworkAnomalies/kddcup.data_10_percent'

# `sc` is not created anywhere in this notebook. getOrCreate() returns the
# already-running SparkContext when there is one (e.g. under the pyspark
# shell) and starts a new one otherwise, so the cell also works from a
# freshly restarted plain kernel.
sc = pyspark.SparkContext.getOrCreate()

# load raw data
print("Loading RAW data...")
raw_data = sc.textFile(data_file)

Loading RAW data...


In [10]:
# Count how often each label occurs and print the labels from most to least
# frequent. The label is the last comma-separated field of each line.
print("Counting all different labels")
labels = raw_data.map(lambda line: line.strip().split(",")[-1])
label_counts = labels.countByValue()
sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
for label, count in sorted_labels.items():
    # A single formatted string prints "label count" on both Python 2 and 3
    # (print(label, count) would print a tuple on Python 2).
    print("%s %s" % (label, count))

Counting all different labels
smurf. 280790
neptune. 107201
normal. 97278
back. 2203
satan. 1589
ipsweep. 1247
portsweep. 1040
warezclient. 1020
teardrop. 979
pod. 264
nmap. 231
guess_passwd. 53
buffer_overflow. 30
land. 21
warezmaster. 20
imap. 12
rootkit. 10
loadmodule. 9
ftp_write. 8
multihop. 7
phf. 4
perl. 3
spy. 2


In [11]:
# Prepare data for clustering input.
# The data contains non-numeric features; we exclude them since k-means works
# with numeric features only. parse_interaction keeps column 0, drops
# columns 1-3 (0-based), and splits the last column off as the label.
print("Parsing dataset...")
parsed_data = raw_data.map(parse_interaction)
# Only the feature vectors are clustered; cache them because they are reused
# for training and for scoring.
parsed_data_values = parsed_data.values().cache()

Parsing dataset...


In [12]:
# Evaluate k = 10, 20, ... up to and including max_k (steps of 10).
print("Calculating total in within cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k})
# A list comprehension runs eagerly on both Python 2 and 3; map() would be
# lazy on Python 3 and no model would be trained until `scores` is consumed.
scores = [clustering_score(parsed_data_values, k) for k in range(10, max_k + 1, 10)]

Calculating total in within cluster distance for different k values (10 to 10):


  "Support for runs is deprecated in 1.6.0. This param will have no effect in 1.7.0.")


Clustering score for k=10 is 782.674504
