In [2]:
import numpy as np                   # matrix math
import pandas as pd                  # file input
import matplotlib.pyplot as plt      # plot graph
import matplotlib.animation          # animation

In [4]:
# load file 
def load_dataset(filename):
    """Read a numeric text file into a NumPy array.

    Parameters
    ----------
    filename : str or file-like
        Anything ``np.loadtxt`` accepts as its first argument.

    Returns
    -------
    numpy.ndarray
        The array produced by ``np.loadtxt`` with its default settings.
    """
    data = np.loadtxt(filename)
    return data

# Dataset file: durudataset.txt
# For each user it records how many packets are sent per second and the size of a packet.
# Anomalies (DDoS attempts) show up as lots of big packets sent in a short amount of time.

## Euclidean distance
$$
\begin{aligned}
d(p,q) = d(q,p) &= \sqrt{(q_1 - p_1)^2 + (q_2 - p_2)^2 + \cdots + (q_n - p_n)^2} \\
                &= \sqrt{\sum_{i=1}^n (q_i - p_i)^2}
\end{aligned}
$$

In [5]:
def euclidian(a, b):
    """Return the Euclidean distance between two NumPy arrays.

    The distance is the norm of the element-wise difference ``a - b`` as
    computed by ``np.linalg.norm`` with its default settings.
    """
    difference = a - b
    return np.linalg.norm(difference)

In [16]:
def kmeans(k, epsilon=0, distance='euclidian', dataset=None):
    """Cluster a dataset into ``k`` groups with the k-means algorithm.

    Each iteration assigns every row to its nearest centroid and then moves
    every centroid to the mean of the rows assigned to it. Iteration stops
    once the centroids move by no more than ``epsilon``.

    Parameters
    ----------
    k : int
        Number of clusters; must satisfy ``1 <= k <= number of rows``.
    epsilon : float, optional
        Convergence threshold on the distance between the centroid matrices
        of two consecutive iterations. Must not be negative.
    distance : str or callable, optional
        ``'euclidian'`` selects the ``euclidian`` function. A callable
        ``f(a, b)`` returning a scalar may be passed instead; it is called on
        pairs of 1-D points and on the two 2-D centroid matrices.
    dataset : array-like of shape (num_instances, num_features), optional
        Data to cluster. When omitted, ``'durudataset.txt'`` is read with
        ``load_dataset``.

    Returns
    -------
    prototypes : numpy.ndarray of shape (k, num_features)
        The final centroids.
    history_centroids : list of numpy.ndarray
        The initial centroids followed by the centroids after each iteration.
    belongs_to : numpy.ndarray of shape (num_instances, 1)
        For every row, the index (stored as float) of its cluster.

    Raises
    ------
    ValueError
        If ``distance`` is not recognised, ``epsilon`` is negative, the
        dataset is not two-dimensional, or ``k`` is out of range.
    """
    # resolve the distance function up front so a typo fails with a clear error
    if callable(distance):
        dist_method = distance
    elif distance == 'euclidian':
        dist_method = euclidian
    else:
        raise ValueError(
            "unknown distance {!r}; expected 'euclidian' or a callable".format(distance))

    if epsilon < 0:
        # a negative threshold could never be reached and would loop forever
        raise ValueError("epsilon must be non-negative, got {!r}".format(epsilon))

    if dataset is None:
        dataset = load_dataset('durudataset.txt')     # load dataset
    dataset = np.asarray(dataset, dtype=float)
    if dataset.ndim != 2:
        raise ValueError(
            "dataset must be 2-dimensional (rows x features), got {} dimension(s)".format(dataset.ndim))
    num_instances, num_features = dataset.shape       # get rows and columns
    if not 1 <= k <= num_instances:
        raise ValueError(
            "k must be between 1 and the number of rows ({}), got {!r}".format(num_instances, k))

    # pick k *distinct* rows as the starting centroids; sampling without
    # replacement covers every row and avoids duplicate (empty) clusters
    initial_rows = np.random.choice(num_instances, size=k, replace=False)
    prototypes = dataset[initial_rows]
    history_centroids = [prototypes]                  # past centroids (to view progress over time)
    belongs_to = np.zeros((num_instances, 1))         # cluster index of every row

    shift = np.inf                                    # forces at least one iteration
    while shift > epsilon:
        # assignment step: attach every row to its nearest centroid
        for index_instance, instance in enumerate(dataset):
            dist_vec = [dist_method(prototype, instance) for prototype in prototypes]
            belongs_to[index_instance, 0] = np.argmin(dist_vec)

        # update step: move every centroid to the mean of its members
        tmp_prototypes = np.zeros((k, num_features))
        for index in range(k):
            members = dataset[belongs_to[:, 0] == index]
            if len(members) > 0:
                tmp_prototypes[index, :] = members.mean(axis=0)
            else:
                # an empty cluster keeps its previous centroid instead of becoming NaN
                tmp_prototypes[index, :] = prototypes[index]

        # how far the centroids moved in this iteration drives convergence
        shift = dist_method(tmp_prototypes, prototypes)
        prototypes = tmp_prototypes

        # add the calculated centroids to the history for plotting
        history_centroids.append(tmp_prototypes)

    # return calculated centroids, their history, and the cluster of each row
    return prototypes, history_centroids, belongs_to

In [17]:
def plot(dataset, history_centroids, belongs_to):
    """Scatter-plot the clustered data and replay the centroid history.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array of data points; the first two columns are used as x and y.
    history_centroids : sequence of array-like
        Centroids recorded over time. The first entry creates one blue marker
        per centroid; every later entry moves those markers and is printed.
    belongs_to : array-like
        Cluster index of every row of ``dataset`` (any shape that flattens to
        one value per row).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # red / black come first (the original two-cluster palette); the extra
    # colours are cycled so that more than two clusters can be drawn
    colors = ['r', 'k', 'g', 'c', 'm', 'y']

    # split graph by its axis and actual plot
    fig, ax = plt.subplots()

    # draw the members of every cluster in that cluster's colour
    labels = np.asarray(belongs_to).ravel()
    for index in np.unique(labels).astype(int):
        members = dataset[labels == index]
        ax.plot(members[:, 0], members[:, 1], colors[index % len(colors)] + 'o')

    ax.set(xlabel='feature 0', ylabel='feature 1', title='k-means clusters and centroids')

    # replay the history of centroids calculated during training
    history_points = []
    for index, centroids in enumerate(history_centroids):
        for inner, item in enumerate(centroids):
            if index == 0:
                # first snapshot: create one marker per centroid
                history_points.append(ax.plot(item[0], item[1], 'bo')[0])
            else:
                # later snapshots: move the existing marker; set_data expects
                # sequences, so the scalar coordinates are wrapped in lists
                history_points[inner].set_data([item[0]], [item[1]])
                print("centroids {} {}".format(index, item))

    # show the figure once, after all artists are in their final position
    plt.show()


In [18]:
# main file
def execute():
    """Run the demo end to end: load the data, cluster it into two groups, plot."""
    points = load_dataset('durudataset.txt')
    # kmeans reads the data file itself; the copy loaded above is only used for plotting
    _, centroid_history, assignments = kmeans(2)      # train model
    plot(points, centroid_history, assignments)

In [None]:
# Select matplotlib's interactive "notebook" backend so the figure is rendered inside the notebook.
%matplotlib notebook

# Run the whole pipeline: load the data, cluster it with k-means, and plot the result.
execute()