In [None]:
'''

Context
Computer Network Traffic Data - A ~500K CSV with summary of some real network traffic data from the past. The dataset has ~21K rows and covers 10 local workstation IPs over a three month period. Half of these local IPs were compromised at some point during this period and became members of various botnets.

Content
Each row consists of four columns:

date: yyyy-mm-dd (from 2006-07-01 through 2006-09-30)
l_ipn: local IP (coded as an integer from 0-9)
r_asn: remote ASN (an integer which identifies the remote ISP)
f: flows (count of connections for that day)
Reports of "odd" activity or suspicions about a machine's behavior triggered investigations on the following days (although the machine might have been compromised earlier)

Date : IP
08-24 : 1
09-04 : 5
09-18 : 4
09-26 : 3 6


'''

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import zscore

In [None]:
df = pd.read_csv('../input/cs448b_ipasn.csv')

In [None]:
df.head()

In [None]:
dfOrig = df.copy()
#df = dfOrig.copy()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# viewing the given examples

In [None]:
# Rows for local IP 1 on 2006-08-24 (first investigation date listed in the dataset notes).
df.loc[df.l_ipn.eq(1) & df.date.eq('2006-08-24')]


In [None]:
# Rows for local IP 5 on 2006-09-04 (investigation date listed in the dataset notes).
df.loc[df.l_ipn.eq(5) & df.date.eq('2006-09-04')]

 

In [None]:
# Rows for local IP 4 on 2006-09-18 (investigation date listed in the dataset notes).
df.loc[df.l_ipn.eq(4) & df.date.eq('2006-09-18')]

In [None]:
# Rows for local IP 3 on 2006-09-26 (investigation date listed in the dataset notes).
df.loc[df.l_ipn.eq(3) & df.date.eq('2006-09-26')]

#EDA - Data Analysis

In [None]:
#removing f == 1
#df = df[df.f > 1]

In [None]:
#len(df)/len(dfOrig)

In [None]:
df.l_ipn.value_counts()

In [None]:
# 0 is the most active user, 3 is the least

In [None]:
# Box plot of the raw flow counts of each local IP, one figure per IP.
for ip in sorted(df.l_ipn.unique()):
    flows = df.loc[df.l_ipn == ip, 'f']
    # boxplot's second and third parameters are `notch` and `sym`, not y-values and a
    # format string as in plt.plot, so pass the outlier marker explicitly by keyword.
    plt.boxplot(flows, sym=".")
    plt.title('IP:' + str(ip))
    plt.show()

In [None]:
# Histogram (100 bins) of the flow counts of each local IP, one figure per IP.
for ip in set(df.l_ipn):
    ipFlows = df.loc[df.l_ipn == ip, 'f']
    ipFlows.hist(bins=100)
    plt.autoscale(enable=True, axis='both', tight=None)
    plt.title('IP: %d' % ip)
    plt.show()

In [None]:
# Same per-IP histograms, but with a logarithmic count axis (200 bins): the author notes the
# flow counts are skewed, and a log axis keeps the sparsely populated bins visible.
for ip in set(df.l_ipn):
    ipFlows = df.loc[df.l_ipn == ip, 'f']
    ipFlows.hist(bins=200, log=True)
    plt.title('IP: %d' % ip)
    plt.show()

In [None]:
#normalize flows per IP since different ratios and scales 

In [None]:
# Sort the rows by local IP. The sorted frame is rebound to `df` instead of mutating the
# existing object with inplace=True, so the statement stays chainable and free of hidden
# in-place state.
df = df.sort_values(by=['l_ipn'])

In [None]:
from sklearn.preprocessing import robust_scale
# Scale features using statistics that are robust to outliers.
# using robust_scale instead of RobustScaler since want to Standardize a dataset along any axis

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
#accessing columns

#sample[sample['l_ipn'] == 0].fNorm = 2 # not good. returns a copy of sample.l_ipn

#use iloc or loc instead 
# .loc[criterion,selection]
#use df.iloc[1, df.columns.get_loc('s')] = 'B'    or use df.loc[df.index[1], 's'] = 'B'

#sample[sample.iloc[:,sample.columns.get_loc('l_ipn')] == 0]
#sample[sample.loc[:,'l_ipn'] == 0]

In [None]:
#normalize traffic for each IP 

#scaler = robust_scale()
scaler = StandardScaler()

In [None]:
# Standardize the flow counts separately for every local IP, and store the per-IP mean and
# variance learned by the scaler next to the normalized values.
for ip in sorted(df.l_ipn.unique()):
    ipRows = df.l_ipn == ip  # boolean row mask, built once per IP instead of once per assignment
    flows = df.loc[ipRows, 'f'].values.reshape(-1, 1)  # 2-D input: a single feature column
    # fit_transform returns an (n, 1) array; flatten it so it lines up with the selected rows.
    df.loc[ipRows, 'fNorm'] = scaler.fit_transform(flows).ravel()
    # mean_ and var_ hold one entry per feature; store the single entry as a scalar.
    df.loc[ipRows, 'fMean'] = scaler.mean_[0]
    df.loc[ipRows, 'fVar'] = scaler.var_[0]

In [None]:
# Strip plot of the standardized flows of each local IP, with their mean marked in red.
for ip in sorted(df.l_ipn.unique()):
    fNormed = df.loc[df.l_ipn == ip, 'fNorm']
    plt.plot(fNormed, [0] * len(fNormed), ".")
    # The points are in standardized units, so mark the mean of the standardized values.
    # The fMean column holds the mean of the raw flow counts and does not belong on this axis.
    plt.plot(fNormed.mean(), 0, 'ro')
    plt.title('IP:' + str(ip))
    plt.show()

In [None]:
# it is clear that there are anomalies in the amount of traffic flow 

In [None]:
# todo: trying a scaler for skewed data

In [None]:
#analyzing ASN

In [None]:
#is asn unique per user?

In [None]:
# One independent, initially empty list per local IP.
# `[[]] * n` would repeat a reference to one shared list (an append for one IP would show up
# under every IP), and len(df.l_ipn) is the number of rows, not the number of IPs.
listOfAsnsPerUser = [[] for _ in range(df.l_ipn.nunique())]

In [None]:
# Sum of the per-IP distinct ASN counts; an ASN contacted by several IPs is counted once per IP.
numAsnsPerIp = sum(
    len(set(df.loc[df.l_ipn == ip, 'r_asn']))
    for ip in set(df.l_ipn)
)

In [None]:
numAsnsPerIp

In [None]:
len(set(df.loc[:,'r_asn']))

In [None]:
# the sum of the unique ASN counts per IP != the number of unique ASNs in the whole dataset
# therefore, ASNs are not unique per IP (some ASNs are shared between IPs)

In [None]:
#using asns as categorical variable 

In [None]:
dfDummy = df.copy()

In [None]:
# One-hot encode the remote ASN; drop_first=True leaves out the indicator of the first category.
dfDummy = pd.get_dummies(data=df, columns=['r_asn'], drop_first=True)

In [None]:
dfDummy.head()

In [None]:
#takes too long 
# dfDummy.drop(labels =['date','fNorm','fMean','fVar','l_ipn'],axis=1).corr() 

In [None]:
# Keep only the flow count and the ASN indicator columns.
dfCorrAsnFlow = dfDummy.drop(columns=['date', 'fNorm', 'fMean', 'fVar', 'l_ipn'])

In [None]:
# todo: look for anomalies where a user suddenly uses a different ASN and has a high traffic flow

In [None]:
# time sampling

In [None]:
df.head()

In [None]:
# Parse the date strings; values that cannot be parsed become NaT because of errors='coerce'.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [None]:
# True when no date is missing after parsing, i.e. every date string was valid.
not df.date.isna().any()

In [None]:
# all dates were valid 

In [None]:
df.info()

In [None]:
#df.date.hist(bins = 100)

In [None]:
#fig = plt.figure(figsize = (15,20))
#ax = fig.gca()
#df.date.hist(bins = 50, ax = ax)

In [None]:
# Plot the flow count of every row in date order (all local IPs share this one line).
df = df.sort_values('date', ascending=True)
plt.figure(figsize=(15,20))
plt.plot(df['date'], df['f'])
plt.xticks(rotation='vertical')
# Label the figure so it can be read on its own, and end with show() so the return value of
# the last plotting call is not echoed as cell output.
plt.title('Flows over time (all IPs)')
plt.xlabel('date')
plt.ylabel('f (flows)')
plt.show()

In [None]:
# now per ip 

In [None]:
# Flow count over time, one figure per local IP.
for ip in set(df.l_ipn):
    ipRows = df[df.l_ipn == ip]  # select this IP's rows once and reuse them for both axes
    plt.figure(figsize=(10,15))
    plt.xticks(rotation='vertical')
    plt.title('IP: %d' % ip)
    plt.plot(ipRows['date'], ipRows['f'])
    plt.show()

In [None]:
#using IP = 4 as poc 

In [None]:
dataset = df.copy()

In [None]:
# One-hot encode the remote ASN; drop_first=True leaves out the indicator of the first category.
dataset = pd.get_dummies(data=dataset, columns=['r_asn'], drop_first=True)

In [None]:
dataset.head()

In [None]:
# Remove the date and the per-IP normalization columns; l_ipn, f and the ASN indicators remain.
dataset = dataset.drop(columns=['date', 'fNorm', 'fMean', 'fVar'])

In [None]:
dataset.head()

# classification models

In [None]:
#using IP == 4 as POC

In [None]:
# Keep only the rows of local IP 4 (the proof-of-concept machine), then drop l_ipn, which is
# constant after the filter.
dataset = dataset.loc[dataset.l_ipn.eq(4)].drop(columns=['l_ipn'])

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
# Standardize the flow counts of the selected IP.
# The column is replaced through dataset['f'] = ... with a flat float array. Writing the float
# result into the existing column with .loc[:, 'f'] depends on an implicit dtype upcast when the
# column holds integers (presumably the case for flow counts; confirm with df.info()), which
# recent pandas versions deprecate.
dataset['f'] = scaler.fit_transform(dataset.f.values.reshape(-1, 1)).ravel()

In [None]:
dataset

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Two-cluster k-means on the scaled flow plus the ASN indicator columns; the seed is fixed so
# the clustering is repeatable.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans = kmeans.fit(dataset)

In [None]:
kmeans.labels_

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2)

In [None]:
centroids = kmeans.cluster_centers_

In [None]:
centroids

In [None]:
# Project the cluster centres onto two principal components.
# Note that the PCA is fitted on the centres themselves, not on the data points.
centroids2d = pd.DataFrame(data=pca.fit_transform(centroids))

In [None]:
centroids2d

In [None]:
# First and second principal-component coordinates of the projected centres.
xPca, yPca = centroids2d[0], centroids2d[1]

In [None]:
xPca

In [None]:
yPca

In [None]:
plt.scatter(xPca,yPca)

In [None]:
# will create a single dataset without any anomalies for IP = 4 

In [None]:
# Scaled flow of IP 4 against row position, drawn on a logarithmic y axis.
# NOTE(review): `f` was standardized earlier, so it contains non-positive values that a log
# axis cannot display - confirm that hiding them is intended.
rowPositions = range(len(dataset))
plt.figure(figsize=(10, 15))
plt.title('IP: %d' % 4)
plt.yscale('log')
plt.xticks(rotation='vertical')
plt.plot(rowPositions, dataset['f'])
plt.show()

In [None]:
len(dataset[(dataset.f < 10**-.5)])

In [None]:
len(dataset)

In [None]:
# it seems as though those two days were anomalous

In [None]:
# Rows whose scaled flow is at or above the 10**-.5 threshold.
negativeClass = dataset.loc[dataset.f.ge(10**-.5)]

In [None]:
negativeClass

In [None]:
# The first two rows of the dataset.
positiveClass = dataset.iloc[:2]

In [None]:
positiveClass

In [None]:
# Remove the first two rows (the ones taken as positiveClass) from the dataset by index label.
dataset = dataset.drop(index=dataset.index[[0, 1]])

In [None]:
len(dataset)

In [None]:
# Keep only the rows whose scaled flow is below the 10**-.5 threshold.
dataset = dataset.loc[dataset.f.lt(10**-.5)]

In [None]:
len(dataset)

In [None]:
kmeans.predict(positiveClass)

In [None]:
kmeans.predict(negativeClass)

In [None]:
posRes = kmeans.transform(positiveClass)

In [None]:
negRes = kmeans.transform(negativeClass)

In [None]:
negRes[0]

In [None]:
from numpy import linalg

In [None]:
centroids

In [None]:
# Euclidean distance between the two k-means cluster centres.
# The previous form raised NameError: the module is imported as `np` (not `numpy`), and the
# vectors `a` and `b` were never defined.
# NOTE(review): the two centroids are assumed to be the intended operands - confirm.
dist = np.linalg.norm(centroids[0] - centroids[1])

In [None]:
# todo checkout auto encoders

In [None]:
#todo checkout one class SVM

In [None]:
#create classifier and split data into train test and validation 