In [1]:
# python 3

# 1. Spark & Python: Working with RDDs (I)

- DataSets :  KDD Cup 1999 ( http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html )

This is the data set used for The Third International Knowledge Discovery and Data Mining Tools Competition, which was held in conjunction with KDD-99 The Fifth International Conference on Knowledge Discovery and Data Mining. The competition task was to build a network intrusion detector, a predictive model capable of distinguishing between ``bad'' connections, called intrusions or attacks, and ``good'' normal connections. This database contains a standard set of data to be audited, which includes a wide variety of intrusions simulated in a military network environment.

## RDD Creation

### (1) Getting the data files

In [24]:
import urllib

f = urllib.request.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", 
                       "kddcup.data_10_percent.gz")

### (2) Creating a RDD from a file

In [2]:
data_file = "./kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file)

- SparkContext.textFile은 압축된 파일을 직접 다룰 수 있음.

In [3]:
# data count

raw_data.count()

494021

In [4]:
# 처음 다섯 줄

raw_data.take(5)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']

### (3) filter

In [5]:
normal_raw_data = raw_data.filter(lambda x: 'normal.' in x)

In [6]:
from time import time

start = time()

normal_count = normal_raw_data.count()

diff = time() - start

print('normal count : ', normal_count),
print('걸린 시간 :', round(diff,3), 'seconds')

normal count :  97278
걸린 시간 : 1.009 seconds


- filter의 결과만 남음

### (4) map
- 모든 요소에 특정 함수 적용 가능

In [9]:
csv_data = raw_data.map(lambda x: x.split(","))
start = time()
head_rows = csv_data.take(5)
diff = time() - start
print('걸린 시간 :', round(diff,3), 'seconds')
print(head_rows)

걸린 시간 : 0.138 seconds
[['0', 'tcp', 'http', 'SF', '181', '5450', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '9', '9', '1.00', '0.00', '0.11', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'], ['0', 'tcp', 'http', 'SF', '239', '486', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '19', '19', '1.00', '0.00', '0.05', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'], ['0', 'tcp', 'http', 'SF', '235', '1337', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '29', '29', '1.00', '0.00', '0.03', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'], ['0', 'tcp', 'http', 'SF', '219', '1337', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '6', '6', '0.00', '0.00', '0.0

In [8]:
# 100000
start = time()
head_rows = csv_data.take(100000)
diff = time() - start
print('걸린 시간 :', round(diff,3), 'seconds')

걸린 시간 : 2.212 seconds


** - Using map with predefined functions **

In [10]:
def parse_interaction(line):
    elems = line.split(",")
    # normal 위치
    tag = elems[41]
    return (tag, elems)

key_csv_data = raw_data.map(parse_interaction)
head_rows = key_csv_data.take(5)
print(head_rows[0])

('normal.', ['0', 'tcp', 'http', 'SF', '181', '5450', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '9', '9', '1.00', '0.00', '0.11', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'])


- map을 function과 함께 이용하면 action을 할 때 바로 적용 가능

### (5) collect
- RDD에 있는 모든 요소를 메모리로 올림

In [11]:
start = time()

all_raw_data = raw_data.collect()

diff = time() - start

print('걸린 시간 :', round(diff,3), 'seconds')

걸린 시간 : 4.562 seconds


In [12]:
type(all_raw_data)

list

In [17]:
all_raw_data[:10]

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,59,59,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,212,1940,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,0.00,0.00,0.00,0.00,1.00,0.00,

In [13]:
type(raw_data)

pyspark.rdd.RDD

- 모든 spark 워커 노드들은 RDD의 조각들을 가지고 있는데, collect를 실행 시 모든 RDD 조각들을 호출(?)하고 하나로 합친다.

In [20]:
# get data from file
data_file = "./kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file)

# parse into key-value pairs
key_csv_data = raw_data.map(parse_interaction)

# filter normal key interactions
normal_key_interactions = key_csv_data.filter(lambda x: x[0] == "normal.")

# collect all

start = time()
all_normal = normal_key_interactions.collect()
diff = time() - start
normal_count = len(all_normal)

print ('걸린 시간 : ', round(diff,3), 'seconds')
print ('normal 개수 : ', normal_count)

걸린 시간 :  4.706 seconds
normal 개수 :  97278


In [21]:
start = time()
normal_key_interactions.count()
diff = time() - start

print ('걸린 시간 : ', round(diff,3), 'seconds')

걸린 시간 :  2.566 seconds


In [22]:
start = time()
normal_count = len(all_normal)
diff = time() - start

print ('걸린 시간 : ', round(diff,3), 'seconds')

걸린 시간 :  0.0 seconds


- count를 사용해서 개수를 세는 것보다 collet후 개수를 세는 것이 훨씬 빠름
- 반복 연산이 필요할 때는 count를 반복해서 시행하는 것보다 collect후 len()을 사용하는 것이 더 나음.

## Sampling RDDs
- sample
- takeSample

### (1) sample
- transformation

parameters : withReplacement, fraction, seed

In [25]:
raw_data_sample = raw_data.sample(False, 0.1, 122)
sample_size = raw_data_sample.count()
total_size = raw_data.count()

print ('sample size :', sample_size),
print ('total size : ', total_size)

sample size : 49351
total size :  494021


- 추가 transformation의 일부로 사용될 때 유용함.
- aggregation과 key-value 페어 연산에는 더 강력함.
- MLlib를 이용할 때 특히 유용함.

In [22]:
# sample data에서 normal이 포함된 row의 개수를 세어보자.

# transformations to be applied
raw_data_sample_items = raw_data_sample.map(lambda x: x.split(","))
sample_normal_tags = raw_data_sample_items.filter(lambda x: "normal." in x)

# actions + time
start = time()
sample_normal_tags_count = sample_normal_tags.count()
diff = time() - start

sample_normal_ratio = sample_normal_tags_count / float(sample_size)

print ('normal이 포함된 row 비율 : ', round(sample_normal_ratio,3))
print ('걸린 시간 : ', round(diff,3), 'seconds')

normal이 포함된 row 비율 :  0.195
걸린 시간 :  1.281 seconds


In [23]:
# raw data에서 normal이 포함된 row의 개수를 세어보자.

# transformations to be applied
raw_data_items = raw_data.map(lambda x: x.split(","))
normal_tags = raw_data_items.filter(lambda x: "normal." in x)

# actions + time
start = time()
normal_tags_count = normal_tags.count()
diff = time() - start

normal_ratio = normal_tags_count / float(total_size)

print ('normal이 포함된 row 비율 : ', round(sample_normal_ratio,3))
print ('걸린 시간 : ', round(diff,3), 'seconds')

normal이 포함된 row 비율 :  0.195
걸린 시간 :  2.825 seconds


- raw data에서 수행시 시간이 더 걸림. 당연한 결과...

### (2) takeSample
- sample 과 비슷하지만 샘플사이즈를 비율이 아닌 크기로 정의할 수 있음.
- action

In [26]:
start = time()
raw_data_sample = raw_data.takeSample(False, 400000, 1234)
normal_data_sample = [x.split(",") for x in raw_data_sample if "normal." in x]
diff = time() - start

normal_sample_size = len(normal_data_sample)

normal_ratio = normal_sample_size / 400000.0

print ('normal이 포함된 row 비율 : ', round(normal_ratio,3))
print ('걸린 시간 : ', round(diff,3), 'seconds')

normal이 포함된 row 비율 :  0.197
걸린 시간 :  3.733 seconds


- sample transformation보다 시간이 더 걸린 이유는 샘플링 프로세스를 분산시켜서 수행했기 때문.
- filtering과 splitting을 싱글 노드에서 수행함. ?????

However, it took longer, even with a slightly smaller sample. The reason is that Spark just distributed the execution of the sampling process. The filtering and splitting of the results were done locally in a single node.

## Set operations on RDDs

- RDD에서 union과 intersection을 수행할 수 있음.
- RDD는 실제하는 데이터 셋이 아니므로 union을 수행시 중복 데이터를 제거하지 않음.


### (1) Getting attack interactions using substract

In [27]:
# filter noraml

normal_raw_data = raw_data.filter(lambda x: "normal." in x)

In [28]:
# filter "attact" by using subtract

attack_raw_data = raw_data.subtract(normal_raw_data)

In [28]:
# count all

start = time()
raw_data_count = raw_data.count()
diff = time() - start

print ('걸린 시간 : ', round(diff,3), 'seconds')

걸린 시간 :  1.22 seconds


In [29]:
# count normal

start = time()
normal_raw_data_count = normal_raw_data.count()
diff = time() - start

print ('걸린 시간 : ', round(diff,3), 'seconds')

걸린 시간 :  0.935 seconds


In [29]:
# count attacks

start = time()
attack_raw_data_count = attack_raw_data.count()
diff = time() - start

print ('걸린 시간 : ', round(diff,3), 'seconds')

걸린 시간 :  7.432 seconds


In [31]:
print('noraml data : ', normal_raw_data_count)
print('attact data : ', attack_raw_data_count)

noraml data :  97278
attact data :  396743


### (1) Protocol and service combinations using cartesian
- Cartesian product

In [31]:
csv_data = raw_data.map(lambda x: x.split(","))
protocols = csv_data.map(lambda x: x[1]).distinct()
protocols.collect()

['icmp', 'udp', 'tcp']

In [32]:
services = csv_data.map(lambda x: x[2]).distinct()
services.collect()

['finger',
 'http',
 'netbios_dgm',
 'name',
 'hostnames',
 'vmnet',
 'systat',
 'shell',
 'netbios_ssn',
 'urh_i',
 'pop_3',
 'ctf',
 'domain',
 'mtp',
 'remote_job',
 'exec',
 'supdup',
 'http_443',
 'sunrpc',
 'urp_i',
 'pop_2',
 'csnet_ns',
 'smtp',
 'whois',
 'ldap',
 'daytime',
 'imap4',
 'nntp',
 'klogin',
 'rje',
 'IRC',
 'link',
 'eco_i',
 'tftp_u',
 'iso_tsap',
 'uucp_path',
 'auth',
 'ecr_i',
 'other',
 'domain_u',
 'courier',
 'discard',
 'red_i',
 'tim_i',
 'time',
 'login',
 'ftp',
 'telnet',
 'ntp_u',
 'sql_net',
 'echo',
 'private',
 'gopher',
 'efs',
 'netbios_ns',
 'ftp_data',
 'nnsp',
 'ssh',
 'netstat',
 'uucp',
 'Z39_50',
 'kshell',
 'X11',
 'bgp',
 'pm_dump',
 'printer']

In [33]:
product = protocols.cartesian(services).collect()

print ('protocol, services 조합 개수 : ', len(product))

protocol, services 조합 개수 :  198
