In [1]:
import pyspark as spark
import datetime, time
import pandas as pd
import networkx as nx
from functools import reduce
from pyspark.sql.types import (StructField, StringType,FloatType, 
                               DoubleType, IntegerType, StructType,
                              DateType)
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as spDataFrame
from pyspark.ml.feature import Binarizer, OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression, LinearSVC, LinearSVCModel
from pyspark.ml.evaluation import HasFeaturesCol
from pyspark.ml.tuning import CrossValidator
import os
import matplotlib.pyplot as plt
#from tokenizer import Tokenizer

In [2]:
def unionAll(*dfs):
    return reduce(spDataFrame.unionAll, dfs)

This is to setup the sparksession. If additional resources need to be allocated it is done with this command

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName('Capstone Analysis') \
        .config("spark.driver.memory", "45g") \
        .config("spark.driver.cores", "10") \
        .config("spark.dirver.maxResultSize", "8g") \
        .config("spark.executor.memory", "6g") \
        .config("spark.files.maxPartitionBytes", "536870912") \
        .getOrCreate()

In [4]:
spark.sparkContext.uiWebUrl

'http://genuse50.engr.smu.edu:4040'

## Import DATA

In [5]:
proc_data_schema = [StructField('time',IntegerType(),True),
                   StructField('user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('proc_name',StringType(),True),
                   StructField('strt',StringType())]
proc_final_struc = StructType(fields = proc_data_schema)
proc = spark.read.csv('../Dataset/proc.txt',schema=proc_final_struc)

auth_data_schema = [StructField('time',IntegerType(),True),
                   StructField('src_user@domain',StringType(),True),
                   StructField('dest_user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('dest_comp',StringType(),True),
                   StructField('auth_type',StringType(),True),
                   StructField('logon_type',StringType(),True),
                   StructField('auth_orient',StringType(),True),
                   StructField('success',StringType(),True)]
auth_final_struc = StructType(fields = auth_data_schema)
auth = spark.read.csv('../Dataset/auth.txt',schema=auth_final_struc )

flows_data_schema = [StructField('time',IntegerType(),False),
                   StructField('dur',IntegerType(),False),
                   StructField('src_comp',StringType(),False),
                   StructField('src_port',StringType(),False),
                   StructField('dest_comp',StringType(),False),
                   StructField('dest_port',StringType(),False),
                   StructField('protocol',StringType(),False),
                   StructField('pkt_cnt',IntegerType(),False),
                   StructField('byte_cnt',IntegerType(),False)]
flows_final_struc = StructType(fields = flows_data_schema)
flows = spark.read.csv('../Dataset/flows.txt',schema=flows_final_struc )

dns_data_schema = [StructField('time',IntegerType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('cmp_resolved',StringType(),True)]
dns_final_struc = StructType(fields = dns_data_schema)
dns = spark.read.csv('../Dataset/dns.txt',schema=dns_final_struc)

redteam_data_schema = [StructField('time',IntegerType(),True),
                   StructField('user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('dest_comp',StringType(),True)]
redteam_final_struc = StructType(fields = redteam_data_schema)
redteam = spark.read.csv('../Dataset/redteam.txt',schema=redteam_final_struc)

In [6]:
dns.show(n=5,truncate=True)

+----+--------+------------+
|time|src_comp|cmp_resolved|
+----+--------+------------+
|   2|   C4653|       C5030|
|   2|   C5782|      C16712|
|   6|   C1191|        C419|
|  15|   C3380|      C22841|
|  18|   C2436|       C5030|
+----+--------+------------+
only showing top 5 rows



This code is supposed to map the DNS connections into a network map

In [7]:
# dns_graph = new_df.toPandas()

In [8]:
# new_dns_graph = dns_graph.sample(5000)
# G = nx.from_pandas_edgelist(new_dns_graph, 'src_comp','cmp_resolved','count')

In [9]:
# from operator import itemgetter
# node_and_degree = G.degree()
# (largest_hub, degree) = sorted(node_and_degree, key=itemgetter(1))[-1]
# hub_ego = nx.ego_graph(G, largest_hub)
# # Draw graph
# pos = nx.spring_layout(hub_ego)


In [10]:
# nx.draw(G,pos,node_color='b', node_size=50, with_labels=False)
# nx.draw_networkx_nodes(hub_ego, pos, nodelist=[largest_hub], node_size=300, node_color='r')
# plt.show()

## Transform Data

This will be where we split all user@domain columns into user and domain columns

In [7]:
proc_split = F.split(proc['user@domain'],'@')
proc = proc.withColumn('src_user',proc_split.getItem(0))
proc = proc.withColumn('src_dmn',proc_split.getItem(1))
proc = proc.drop('user@domain')

proc = proc.withColumn('type',F.lit('Process'))

In [8]:
auth_src_split = F.split(auth['src_user@domain'],'@')
auth = auth.withColumn('src_user',auth_src_split.getItem(0))
auth = auth.withColumn('src_dmn',auth_src_split.getItem(1))

auth_dest_split = F.split(auth['dest_user@domain'],'@')
auth = auth.withColumn('dest_user',auth_dest_split.getItem(0))
auth = auth.withColumn('dest_dmn',auth_dest_split.getItem(1))

auth = auth.drop('src_user@domain','dest_user@domain')

auth = auth.withColumn('type',F.lit('Auth'))

In [9]:
redteam_split = F.split(redteam['user@domain'],'@')
redteam = redteam.withColumn('src_user',redteam_split.getItem(0))
redteam = redteam.withColumn('src_dmn',redteam_split.getItem(1))

redteam = redteam.drop('user@domain')

redteam = redteam.withColumn('type',F.lit('RedTeam'))

In [10]:

flows = flows.withColumn('avg_pkt_size', (flows['byte_cnt']/flows['pkt_cnt']).cast(DoubleType()))
flows = flows.na.drop(how='all')

flows = flows.withColumn('type',F.lit('DataFlow'))

# Test/Train split

In [11]:
split_days = 29
split_range = split_days * 3600 * 24

In [12]:
#Split data into train/test segments. This will be done on the first number of days

redteam = redteam.filter(redteam.time <= split_range)

dns = dns.filter(dns.time <= split_range)

proc = proc.filter(proc.time <= split_range)

flows = flows.filter(flows.time <= split_range)

auth = auth.filter(auth.time <= split_range)

In [13]:
days = 20
train_range = days * 3600 * 24

In [14]:
#Split data into train/test segments. This will be done on the first number of days

redteam_test = redteam.filter(redteam.time > train_range)
redteam_train = redteam.filter(redteam.time <= train_range)

dns_test = dns.filter(dns.time > train_range)
dns_train = dns.filter(dns.time <= train_range)

proc_test = proc.filter(proc.time > train_range)
proc_train = proc.filter(proc.time <= train_range)

flows_test = flows.filter(flows.time > train_range)
flows_train = flows.filter(flows.time <= train_range)

auth_test = auth.filter(auth.time > train_range)
auth_train = auth.filter(auth.time <= train_range)

In [19]:
# colum = proc.columns
# colum.sort()

# proc = proc.select(colum)
# redteam = redteam.select(colum)
# auth = auth.select(colum)
# flows = flows.select(colum)

In [20]:
# master = unionAll(redteam,auth,proc,flows)

In [21]:
# print(master.count())

In [22]:
# master.select('strt').sort('strt').show()

This code was supposed to transform the time column from an int into a datetime data type

In [23]:
# redteam1 = redteam.select(F.to_date(redteam.time,'MM-dd HH:mm:ss').alias('date')).collect()
#redteam1 = redteam.rdd.map(lambda x: (x['time'], time.strftime('%m/%d %H:%M:%S', time.gmtime(x['time']) ))).toDF(['time','timestam'])

# Proc Data

In [15]:
proc_train.show(5)

+----+--------+---------+-----+--------+-------+-------+
|time|src_comp|proc_name| strt|src_user|src_dmn|   type|
+----+--------+---------+-----+--------+-------+-------+
|   1|      C1|      P16|Start|     C1$|   DOM1|Process|
|   1|   C1001|       P4|Start|  C1001$|   DOM1|Process|
|   1|   C1002|       P4|Start|  C1002$|   DOM1|Process|
|   1|   C1004|       P4|Start|  C1004$|   DOM1|Process|
|   1|   C1017|       P4|Start|  C1017$|   DOM1|Process|
+----+--------+---------+-----+--------+-------+-------+
only showing top 5 rows



In [16]:
proc_train.show(5)

+----+--------+---------+-----+--------+-------+-------+
|time|src_comp|proc_name| strt|src_user|src_dmn|   type|
+----+--------+---------+-----+--------+-------+-------+
|   1|      C1|      P16|Start|     C1$|   DOM1|Process|
|   1|   C1001|       P4|Start|  C1001$|   DOM1|Process|
|   1|   C1002|       P4|Start|  C1002$|   DOM1|Process|
|   1|   C1004|       P4|Start|  C1004$|   DOM1|Process|
|   1|   C1017|       P4|Start|  C1017$|   DOM1|Process|
+----+--------+---------+-----+--------+-------+-------+
only showing top 5 rows



## Auth Data

In [17]:
auth_train.show(5)

+----+--------+---------+---------+----------+-----------+-------+---------------+-------+---------------+--------+----+
|time|src_comp|dest_comp|auth_type|logon_type|auth_orient|success|       src_user|src_dmn|      dest_user|dest_dmn|type|
+----+--------+---------+---------+----------+-----------+-------+---------------+-------+---------------+--------+----+
|   1|   C1250|     C586|     NTLM|   Network|      LogOn|Success|ANONYMOUS LOGON|   C586|ANONYMOUS LOGON|    C586|Auth|
|   1|    C586|     C586|        ?|   Network|     LogOff|Success|ANONYMOUS LOGON|   C586|ANONYMOUS LOGON|    C586|Auth|
|   1|    C988|     C988|        ?|   Network|     LogOff|Success|          C101$|   DOM1|          C101$|    DOM1|Auth|
|   1|   C1020|    C1020|Negotiate|   Service|      LogOn|Success|         C1020$|   DOM1|         SYSTEM|   C1020|Auth|
|   1|   C1021|     C625| Kerberos|   Network|      LogOn|Success|         C1021$|   DOM1|         C1021$|    DOM1|Auth|
+----+--------+---------+-------

In [18]:
failed_logon = auth_train.filter(auth_train.success=='Fail').groupby(auth_train.time,auth_train.src_comp).count().na.fill(0).sort('time')
failed_logon = failed_logon.select(F.col('time'),F.col('src_comp'),F.col('count').alias('fail_count'))

In [19]:
failed_logon.show(5)

+----+--------+----------+
|time|src_comp|fail_count|
+----+--------+----------+
|   1|   C2742|         1|
|   1|   C1730|         1|
|   1|   C2800|         1|
|   1|   C1654|         1|
|   1|    C457|         1|
+----+--------+----------+
only showing top 5 rows



In [28]:

#master.groupBy('time').count().show()

# failed_user_logon = auth_train.filter(auth_train.success=='Fail').groupby(auth_train.time,auth_train.src_user,auth_train.auth_type,auth_train.logon_type).count().na.fill(0).sort('time')
# failed_user_logon = failed_user_logon.select(F.col('time'),F.col('src_user'),F.col('auth_type'),F.col('logon_type'),F.col('count').alias('fail_count'))

In [29]:
# tgt_type =  auth.filter(auth.auth_orient=='TGT').groupby(auth.time).count().na.fill(0).sort('time')
# tgs_type =  auth.filter(auth.auth_orient=='TGS').groupby(auth.time).count().na.fill(0).sort('time')

In [30]:
# process_start = proc.groupby(proc.time).agg(F.when)
# process_start = process_start.a
# agg().sort('time')
# #process_start = process_start.select(F.col('time'),F.col('strt'),F.col('count').alias('proc_change'))

In [31]:
auth_train.show(5)

+----+--------+---------+---------+----------+-----------+-------+---------------+-------+---------------+--------+----+
|time|src_comp|dest_comp|auth_type|logon_type|auth_orient|success|       src_user|src_dmn|      dest_user|dest_dmn|type|
+----+--------+---------+---------+----------+-----------+-------+---------------+-------+---------------+--------+----+
|   1|   C1250|     C586|     NTLM|   Network|      LogOn|Success|ANONYMOUS LOGON|   C586|ANONYMOUS LOGON|    C586|Auth|
|   1|    C586|     C586|        ?|   Network|     LogOff|Success|ANONYMOUS LOGON|   C586|ANONYMOUS LOGON|    C586|Auth|
|   1|    C988|     C988|        ?|   Network|     LogOff|Success|          C101$|   DOM1|          C101$|    DOM1|Auth|
|   1|   C1020|    C1020|Negotiate|   Service|      LogOn|Success|         C1020$|   DOM1|         SYSTEM|   C1020|Auth|
|   1|   C1021|     C625| Kerberos|   Network|      LogOn|Success|         C1021$|   DOM1|         C1021$|    DOM1|Auth|
+----+--------+---------+-------

# Redteam Data

In [32]:
redteam_train.show(5)

+------+--------+---------+--------+-------+-------+
|  time|src_comp|dest_comp|src_user|src_dmn|   type|
+------+--------+---------+--------+-------+-------+
|150885|  C17693|    C1003|    U620|   DOM1|RedTeam|
|151036|  C17693|     C305|    U748|   DOM1|RedTeam|
|151648|  C17693|     C728|    U748|   DOM1|RedTeam|
|151993|  C17693|    C1173|   U6115|   DOM1|RedTeam|
|153792|  C17693|     C294|    U636|   DOM1|RedTeam|
+------+--------+---------+--------+-------+-------+
only showing top 5 rows



In [22]:
redteam_event = redteam_train.groupby(redteam_train.time, redteam_train.src_comp).count().na.fill(0).sort('time')
redteam_event = redteam_event.select(F.col('time'),F.col('src_comp'),F.col('count').alias('redteam_event'))
# redteam_event = redteam_event.withColumn('redteam_event',\
#                                         F.when(redteam_event['redteam_event']>1,2).otherwise(redteam_event['redteam_event']))

In [62]:
redteam_event.show(5)

+------+--------+-------------+
|  time|src_comp|redteam_event|
+------+--------+-------------+
|150885|  C17693|            1|
|151036|  C17693|            1|
|151648|  C17693|            1|
|151993|  C17693|            1|
|153792|  C17693|            1|
+------+--------+-------------+
only showing top 5 rows



# Flows Data

In [15]:
flows_train.show(5)

+----+---+--------+--------+---------+---------+--------+-------+--------+------------+--------+
|time|dur|src_comp|src_port|dest_comp|dest_port|protocol|pkt_cnt|byte_cnt|avg_pkt_size|    type|
+----+---+--------+--------+---------+---------+--------+-------+--------+------------+--------+
|   1|  0|   C1065|     389|    C3799|   N10451|       6|     10|    5323|       532.3|DataFlow|
|   1|  0|   C1423|   N1136|    C1707|       N1|       6|      5|     847|       169.4|DataFlow|
|   1|  0|   C1423|   N1142|    C1707|       N1|       6|      5|     847|       169.4|DataFlow|
|   1|  0|  C14909|   N8191|    C5720|     2049|       6|      1|      52|        52.0|DataFlow|
|   1|  0|  C14909|   N8192|    C5720|     2049|       6|      1|      52|        52.0|DataFlow|
+----+---+--------+--------+---------+---------+--------+-------+--------+------------+--------+
only showing top 5 rows



In [21]:
flows_data = flows_train.select('time','dur','src_comp','pkt_cnt','byte_cnt','avg_pkt_size')\
                        .groupby('time','src_comp').agg(F.sum('dur').alias('dur'),F.sum('pkt_cnt').alias('pkt_cnt'),F.sum('byte_cnt').alias('byte_cnt'),F.avg('avg_pkt_size').alias('avg_pkt_size')).sort('time')

In [20]:
# flows_data.show(10)

In [35]:
flows_train.show(5)

+----+---+--------+--------+---------+---------+--------+-------+--------+------------+--------+
|time|dur|src_comp|src_port|dest_comp|dest_port|protocol|pkt_cnt|byte_cnt|avg_pkt_size|    type|
+----+---+--------+--------+---------+---------+--------+-------+--------+------------+--------+
|   1|  0|   C1065|     389|    C3799|   N10451|       6|     10|    5323|       532.3|DataFlow|
|   1|  0|   C1423|   N1136|    C1707|       N1|       6|      5|     847|       169.4|DataFlow|
|   1|  0|   C1423|   N1142|    C1707|       N1|       6|      5|     847|       169.4|DataFlow|
|   1|  0|  C14909|   N8191|    C5720|     2049|       6|      1|      52|        52.0|DataFlow|
|   1|  0|  C14909|   N8192|    C5720|     2049|       6|      1|      52|        52.0|DataFlow|
+----+---+--------+--------+---------+---------+--------+-------+--------+------------+--------+
only showing top 5 rows



# Combine Data

In [36]:
# stringIndexer = StringIndexer(inputCol='dest_comp', outputCol='successType')
# model = stringIndexer.fit(redteam)
# indexed = model.transform(redteam)

# encoder = OneHotEncoder(inputCol='successType', outputCol='successVec')
# encoded = encoder.transform(indexed)
# encoded.show()



This section I tried transforming the dataframes into pandas dataframes. This works, sort of, but it is very slow.

In [37]:
# redteam_pd_train = redteam_train.toPandas()
# redteam_pd_train['time'] = pd.to_datetime(redteam_pd_train['time'],unit='s')

In [38]:
# # failed_logon_pd_train = failed_logon.toPandas()

# # failed_user_logon_pd_train = failed_user_logon.toPandas()

# failed_user_logon_pd_train['time'] = pd.to_datetime(failed_user_logon_pd_train['time'],unit='s')

# failed_user_logon_pd_train_resamp = failed_user_logon_pd_train.groupby('src_user').apply(lambda x: x.set_index('time').resample('1Min').sum()).reset_index()

# redteam_pd_user_train = redteam_pd_train.groupby('src_user').apply(lambda x: x.set_index('time').resample('1Min').sum()).reset_index()

# combined_file = pd.merge(redteam_pd_train, failed_logon_pd_train, on=['time','src_comp'], how='outer')

# inner_join = pd.merge(redteam_pd_train, failed_logon_pd_train, on=['time','src_comp'], how='inner')

# inner_join.shape

# inner_user_join = pd.merge(redteam_pd_train, failed_user_logon_pd_train, on=['time','src_user'], how='inner')

# inner_user_join.shape

# failed_logon_pd_train.shape

# redteam_pd_train.shape

In [18]:
batch_size = 300
batches = train_range/batch_size
switch = 0
for i in range(int(2)):
    #Here we are breaking the data up into seperate segments
    subset_logon = failed_logon.filter((failed_logon.time<=(batch_size*i)) & (failed_logon.time>(batch_size*(i-1))))
    subset_redteam = redteam_event.filter((redteam_event.time<=(batch_size*i)) & (redteam_event.time>(batch_size*(i-1))))
    subset_flows_data = flows_data.filter((flows_data.time<=(batch_size*i)) & (flows_data.time>(batch_size*(i-1))))
    

    #Here we join the data
    temp_subset = subset_redteam.join(subset_logon,['time','src_comp'],'outer')
    temp_subset = temp_subset.join(subset_flows_data,['time','src_comp'],'outer')
    temp_subset = temp_subset.na.fill(0).sort('time')
    
    temp_subset = temp_subset.groupby('src_comp').agg(F.min('time').alias('time')\
                                                     ,F.sum('redteam_event').alias('redteam_event')\
                                                     ,F.sum('dur').alias('dur')\
                                                     ,F.sum('fail_count').alias('fail_count')\
                                                     ,F.sum('pkt_cnt').alias('pkt_cnt')\
                                                     ,F.sum('byte_cnt').alias('byte_cnt')\
                                                     ,F.avg('avg_pkt_size').alias('avg_pkt_size'))\
                                                .sort('time','src_comp')
    
    temp_subset = temp_subset.withColumn('redteam_event',\
                                        F.when(temp_subset['redteam_event']>1,2).otherwise(temp_subset['redteam_event']))
    
    if switch == 0:
        master_subset = temp_subset
        switch = 1
    else:
        master_subset = master_subset.union(temp_subset)
        

In [83]:
# master_event_subset = subset_redteam.join(subset_logon,(subset_redteam.time==subset_logon.time)&(subset_redteam.src_comp==subset_logon.src_comp),
#                                   'right').na.fill(0)

In [19]:
master_subset.show()

+--------+----+-------------+----+----------+-------+---------+------------------+
|src_comp|time|redteam_event| dur|fail_count|pkt_cnt| byte_cnt|      avg_pkt_size|
+--------+----+-------------+----+----------+-------+---------+------------------+
|   C1065|   1|            0| 815|         4|   9273|  1963507| 205.1584844792757|
|   C1423|   1|            0|   0|         0|     12|     1786| 87.13333333333333|
|  C14909|   1|            0|   0|         0|      3|      156|              52.0|
|   C1654|   1|            0|  10|         2|     14|     4356|178.42857142857142|
|   C1692|   1|            0|   0|         7|      0|        0|               0.0|
|   C1707|   1|            0|1004|         0| 530280|751017952| 443.2850705018127|
|   C1730|   1|            0|   0|         1|      0|        0|               0.0|
|   C1799|   1|            0|  15|         0|      6|     1222|203.66666666666666|
|   C1846|   1|            0|   0|         2|      0|        0|               0.0|
|   

# Model creation

In [20]:
X = master_subset.select([c for c in master_subset.columns if c not in {'time','src_comp','redteam_event'}])
y = master_subset.select('redteam_event')

In [107]:
X.show()

+----+----------+-------+---------+------------------+
| dur|fail_count|pkt_cnt| byte_cnt|      avg_pkt_size|
+----+----------+-------+---------+------------------+
| 815|         4|   9273|  1963507| 205.1584844792757|
|   0|         0|     12|     1786| 87.13333333333333|
|   0|         0|      3|      156|              52.0|
|  10|         2|     14|     4356|178.42857142857142|
|   0|         7|      0|        0|               0.0|
|1004|         0| 530280|751017952| 443.2850705018127|
|   0|         1|      0|        0|               0.0|
|  15|         0|      6|     1222|203.66666666666666|
|   0|         2|      0|        0|               0.0|
|   0|         2|      0|        0|               0.0|
|   0|         1|      0|        0|               0.0|
|  54|         0|     18|      864|              48.0|
| 125|         0|     42|     2016|              48.0|
|  80|         0|     39|     6871|147.92857142857142|
|  11|         0|     30|     7434|207.58854166666669|
| 309|    

## Data Analysis

In [5]:
# proc_domains = proc.select('domain').distinct()
# proc_users = proc.select('user').distinct()

In [13]:
# proc_domains.coalesce(1).write.csv('domains.csv')
# proc_users.coalesce(1).write.csv('users.csv')

In [60]:
# proc.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in proc.columns)).show()

+----+-----------+---------+-----+
|time|user@domain|proc_name|start|
+----+-----------+---------+-----+
|   0|          0|        0|    0|
+----+-----------+---------+-----+



In [58]:
# flows.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in flows.columns)).show()

+----+--------+--------+-------+---------+--------+--------+-------+-------+
|time|duration|src_comp|src_prt|dest_comp|dest_prt|protocol|pkt_cnt|byt_cnt|
+----+--------+--------+-------+---------+--------+--------+-------+-------+
| 530|     530|     530|    530|      530|     530|     530|    530|    530|
+----+--------+--------+-------+---------+--------+--------+-------+-------+



In [77]:
auth.select('success').distinct().show()

+-------+
|success|
+-------+
|Success|
|   Fail|
+-------+



In [59]:
auth.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in auth.columns)).show()

+----+---------------+----------------+--------+---------+---------+----------+----------------+-------+
|time|src_user@domain|dest_user@domain|src_comp|dest_comp|auth_type|logon_type|auth_orientation|success|
+----+---------------+----------------+--------+---------+---------+----------+----------------+-------+
|   0|              0|               0|       0|        0|        0|         0|               0|      0|
+----+---------------+----------------+--------+---------+---------+----------+----------------+-------+



In [32]:
flows.orderBy("avg_pkt_size").show()

+-----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
| time|duration|src_comp|src_prt|dest_comp|dest_prt|protocol|pkt_cnt|byt_cnt|avg_pkt_size|
+-----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|41972|       0|   C1654|     80|   C13742|   N4427|       6|      2|     92|        46.0|
|41974|      38|   C1015|   N221|    C8681|   N2153|       6|      2|     92|        46.0|
|41972|       0|   C7632|   N294|   C20510|  N18962|       6|      1|     46|        46.0|
|41970|       0|   C8974|  N4126|    C5787|  N30556|       6|      1|     46|        46.0|
|41972|      38|   C1015|   N221|    C8964|   N2024|       6|      2|     92|        46.0|
|41971|       0|  C14402|  N9113|     C585|     139|       6|      1|     46|        46.0|
|41972|      60|  C11149|  N2801|    C2588|     N76|       6|      4|    184|        46.0|
|41971|       0|   C3873|     80|    C3959|   N3771|       6|      2|     92|        46.0|

In [49]:
flows.select([count(when(isnan(c)|col(c).isNull(), c)).alias(c) for c in flows.columns]).show()

+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|time|duration|src_comp|src_prt|dest_comp|dest_prt|protocol|pkt_cnt|byt_cnt|avg_pkt_size|
+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|   0|       0|       0|      0|        0|       0|       0|      0|      0|           0|
+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+



In [54]:
dns.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in dns.columns)).show()

+----+--------+------------+
|time|src_comp|cmp_resolved|
+----+--------+------------+
|   0|       0|           0|
+----+--------+------------+



In [10]:
proc.show()

+----+-----------+-----+---------+-----+
|time|user@domain| comp|proc_name|start|
+----+-----------+-----+---------+-----+
|   1|   C1$@DOM1|   C1|      P16|Start|
|   1|C1001$@DOM1|C1001|       P4|Start|
|   1|C1002$@DOM1|C1002|       P4|Start|
|   1|C1004$@DOM1|C1004|       P4|Start|
|   1|C1017$@DOM1|C1017|       P4|Start|
|   1|C1018$@DOM1|C1018|       P4|Start|
|   1|C1020$@DOM1|C1020|       P3|Start|
|   1|C1020$@DOM1|C1020|       P4|Start|
|   1|C1028$@DOM1|C1028|      P16|  End|
|   1|C1029$@DOM1|C1029|       P4|Start|
|   1|C1030$@DOM1|C1030|       P4|Start|
|   1|C1032$@DOM1|C1032|       P4|Start|
|   1|C1035$@DOM1|C1035|      P37|Start|
|   1|C1035$@DOM1|C1035|       P5|Start|
|   1|C1051$@DOM1|C1051|      P16|Start|
|   1|C1069$@DOM1|C1069|       P3|Start|
|   1|C1069$@DOM1|C1069|       P4|Start|
|   1|C1079$@DOM1|C1079|       P4|Start|
|   1|C1084$@DOM1|C1084|       P4|Start|
|   1|C1088$@DOM1|C1088|       P4|Start|
+----+-----------+-----+---------+-----+
only showing top

In [20]:
redteam.printSchema()

root
 |-- time: date (nullable = true)
 |-- user@domain: string (nullable = true)
 |-- src_comp: string (nullable = true)
 |-- dst_comp: string (nullable = true)



In [6]:
from pyspark.sql.functions import lit, unix_timestamp
start = datetime.date(2018,1,1)
#datetime.timestamp(2018,1,1,12,0,0)



In [7]:
datetime.datetime.fromtimestamp(time.mktime(start.timetuple()) + 228150)

datetime.datetime(2018, 1, 3, 15, 22, 30)

In [8]:
func = udf (lambda x: datetime.datetime.fromtimestamp(time.mktime(start.timetuple()) + x).date(),DateType() )

In [9]:
redteam1 = redteam.withColumn("timestam", redteam.select("time")),'yyyy-MM-dd HH:mm:ss').cast("timestamp") )

SyntaxError: invalid syntax (<ipython-input-9-48ac7615f935>, line 1)

In [10]:
import time

time_update = udf(lambda x: time.strftime('%m/%d %H:%M:%S', time.gmtime(x)))

#timestam = time.strftime('%m/%d %H:%M:%S', time.gmtime(redteam.select("time")))

# Create Classification model

In [24]:
cv = StratifiedKFold(n_splits=6)
classifier = xgb.XGBClassifier()

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
for train, test in cv.split(X, y):
    print('round ',i+1)
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

NameError: name 'StratifiedKFold' is not defined

# Classify Test Data

# Interpret Features