In [23]:
import pyspark as spark
import datetime, time
import pandas as pd
from functools import reduce
from pyspark.sql.types import (StructField, StringType,FloatType, 
                               DoubleType, IntegerType, StructType,
                              DateType)
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as spDataFrame
from pyspark.ml.feature import Binarizer, OneHotEncoder, StringIndexer
import os
#from tokenizer import Tokenizer

In [2]:
def unionAll(*dfs):
    return reduce(spDataFrame.unionAll, dfs)

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName('Python Spark SQL basic example') \
        .config('spark.some.config.option','some-value') \
        .getOrCreate()

## Import DATA

In [4]:
proc_data_schema = [StructField('time',IntegerType(),True),
                   StructField('user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('proc_name',StringType(),True),
                   StructField('strt',StringType())]
proc_final_struc = StructType(fields = proc_data_schema)
proc = spark.read.csv('/users7/csegrad/smillett/capstone/Dataset/proc.txt',schema=proc_final_struc)

auth_data_schema = [StructField('time',IntegerType(),True),
                   StructField('src_user@domain',StringType(),True),
                   StructField('dest_user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('dest_comp',StringType(),True),
                   StructField('auth_type',StringType(),True),
                   StructField('logon_type',StringType(),True),
                   StructField('auth_orient',StringType(),True),
                   StructField('success',StringType(),True)]
auth_final_struc = StructType(fields = auth_data_schema)
auth = spark.read.csv('/users7/csegrad/smillett/capstone/Dataset/auth.txt',schema=auth_final_struc )

flows_data_schema = [StructField('time',IntegerType(),False),
                   StructField('dur',IntegerType(),False),
                   StructField('src_comp',StringType(),False),
                   StructField('src_port',StringType(),False),
                   StructField('dest_comp',StringType(),False),
                   StructField('dest_port',StringType(),False),
                   StructField('protocol',StringType(),False),
                   StructField('pkt_cnt',IntegerType(),False),
                   StructField('byte_cnt',IntegerType(),False)]
flows_final_struc = StructType(fields = flows_data_schema)
flows = spark.read.csv('/users7/csegrad/smillett/capstone/Dataset/flows.txt',schema=flows_final_struc )

dns_data_schema = [StructField('time',IntegerType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('cmp_resolved',StringType(),True)]
dns_final_struc = StructType(fields = dns_data_schema)
dns = spark.read.csv('/users7/csegrad/smillett/capstone/Dataset/dns.txt',schema=dns_final_struc)

redteam_data_schema = [StructField('time',IntegerType(),True),
                   StructField('user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('dest_comp',StringType(),True)]
redteam_final_struc = StructType(fields = redteam_data_schema)
redteam = spark.read.csv('/users7/csegrad/smillett/capstone/Dataset/redteam.txt',schema=redteam_final_struc)

## Transform Data

In [5]:
proc_split = F.split(proc['user@domain'],'@')
proc = proc.withColumn('src_user',proc_split.getItem(0))
proc = proc.withColumn('src_dmn',proc_split.getItem(1))
proc = proc.drop('user@domain')

proc = proc.withColumn('type',F.lit('Process'))

In [6]:
auth_src_split = F.split(auth['src_user@domain'],'@')
auth = auth.withColumn('src_user',auth_src_split.getItem(0))
auth = auth.withColumn('src_dmn',auth_src_split.getItem(1))

auth_dest_split = F.split(auth['dest_user@domain'],'@')
auth = auth.withColumn('dest_user',auth_dest_split.getItem(0))
auth = auth.withColumn('dest_dmn',auth_dest_split.getItem(1))

auth = auth.drop('src_user@domain','dest_user@domain')

auth = auth.withColumn('type',F.lit('Auth'))

In [7]:
redteam_split = F.split(redteam['user@domain'],'@')
redteam = redteam.withColumn('src_user',redteam_split.getItem(0))
redteam = redteam.withColumn('src_dmn',redteam_split.getItem(1))

redteam = redteam.drop('user@domain')

redteam = redteam.withColumn('type',F.lit('RedTeam'))

In [8]:

flows = flows.withColumn('avg_pkt_size', (flows['byte_cnt']/flows['pkt_cnt']).cast(DoubleType()))
flows = flows.na.drop(how='all')

flows = flows.withColumn('type',F.lit('DataFlow'))

In [30]:
# colum = proc.columns
# colum.sort()

# proc = proc.select(colum)
# redteam = redteam.select(colum)
# auth = auth.select(colum)
# flows = flows.select(colum)

In [31]:
# master = unionAll(redteam,auth,proc,flows)

In [32]:
# print(master.count())

In [33]:
# master.select('strt').sort('strt').show()

In [None]:
redteam1 = redteam.rdd.map(lambda x: (x['time'], time.strftime('%m/%d %H:%M:%S', time.gmtime(x['time']) ))).toDF(['time','timestam'])

## Data Show

In [34]:
auth.show()

+----+--------+---------+---------+----------+-----------+-------+---------------+-------+---------------+--------+----+
|time|src_comp|dest_comp|auth_type|logon_type|auth_orient|success|       src_user|src_dmn|      dest_user|dest_dmn|type|
+----+--------+---------+---------+----------+-----------+-------+---------------+-------+---------------+--------+----+
|   1|   C1250|     C586|     NTLM|   Network|      LogOn|Success|ANONYMOUS LOGON|   C586|ANONYMOUS LOGON|    C586|Auth|
|   1|    C586|     C586|        ?|   Network|     LogOff|Success|ANONYMOUS LOGON|   C586|ANONYMOUS LOGON|    C586|Auth|
|   1|    C988|     C988|        ?|   Network|     LogOff|Success|          C101$|   DOM1|          C101$|    DOM1|Auth|
|   1|   C1020|    C1020|Negotiate|   Service|      LogOn|Success|         C1020$|   DOM1|         SYSTEM|   C1020|Auth|
|   1|   C1021|     C625| Kerberos|   Network|      LogOn|Success|         C1021$|   DOM1|         C1021$|    DOM1|Auth|
|   1|   C1035|     C586| Kerber

In [35]:
redteam.show()

+------+--------+---------+--------+-------+-------+
|  time|src_comp|dest_comp|src_user|src_dmn|   type|
+------+--------+---------+--------+-------+-------+
|150885|  C17693|    C1003|    U620|   DOM1|RedTeam|
|151036|  C17693|     C305|    U748|   DOM1|RedTeam|
|151648|  C17693|     C728|    U748|   DOM1|RedTeam|
|151993|  C17693|    C1173|   U6115|   DOM1|RedTeam|
|153792|  C17693|     C294|    U636|   DOM1|RedTeam|
|155219|  C17693|    C5693|    U748|   DOM1|RedTeam|
|155399|  C17693|     C152|    U748|   DOM1|RedTeam|
|155460|  C17693|    C2341|    U748|   DOM1|RedTeam|
|155591|  C17693|     C332|    U748|   DOM1|RedTeam|
|156658|  C17693|    C4280|    U748|   DOM1|RedTeam|
|210086|  C18025|    C1493|    U748|   DOM1|RedTeam|
|210294|  C18025|    C1493|    U748|   DOM1|RedTeam|
|210312|  C18025|    C1493|    U748|   DOM1|RedTeam|
|218418|  C17693|     C504|    U748|   DOM1|RedTeam|
|227052|  C17693|     C148|    U748|   DOM1|RedTeam|
|227408|  C17693|     C148|    U748|   DOM1|Re

In [36]:
dns.filter((dns['src_comp']=='C17693')).show()

+-------+--------+------------+
|   time|src_comp|cmp_resolved|
+-------+--------+------------+
|2289058|  C17693|       C5808|
|2289207|  C17693|       C5808|
|2289208|  C17693|       C5808|
|2289209|  C17693|       C5808|
|2289210|  C17693|       C5808|
|2289212|  C17693|       C5808|
|2289213|  C17693|       C5808|
|2289216|  C17693|       C5808|
|2289217|  C17693|       C5808|
|2289218|  C17693|       C5808|
|2289219|  C17693|       C5808|
|2289239|  C17693|       C5808|
|2289240|  C17693|       C5808|
|2289241|  C17693|       C5808|
|2289243|  C17693|       C5808|
|2289264|  C17693|       C5808|
|2289266|  C17693|       C5808|
|2289269|  C17693|       C5808|
|2296865|  C17693|      C17679|
|2296867|  C17693|      C17679|
+-------+--------+------------+
only showing top 20 rows



In [14]:
flows.show()

+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|time|duration|src_comp|src_prt|dest_comp|dest_prt|protocol|pkt_cnt|byt_cnt|avg_pkt_size|
+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|   1|       0|   C1065|    389|    C3799|  N10451|       6|     10|   5323|      532.30|
|   1|       0|   C1423|  N1136|    C1707|      N1|       6|      5|    847|      169.40|
|   1|       0|   C1423|  N1142|    C1707|      N1|       6|      5|    847|      169.40|
|   1|       0|  C14909|  N8191|    C5720|    2049|       6|      1|     52|       52.00|
|   1|       0|  C14909|  N8192|    C5720|    2049|       6|      1|     52|       52.00|
|   1|       0|  C14909|  N8193|    C5720|    2049|       6|      1|     52|       52.00|
|   1|       0|   C1707|     N1|    C1423|   N1136|       6|      4|    414|      103.50|
|   1|       0|   C1707|     N1|    C1423|   N1142|       6|      4|    413|      103.25|
|   1|    

In [15]:

#master.groupBy('time').count().show()

failed_logon = auth.filter(auth.success=='Fail').groupby(auth.time,auth.auth_type,auth.logon_type).count().na.fill(0).sort('time')
failed_logon = failed_logon.select(F.col('time'),F.col('auth_type'),F.col('logon_type'),F.col('count').alias('fail_count'))

In [16]:
failed_logon.show()

+----+---------+--------------+----------+
|time|auth_type|    logon_type|fail_count|
+----+---------+--------------+----------+
|   1|        ?|             ?|         7|
|   1|Negotiate|         Batch|         1|
|   2|        ?|             ?|         2|
|   3|Negotiate|         Batch|         1|
|   3|        ?|             ?|         1|
|   4|Negotiate|         Batch|         1|
|   4|        ?|             ?|         1|
|   5|Negotiate|         Batch|         1|
|   6|        ?|             ?|         1|
|   7|Negotiate|         Batch|         1|
|   8|Negotiate|         Batch|         1|
|  11|     NTLM|       Network|         1|
|  12|        ?|             ?|         1|
|  13|        ?|             ?|         1|
|  14|     NTLM|       Network|         1|
|  15|Negotiate|         Batch|         1|
|  16|        ?|             ?|         1|
|  23|Negotiate|         Batch|         1|
|  25|Negotiate|NewCredentials|         1|
|  25|        ?|             ?|         2|
+----+-----

In [38]:
tgt_type =  auth.filter(auth.auth_orient=='TGT').groupby(auth.time).count().na.fill(0).sort('time')
tgs_type =  auth.filter(auth.auth_orient=='TGS').groupby(auth.time).count().na.fill(0).sort('time')

In [18]:
process_start = proc.groupby(proc.time).agg(F.when)
process_start = process_start.a
agg().sort('time')
#process_start = process_start.select(F.col('time'),F.col('strt'),F.col('count').alias('proc_change'))

In [29]:
stringIndexer = StringIndexer(inputCol='dest_comp', outputCol='successType')
model = stringIndexer.fit(redteam)
indexed = model.transform(redteam)

encoder = OneHotEncoder(inputCol='successType', outputCol='successVec')
encoded = encoder.transform(indexed)
encoded.show()


+------+--------+---------+--------+-------+-------+-----------+-----------------+
|  time|src_comp|dest_comp|src_user|src_dmn|   type|successType|       successVec|
+------+--------+---------+--------+-------+-------+-----------+-----------------+
|150885|  C17693|    C1003|    U620|   DOM1|RedTeam|       69.0| (300,[69],[1.0])|
|151036|  C17693|     C305|    U748|   DOM1|RedTeam|      296.0|(300,[296],[1.0])|
|151648|  C17693|     C728|    U748|   DOM1|RedTeam|      214.0|(300,[214],[1.0])|
|151993|  C17693|    C1173|   U6115|   DOM1|RedTeam|      294.0|(300,[294],[1.0])|
|153792|  C17693|     C294|    U636|   DOM1|RedTeam|       25.0| (300,[25],[1.0])|
|155219|  C17693|    C5693|    U748|   DOM1|RedTeam|      240.0|(300,[240],[1.0])|
|155399|  C17693|     C152|    U748|   DOM1|RedTeam|      293.0|(300,[293],[1.0])|
|155460|  C17693|    C2341|    U748|   DOM1|RedTeam|      217.0|(300,[217],[1.0])|
|155591|  C17693|     C332|    U748|   DOM1|RedTeam|      182.0|(300,[182],[1.0])|
|156

In [27]:
redteam.show()

+------+--------+---------+--------+-------+-------+
|  time|src_comp|dest_comp|src_user|src_dmn|   type|
+------+--------+---------+--------+-------+-------+
|150885|  C17693|    C1003|    U620|   DOM1|RedTeam|
|151036|  C17693|     C305|    U748|   DOM1|RedTeam|
|151648|  C17693|     C728|    U748|   DOM1|RedTeam|
|151993|  C17693|    C1173|   U6115|   DOM1|RedTeam|
|153792|  C17693|     C294|    U636|   DOM1|RedTeam|
|155219|  C17693|    C5693|    U748|   DOM1|RedTeam|
|155399|  C17693|     C152|    U748|   DOM1|RedTeam|
|155460|  C17693|    C2341|    U748|   DOM1|RedTeam|
|155591|  C17693|     C332|    U748|   DOM1|RedTeam|
|156658|  C17693|    C4280|    U748|   DOM1|RedTeam|
|210086|  C18025|    C1493|    U748|   DOM1|RedTeam|
|210294|  C18025|    C1493|    U748|   DOM1|RedTeam|
|210312|  C18025|    C1493|    U748|   DOM1|RedTeam|
|218418|  C17693|     C504|    U748|   DOM1|RedTeam|
|227052|  C17693|     C148|    U748|   DOM1|RedTeam|
|227408|  C17693|     C148|    U748|   DOM1|Re

In [33]:
auth.select('time','auth_orient','success').show()

+----+-----------+-------+
|time|auth_orient|success|
+----+-----------+-------+
|   1|      LogOn|Success|
|   1|     LogOff|Success|
|   1|     LogOff|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
|   1|     LogOff|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
|   1|     LogOff|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
|   1|     LogOff|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
|   1|     LogOff|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
|   1|      LogOn|Success|
+----+-----------+-------+
only showing top 20 rows



In [None]:
stringIndexer = StringIndexer(inputCol='strt', outputCol='successType')
model = stringIndexer.fit(proc)
indexed = model.transform(proc)

encoder = OneHotEncoder(inputCol='successType', outputCol='successVec')
encoded = encoder.transform(indexed)
encoded.show()

In [12]:
master_event = process_start.join(failed_logon, 'time','left')

In [19]:
process_start.show()

+----+-----+-----------+
|time| strt|proc_change|
+----+-----+-----------+
|   1|Start|        422|
|   1|  End|         24|
|   2|  End|         44|
|   2|Start|       1451|
|   3|  End|         42|
|   3|Start|       1555|
|   4|  End|         46|
|   4|Start|        723|
|   5|Start|        121|
|   5|  End|         38|
|   6|Start|         62|
|   6|  End|         39|
|   7|  End|         32|
|   7|Start|         45|
|   8|  End|          8|
|   8|Start|         36|
|   9|Start|         46|
|   9|  End|         20|
|  10|  End|          8|
|  10|Start|         30|
+----+-----+-----------+
only showing top 20 rows



In [13]:
master_event.show()

Py4JJavaError: An error occurred while calling o182.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 8 (showString at NativeMethodAccessorImpl.java:0) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException: Error in opening FileSegmentManagedBuffer{file=/tmp/blockmgr-621b0d97-28fd-4397-af15-9dff00553591/13/shuffle_0_87_0.data, offset=158734, length=2854} 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:528) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:423) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:62) 	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434) 	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:30) 	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source) 	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) 	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125) 	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) 	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) 	at org.apache.spark.scheduler.Task.run(Task.scala:109) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) 	at java.lang.Thread.run(Thread.java:745) Caused by: java.io.IOException: Error in opening FileSegmentManagedBuffer{file=/tmp/blockmgr-621b0d97-28fd-4397-af15-9dff00553591/13/shuffle_0_87_0.data, offset=158734, length=2854} 	at org.apache.spark.network.buffer.FileSegmentManagedBuffer.createInputStream(FileSegmentManagedBuffer.java:114) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:416) 	... 20 more Caused by: java.io.FileNotFoundException: /tmp/blockmgr-621b0d97-28fd-4397-af15-9dff00553591/13/shuffle_0_87_0.data (Too many open files) 	at java.io.FileInputStream.open(Native Method) 	at java.io.FileInputStream.<init>(FileInputStream.java:138) 	at org.apache.spark.network.buffer.FileSegmentManagedBuffer.createInputStream(FileSegmentManagedBuffer.java:99) 	... 21 more 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1651)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1639)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1638)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1638)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1348)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1869)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1821)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1810)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3278)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3259)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2703)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:483)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [126]:
tgt_type.show()

+----+-----+
|time|count|
+----+-----+
|   1|   25|
|   2|    7|
|   3|    6|
|   4|    3|
|   6|    7|
|   7|    3|
|   8|    4|
|  10|    5|
|  11|    2|
|  12|    3|
|  13|    5|
|  14|    2|
|  15|    4|
|  16|    9|
|  17|    2|
|  18|    4|
|  19|    2|
|  20|    5|
|  21|    1|
|  22|    3|
+----+-----+
only showing top 20 rows



In [167]:
process_start.show()

+----+-----+
|time|count|
+----+-----+
|   1|  422|
|   2| 1451|
|   3| 1555|
|   4|  723|
|   5|  121|
|   6|   62|
|   7|   45|
|   8|   36|
|   9|   46|
|  10|   30|
|  11|   19|
|  12|   37|
|  13|   29|
|  14|   33|
|  15|   20|
|  16|   31|
|  17|   28|
|  18|   42|
|  19|   19|
|  20|   26|
+----+-----+
only showing top 20 rows



In [11]:
failed_logon.show()

+----+-----+
|time|count|
+----+-----+
|   1|    8|
|   2|    2|
|   3|    2|
|   4|    2|
|   5|    1|
|   6|    1|
|   7|    1|
|   8|    1|
|  11|    1|
|  12|    1|
|  13|    1|
|  14|    1|
|  15|    1|
|  16|    1|
|  23|    1|
|  25|    3|
|  26|    2|
|  29|    1|
|  30|    2|
|  31|    2|
+----+-----+
only showing top 20 rows



## Data Analysis

In [5]:
# proc_domains = proc.select('domain').distinct()
# proc_users = proc.select('user').distinct()

In [13]:
# proc_domains.coalesce(1).write.csv('domains.csv')
# proc_users.coalesce(1).write.csv('users.csv')

In [60]:
# proc.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in proc.columns)).show()

+----+-----------+---------+-----+
|time|user@domain|proc_name|start|
+----+-----------+---------+-----+
|   0|          0|        0|    0|
+----+-----------+---------+-----+



In [58]:
# flows.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in flows.columns)).show()

+----+--------+--------+-------+---------+--------+--------+-------+-------+
|time|duration|src_comp|src_prt|dest_comp|dest_prt|protocol|pkt_cnt|byt_cnt|
+----+--------+--------+-------+---------+--------+--------+-------+-------+
| 530|     530|     530|    530|      530|     530|     530|    530|    530|
+----+--------+--------+-------+---------+--------+--------+-------+-------+



In [77]:
auth.select('success').distinct().show()

+-------+
|success|
+-------+
|Success|
|   Fail|
+-------+



In [59]:
auth.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in auth.columns)).show()

+----+---------------+----------------+--------+---------+---------+----------+----------------+-------+
|time|src_user@domain|dest_user@domain|src_comp|dest_comp|auth_type|logon_type|auth_orientation|success|
+----+---------------+----------------+--------+---------+---------+----------+----------------+-------+
|   0|              0|               0|       0|        0|        0|         0|               0|      0|
+----+---------------+----------------+--------+---------+---------+----------+----------------+-------+



In [32]:
flows.orderBy("avg_pkt_size").show()

+-----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
| time|duration|src_comp|src_prt|dest_comp|dest_prt|protocol|pkt_cnt|byt_cnt|avg_pkt_size|
+-----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|41972|       0|   C1654|     80|   C13742|   N4427|       6|      2|     92|        46.0|
|41974|      38|   C1015|   N221|    C8681|   N2153|       6|      2|     92|        46.0|
|41972|       0|   C7632|   N294|   C20510|  N18962|       6|      1|     46|        46.0|
|41970|       0|   C8974|  N4126|    C5787|  N30556|       6|      1|     46|        46.0|
|41972|      38|   C1015|   N221|    C8964|   N2024|       6|      2|     92|        46.0|
|41971|       0|  C14402|  N9113|     C585|     139|       6|      1|     46|        46.0|
|41972|      60|  C11149|  N2801|    C2588|     N76|       6|      4|    184|        46.0|
|41971|       0|   C3873|     80|    C3959|   N3771|       6|      2|     92|        46.0|

In [49]:
flows.select([count(when(isnan(c)|col(c).isNull(), c)).alias(c) for c in flows.columns]).show()

+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|time|duration|src_comp|src_prt|dest_comp|dest_prt|protocol|pkt_cnt|byt_cnt|avg_pkt_size|
+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+
|   0|       0|       0|      0|        0|       0|       0|      0|      0|           0|
+----+--------+--------+-------+---------+--------+--------+-------+-------+------------+



In [54]:
dns.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in dns.columns)).show()

+----+--------+------------+
|time|src_comp|cmp_resolved|
+----+--------+------------+
|   0|       0|           0|
+----+--------+------------+



In [25]:
dns.printSchema()

root
 |-- time: date (nullable = true)
 |-- src_comp: string (nullable = true)
 |-- cmp_resolved: string (nullable = true)



In [10]:
proc.show()

+----+-----------+-----+---------+-----+
|time|user@domain| comp|proc_name|start|
+----+-----------+-----+---------+-----+
|   1|   C1$@DOM1|   C1|      P16|Start|
|   1|C1001$@DOM1|C1001|       P4|Start|
|   1|C1002$@DOM1|C1002|       P4|Start|
|   1|C1004$@DOM1|C1004|       P4|Start|
|   1|C1017$@DOM1|C1017|       P4|Start|
|   1|C1018$@DOM1|C1018|       P4|Start|
|   1|C1020$@DOM1|C1020|       P3|Start|
|   1|C1020$@DOM1|C1020|       P4|Start|
|   1|C1028$@DOM1|C1028|      P16|  End|
|   1|C1029$@DOM1|C1029|       P4|Start|
|   1|C1030$@DOM1|C1030|       P4|Start|
|   1|C1032$@DOM1|C1032|       P4|Start|
|   1|C1035$@DOM1|C1035|      P37|Start|
|   1|C1035$@DOM1|C1035|       P5|Start|
|   1|C1051$@DOM1|C1051|      P16|Start|
|   1|C1069$@DOM1|C1069|       P3|Start|
|   1|C1069$@DOM1|C1069|       P4|Start|
|   1|C1079$@DOM1|C1079|       P4|Start|
|   1|C1084$@DOM1|C1084|       P4|Start|
|   1|C1088$@DOM1|C1088|       P4|Start|
+----+-----------+-----+---------+-----+
only showing top

In [20]:
redteam.printSchema()

root
 |-- time: date (nullable = true)
 |-- user@domain: string (nullable = true)
 |-- src_comp: string (nullable = true)
 |-- dst_comp: string (nullable = true)



In [6]:
from pyspark.sql.functions import lit, unix_timestamp
start = datetime.date(2018,1,1)
#datetime.timestamp(2018,1,1,12,0,0)



In [7]:
datetime.datetime.fromtimestamp(time.mktime(start.timetuple()) + 228150)

datetime.datetime(2018, 1, 3, 15, 22, 30)

In [8]:
func = udf (lambda x: datetime.datetime.fromtimestamp(time.mktime(start.timetuple()) + x).date(),DateType() )

In [9]:
redteam1 = redteam.withColumn("timestam", redteam.select("time")),'yyyy-MM-dd HH:mm:ss').cast("timestamp") )

SyntaxError: invalid syntax (<ipython-input-9-48ac7615f935>, line 1)

In [10]:
import time

time_update = udf(lambda x: time.strftime('%m/%d %H:%M:%S', time.gmtime(x)))

#timestam = time.strftime('%m/%d %H:%M:%S', time.gmtime(redteam.select("time")))

In [23]:
redteam1.show()

+------+--------------+
|  time|      timestam|
+------+--------------+
|150885|01/02 17:54:45|
|151036|01/02 17:57:16|
|151648|01/02 18:07:28|
|151993|01/02 18:13:13|
|153792|01/02 18:43:12|
|155219|01/02 19:06:59|
|155399|01/02 19:09:59|
|155460|01/02 19:11:00|
|155591|01/02 19:13:11|
|156658|01/02 19:30:58|
|210086|01/03 10:21:26|
|210294|01/03 10:24:54|
|210312|01/03 10:25:12|
|218418|01/03 12:40:18|
|227052|01/03 15:04:12|
|227408|01/03 15:10:08|
|227520|01/03 15:12:00|
|227780|01/03 15:16:20|
|228024|01/03 15:20:24|
|228150|01/03 15:22:30|
+------+--------------+
only showing top 20 rows

