In [1]:
import datetime, time
import pandas as pd
import networkx as nx
from functools import reduce
from pyspark.sql import Window
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField, StringType,FloatType, 
                               DoubleType, IntegerType, StructType,
                              DateType)
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as spDataFrame


from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import (Binarizer, OneHotEncoder, StringIndexer, HashingTF, 
                                Tokenizer, StandardScaler, VectorAssembler,
                               OneHotEncoder, StringIndexer, VectorIndexer)
from pyspark.ml.classification import LogisticRegression, LinearSVC, LinearSVCModel
from pyspark.ml.evaluation import HasFeaturesCol
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import os
import matplotlib.pyplot as plt
#from tokenizer import Tokenizer

This is to setup the sparksession. If additional resources need to be allocated it is done with this command

In [2]:

spark = SparkSession \
        .builder \
        .appName('Capstone Analysis') \
        .config("spark.driver.memory", "60g") \
        .config("spark.executor.memory", "60g") \
        .config("spark.executor.cores", "5") \
        .getOrCreate()

In [3]:
spark.version

'2.3.2'

## Import DATA

In [4]:
proc_data_schema = [StructField('time',IntegerType(),True),
                   StructField('user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('proc_name',StringType(),True),
                   StructField('strt',StringType())]
proc_final_struc = StructType(fields = proc_data_schema)
proc = spark.read.csv('../Dataset/proc.txt',schema=proc_final_struc)

auth_data_schema = [StructField('time',IntegerType(),True),
                   StructField('src_user@domain',StringType(),True),
                   StructField('dest_user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('dest_comp',StringType(),True),
                   StructField('auth_type',StringType(),True),
                   StructField('logon_type',StringType(),True),
                   StructField('auth_orient',StringType(),True),
                   StructField('success',StringType(),True)]
auth_final_struc = StructType(fields = auth_data_schema)
auth = spark.read.csv('../Dataset/auth.txt',schema=auth_final_struc )

flows_data_schema = [StructField('time',IntegerType(),False),
                   StructField('dur',IntegerType(),False),
                   StructField('src_comp',StringType(),False),
                   StructField('src_port',StringType(),False),
                   StructField('dest_comp',StringType(),False),
                   StructField('dest_port',StringType(),False),
                   StructField('protocol',StringType(),False),
                   StructField('pkt_cnt',IntegerType(),False),
                   StructField('byte_cnt',IntegerType(),False)]
flows_final_struc = StructType(fields = flows_data_schema)
flows = spark.read.csv('../Dataset/flows.txt',schema=flows_final_struc )

dns_data_schema = [StructField('time',IntegerType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('cmp_resolved',StringType(),True)]
dns_final_struc = StructType(fields = dns_data_schema)
dns = spark.read.csv('../Dataset/dns.txt',schema=dns_final_struc)

redteam_data_schema = [StructField('time',IntegerType(),True),
                   StructField('user@domain',StringType(),True),
                   StructField('src_comp',StringType(),True),
                   StructField('dest_comp',StringType(),True)]
redteam_final_struc = StructType(fields = redteam_data_schema)
redteam = spark.read.csv('../Dataset/redteam.txt',schema=redteam_final_struc)

This code is supposed to map the DNS connections into a network map

In [5]:
# dns_graph = new_df.toPandas()

In [6]:
# new_dns_graph = dns_graph.sample(5000)
# G = nx.from_pandas_edgelist(new_dns_graph, 'src_comp','cmp_resolved','count')

In [7]:
# from operator import itemgetter
# node_and_degree = G.degree()
# (largest_hub, degree) = sorted(node_and_degree, key=itemgetter(1))[-1]
# hub_ego = nx.ego_graph(G, largest_hub)
# # Draw graph
# pos = nx.spring_layout(hub_ego)


In [8]:
# nx.draw(G,pos,node_color='b', node_size=50, with_labels=False)
# nx.draw_networkx_nodes(hub_ego, pos, nodelist=[largest_hub], node_size=300, node_color='r')
# plt.show()

## Transform Data

This will be where we split all user@domain columns into user and domain columns

In [9]:
proc_split = F.split(proc['user@domain'],'@')
proc = proc.withColumn('src_user',proc_split.getItem(0))
proc = proc.withColumn('src_dmn',proc_split.getItem(1))
proc = proc.drop('user@domain')

proc = proc.withColumn('type',F.lit('Process'))

In [10]:
auth_src_split = F.split(auth['src_user@domain'],'@')
auth = auth.withColumn('src_user',auth_src_split.getItem(0))
auth = auth.withColumn('src_dmn',auth_src_split.getItem(1))

auth_dest_split = F.split(auth['dest_user@domain'],'@')
auth = auth.withColumn('dest_user',auth_dest_split.getItem(0))
auth = auth.withColumn('dest_dmn',auth_dest_split.getItem(1))

auth = auth.drop('src_user@domain','dest_user@domain')

auth = auth.withColumn('type',F.lit('Auth'))

In [11]:
redteam_split = F.split(redteam['user@domain'],'@')
redteam = redteam.withColumn('src_user',redteam_split.getItem(0))
redteam = redteam.withColumn('src_dmn',redteam_split.getItem(1))

redteam = redteam.drop('user@domain')

redteam = redteam.withColumn('type',F.lit('RedTeam'))

In [12]:

flows = flows.withColumn('avg_pkt_size', (flows['byte_cnt']/flows['pkt_cnt']).cast(DoubleType()))
flows = flows.na.drop(how='all')

flows = flows.withColumn('type',F.lit('DataFlow'))

# Test/Train split

In [13]:
split_days = 35
split_range = split_days * 3600 * 24

In [14]:
# #Split data into train/test segments. This will be done on the first number of days

# redteam = redteam.filter(redteam.time <= split_range)

# dns = dns.filter(dns.time <= split_range)

# proc = proc.filter(proc.time <= split_range)

# flows = flows.filter(flows.time <= split_range)

# auth = auth.filter(auth.time <= split_range)

In [15]:
# days = 20
# train_range = days * 3600 * 24

In [16]:
# #Split data into train/test segments. This will be done on the first number of days

# redteam_test = redteam.filter(redteam.time > train_range)
# redteam_train = redteam.filter(redteam.time <= train_range)

# dns_test = dns.filter(dns.time > train_range)
# dns_train = dns.filter(dns.time <= train_range)

# proc_test = proc.filter(proc.time > train_range)
# proc_train = proc.filter(proc.time <= train_range)

# flows_test = flows.filter(flows.time > train_range)
# flows_train = flows.filter(flows.time <= train_range)

# auth_test = auth.filter(auth.time > train_range)
# auth_train = auth.filter(auth.time <= train_range)

In [17]:
# colum = proc.columns
# colum.sort()

# proc = proc.select(colum)
# redteam = redteam.select(colum)
# auth = auth.select(colum)
# flows = flows.select(colum)

In [18]:
# master = unionAll(redteam,auth,proc,flows)

In [19]:
# print(master.count())

In [20]:
# master.select('strt').sort('strt').show()

This code was supposed to transform the time column from an int into a datetime data type

In [21]:
# redteam1 = redteam.select(F.to_date(redteam.time,'MM-dd HH:mm:ss').alias('date')).collect()
#redteam1 = redteam.rdd.map(lambda x: (x['time'], time.strftime('%m/%d %H:%M:%S', time.gmtime(x['time']) ))).toDF(['time','timestam'])

# DNS Data

In [22]:
# dns_train.show(5)

In [23]:
# F.pandas_udf()

In [24]:
def dns_extract(dns_dataset):
    dns_count = dns_dataset.groupby(dns_dataset.time,dns_dataset.src_comp).count().na.fill(0).sort('time')
    dns_count = dns_count.select(F.col('time'),F.col('src_comp'),F.col('count').alias('dns_count'))
    
    final_dns_dataset = dns_count
    
    temp_dns = first_dns_extract(dns_dataset)
    
    final_dns_dataset = final_dns_dataset.join(temp_dns,['time','src_comp'],'left')
    
    final_dns_dataset = final_dns_dataset.na.fill(0)
    
    final_dns_dataset = final_dns_dataset.sort(['time','src_comp'])
    
    return final_dns_dataset

In [25]:
def first_dns_extract(dns_dataset):
    newresolved = dns_dataset.groupby(dns_dataset.src_comp,dns_dataset.cmp_resolved).agg(F.min('time').alias('time')).sort('time')
    newresolved = newresolved.repartition(10).groupby(newresolved.time,newresolved.src_comp).count()
    newresolved = newresolved.repartition(10).select(F.col('time'),F.col('src_comp'),F.col('count').alias('newresolved_count')).sort('time')
    
    first_dns_dataset = newresolved
    return first_dns_dataset

In [26]:
dns_sample = dns_extract(dns)

In [27]:
start_time = time.time()
dns_sample.coalesce(1).write.csv('../Dataset/Output/total/dns_output_1.csv')
print("--- %s seconds ---" % (time.time() - start_time))

--- 78.87155055999756 seconds ---


In [28]:
# start_time = time.time()

# dns_sample.count()



In [29]:
# print("--- %s seconds ---" % (time.time() - start_time))

# Proc Data

In [30]:
# proc_train.select('strt').distinct().show()

In [31]:
# proc_subset = proc_train.filter((proc_train.time<2)&(proc_train.time>=0))

In [32]:
# proc_subset.show(10)

In [33]:
#proc_subset_test = proc_subset.groupby(proc_subset.time,proc_subset.src_comp,proc_subset.proc_name,proc_subset.strt).count()

In [34]:
# proc_subset_test.show(10)

In [35]:
# proc_subset_test = proc_subset.groupby(proc_subset.time,proc_subset.src_comp)\
#                     .agg(F.sum(F.when(proc_subset.strt=='Start',F.lit(1)).otherwise(F.lit(-1))).alias('LoggedOn'))\
#                     .withColumn('LoggedOn',F.greatest(F.lit(0),'LoggedOn'))\
#                     .sort('time','src_comp')


In [36]:
# proc_subset_test.show()

In [37]:
# proc_count = proc_train.groupby(proc_train.time,proc_train.src_comp).count().na.fill(0).sort('time')
# proc_count = proc_count.select(F.col('time'),F.col('src_comp'),F.col('count').alias('proc_total'))



In [38]:
# proc_count.show(10)

In [39]:
# proc_count.count()

In [40]:
# proc_exec = proc_train.filter(proc_train.strt=='Start').groupby(proc_train.time,proc_train.src_comp).count().na.fill(0).sort('time')
# proc_exec = proc_exec.select(F.col('time'),F.col('src_comp'),F.col('count').alias('proc_exec_total'))

In [41]:
# proc_exec.show(10)

In [42]:
# proc_exec.count()

In [43]:
def first_proc_extract(proc_dataset):
    newexecute = proc_dataset.filter(proc_dataset.strt=='Start').groupby(proc_dataset.src_comp,proc_dataset.proc_name).agg(F.min('time').alias('time')).sort('time')
    newexecute = newexecute.groupby(newexecute.time,newexecute.src_comp).count()
    newexecute = newexecute.select(F.col('time'),F.col('src_comp'),F.col('count').alias('newexecute_count')).sort('time')
    
    first_execute_dataset = newexecute
    return first_execute_dataset

In [44]:
def proc_running_extract(proc_dataset):
    proc_running = proc_dataset.groupby(proc_dataset.time,proc_dataset.src_comp)\
                    .agg(F.sum(F.when(proc_dataset.strt=='Start',F.lit(1)).otherwise(F.lit(-1))).alias('Proc_run'))\
                    .withColumn('Proc_run',F.greatest(F.lit(0),'Proc_run'))\
                    .sort('time','src_comp')
    return proc_running
    

In [45]:
def proc_extract(proc_dataset):
    proc_count = proc_dataset.groupby(proc_dataset.time,proc_dataset.src_comp).count().na.fill(0).sort('time')
    proc_count = proc_count.select(F.col('time'),F.col('src_comp'),F.col('count').alias('proc_total'))
    
    proc_exec = proc_dataset.filter(proc_dataset.strt=='Start').groupby(proc_dataset.time,proc_dataset.src_comp).count().na.fill(0).sort('time')
    proc_exec = proc_exec.select(F.col('time'),F.col('src_comp'),F.col('count').alias('proc_exec_total'))
    
    proc_running = proc_dataset.groupby(proc_dataset.time,proc_dataset.src_comp)\
                    .agg(F.sum(F.when(proc_dataset.strt=='Start',F.lit(1)).otherwise(F.lit(-1))).alias('Proc_run'))\
                    .withColumn('Proc_run',F.greatest(F.lit(0),'Proc_run'))\
                    .sort('time','src_comp')
    
    first_execute = first_proc_extract(proc_dataset)
    
    
    final_proc_dataset=proc_count
    
    final_proc_dataset = final_proc_dataset.join(proc_exec,['time','src_comp'],'left')
    
    final_proc_dataset = final_proc_dataset.join(first_execute,['time','src_comp'],'left')
    
    final_proc_dataset = final_proc_dataset.join(proc_running,['time','src_comp'],'left')
    
    final_proc_dataset = final_proc_dataset.na.fill(0)
    
    final_proc_dataset = final_proc_dataset.sort(['time','src_comp'])
    
    
    return final_proc_dataset

In [46]:
proc_sample = proc_extract(proc)

In [47]:
start_time = time.time()
proc_sample.coalesce(1).write.csv('../Dataset/Output/total/proc_output_1')
print("--- %s seconds ---" % (time.time() - start_time))

--- 721.9334058761597 seconds ---


## Auth Data

In [48]:
def auth_extract(auth_dataset):
    failed_logon = auth_dataset.filter(auth_dataset.success=='Fail').groupby(auth_dataset.time,auth_dataset.src_comp).count().na.fill(0).sort('time')
    failed_logon = failed_logon.select(F.col('time'),F.col('src_comp'),F.col('count').alias('fail_count'))
    
    
    final_auth_dataset = failed_logon
    return final_auth_dataset

In [49]:
def auth_loggedon_extract(auth_dataset):
    auth_loggedon = auth_dataset.filter((auth_dataset.success=="Success") & ((auth_dataset.auth_orient=='LogOn')|(auth_dataset.auth_orient=='LogOff'))).groupby(auth_dataset.time,auth_dataset.src_comp)\
                    .agg(F.sum(F.when(auth_dataset.auth_orient=='LogOn',F.lit(1)).otherwise(F.lit(-1))).alias('LoggedOn'))\
                    .withColumn('LoggedOn',F.greatest(F.lit(0),'LoggedOn'))\
                    .sort('time','src_comp')
    return auth_loggedon
    

In [50]:
def joined_auth_extract(auth_dataset):

    failed_logon = auth_dataset.filter(auth_dataset.success=='Fail').groupby(auth_dataset.time,auth_dataset.src_comp).count().na.fill(0).sort('time')
    failed_logon = failed_logon.select(F.col('time'),F.col('src_comp'),F.col('count').alias('fail_count'))
    
    auth_loggedon = auth_loggedon_extract(auth_dataset)
    
    final_auth_dataset = failed_logon.join(auth_loggedon,['time','src_comp'],'outer' )
    return final_auth_dataset
    

In [51]:
start_time = time.time()
auth_sample = joined_auth_extract(auth)

In [52]:
auth_sample.coalesce(1).write.csv('../Dataset/Output/total/auth_output_1.csv')
print("--- %s seconds ---" % (time.time() - start_time))

--- 1084.1272368431091 seconds ---


# Redteam Data

In [53]:
def redteam_extract(redteam_dataset):
    redteam_event = redteam_dataset.groupby(redteam_dataset.time, redteam_dataset.src_comp).count().na.fill(0).sort('time')
    redteam_event = redteam_event.select(F.col('time'),F.col('src_comp'),F.col('count').alias('redteam_event'))
    
    final_redteam_dataset = redteam_event
    return final_redteam_dataset

In [54]:
start_time = time.time()
redteam_sample = redteam_extract(redteam)
redteam_sample.coalesce(1).write.csv('../Dataset/Output/total/redteam_output_1.csv')
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.6539995670318604 seconds ---


# Flows Data

In [55]:
# flows_train.show(5)

In [56]:
# flows_data = flows_train.select('time','dur','src_comp','pkt_cnt','byte_cnt','avg_pkt_size')\
#                         .groupby('time','src_comp').agg(F.sum('dur').alias('dur'),F.sum('pkt_cnt').alias('pkt_cnt'),F.sum('byte_cnt').alias('byte_cnt'),F.avg('avg_pkt_size').alias('avg_pkt_size')).sort('time')

In [57]:
# flows_data.show(10)

In [58]:
def flows_extract(flows_dataset):
    flows_data = flows_dataset.select('time','dur','src_comp','pkt_cnt','byte_cnt','avg_pkt_size')\
                        .groupby('time','src_comp').agg(F.sum('dur').alias('dur'),F.sum('pkt_cnt').alias('pkt_cnt'),F.sum('byte_cnt').alias('byte_cnt'),F.avg('avg_pkt_size').alias('avg_pkt_size')).sort('time')
    
    final_flows_dataset = flows_data
    return final_flows_dataset

In [59]:
flows_sample = flows_extract(flows)

In [60]:
start_time = time.time()

# flows_sample.count()

In [61]:
flows_sample.coalesce(1).write.csv('../Dataset/Output/total/flows_output_1.csv')
print("--- %s seconds ---" % (time.time() - start_time))

--- 120.35145854949951 seconds ---


# Combine Data

In [2]:
redteam_columns = ['time', 'src_comp', 'redteam_event']
auth_columns = ['time', 'src_comp', 'fail_count', 'LoggedOn']
dns_columns = ['time', 'src_comp', 'dns_count', 'newresolved_count']
proc_columns = ['time', 'src_comp', 'proc_total', 'proc_exec_total', 'newexecute_count', 'Proc_run']
flows_columns = ['time', 'src_comp', 'dur', 'pkt_cnt', 'byte_cnt', 'avg_pkt_size']

In [3]:
directory = "../Dataset/Output/total/"
for root, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith('.csv'):
            filepath = root+'/'+file
            if "dns" in root:   
                pd_dns_df = pd.read_csv(filepath, names=dns_columns)
                print("imported ", filepath)
            elif "proc" in root:
                pd_proc_df = pd.read_csv(filepath, names=proc_columns)
                print("imported ", filepath)
            elif "flows" in root:   
                pd_flows_df = pd.read_csv(filepath, names=flows_columns)
                print("imported ", filepath)
            elif "redteam" in root:
                pd_redteam_df = pd.read_csv(filepath, names=redteam_columns)
                print("imported ", filepath)
            elif "auth" in root:
                pd_auth_df = pd.read_csv(filepath, names=auth_columns)
                print("imported ", filepath)
            else:
                print("Error importing ", file)

imported  ../Dataset/Output/total/dns_output_1.csv/part-00000-30c6be83-d671-4851-a9db-d44cb889d133-c000.csv
imported  ../Dataset/Output/total/proc_output_1/part-00000-df051b86-70c7-4781-9e2a-b28b227687a4-c000.csv
imported  ../Dataset/Output/total/auth_output_1.csv/part-00000-2efabf19-3e9d-4f43-a5d2-d702da7acbf3-c000.csv
imported  ../Dataset/Output/total/redteam_output_1.csv/part-00000-4ff2c392-5a39-4beb-bbe7-0db94a344c11-c000.csv
imported  ../Dataset/Output/total/flows_output_1.csv/part-00000-406f70d5-1b32-4b26-a26c-1bae6d6a5343-c000.csv


In [4]:
# # file = '../Dataset/Output/complete_df.csv'

# # start_time = time.time()


# n_size = pd_auth_df.time.max()

# splits = 15
# switch = 0

# for n in range(int(splits)):

#     lower = int(n_size/splits*n)
#     upper = int(n_size/splits*(n+1))
    
#     redteam = pd_redteam_df[(pd_redteam_df.time > lower) & (pd_redteam_df.time <= upper)]

#     dns = pd_dns_df[(pd_dns_df.time > lower) & (pd_dns_df.time <= upper)]

#     proc = pd_proc_df[(pd_proc_df.time > lower) & (pd_proc_df.time <= upper)]

#     flows = pd_flows_df[(pd_flows_df.time > lower) & (pd_flows_df.time <= upper)]

#     auth = pd_auth_df[(pd_auth_df.time > lower) & (pd_auth_df.time <= upper)]

# #     master_df = dns.merge(auth,on=['time','src_comp'],how='outer')
# #     master_df = master_df.merge(flows,on=['time','src_comp'],how='outer')
# #     master_df = master_df.merge(redteam,on=['time','src_comp'],how='outer')
# #     master_df = master_df.merge(proc,on=['time','src_comp'],how='outer')

# #     master_df=master_df.fillna(0)
    
# # #     master_df['time'] = pd.to_datetime(master_df['time'],unit='s').dt.strftime('%m/%d %H:%M:%S').head()

#     if switch == 0:
#         master_df.to_csv(file, index=False)
#         switch = 1 
#     else:
#         master_df.to_csv(file,mode='a',header=False, index=False )
#     print('Done with round ', (n+1),' of ',(splits), flush=True)    

# # print("--- %s seconds ---" % (time.time() - start_time))

# This block will save a csv with all data

In [None]:
start_time = time.time()
file = '../Dataset/Output/complete_1m_df.csv'

splits = 15

n_size = pd_auth_df.time.max()

splits = 15
switch = 0

for n in range(int(splits)):

    lower = int(n_size/splits*n)
    upper = int(n_size/splits*(n+1))
    
    redteam = pd_redteam_df[(pd_redteam_df.time > lower) & (pd_redteam_df.time <= upper)]

    dns = pd_dns_df[(pd_dns_df.time > lower) & (pd_dns_df.time <= upper)]

    proc = pd_proc_df[(pd_proc_df.time > lower) & (pd_proc_df.time <= upper)]

    flows = pd_flows_df[(pd_flows_df.time > lower) & (pd_flows_df.time <= upper)]

    auth = pd_auth_df[(pd_auth_df.time > lower) & (pd_auth_df.time <= upper)]

    dns['time'] = pd.to_datetime(dns['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
    flows['time'] = pd.to_datetime(flows['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
    auth['time'] = pd.to_datetime(auth['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
    redteam['time'] = pd.to_datetime(redteam['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
    proc['time'] = pd.to_datetime(proc['time'], unit='s').apply(lambda dt: dt.replace(year=2017))

    resample_pd_dns_df = dns.set_index('time').groupby('src_comp').resample('1min').agg({
                                                                           'dns_count':'sum',
                                                                           'newresolved_count':'sum'} ).reset_index().fillna(-999)
    resample_pd_flows_df = flows.set_index('time').groupby('src_comp').resample('1min').agg({
                                                                           'dur':'sum',
                                                                           'pkt_cnt':'sum',
                                                                           'byte_cnt':'sum',
                                                                           'avg_pkt_size':'mean'} ).reset_index().fillna(-999)
    resample_pd_auth_df = auth.set_index('time').groupby('src_comp').resample('1min').agg({
                                                                           'fail_count':'sum',
                                                                           'LoggedOn':'sum'} ).reset_index().fillna(-999)
    resample_pd_redteam_df = redteam.set_index('time').groupby('src_comp').resample('1min').agg({'redteam_event':'sum'
                                                                        } ).reset_index().fillna(-999)
    resample_pd_proc_df = proc.set_index('time').groupby('src_comp').resample('1min').agg({
                                                                           'proc_total':'sum',
                                                                           'proc_exec_total':'sum',
                                                                           'newexecute_count':'sum',
                                                                           'Proc_run':'sum'} ).reset_index().fillna(-999)

    master_1m_df = resample_pd_auth_df.merge(resample_pd_dns_df,on=['time','src_comp'],how='outer')
    master_1m_df = master_1m_df.merge(resample_pd_flows_df,on=['time','src_comp'],how='outer')
    master_1m_df = master_1m_df.merge(resample_pd_redteam_df,on=['time','src_comp'],how='outer')
    master_1m_df = master_1m_df.merge(resample_pd_proc_df,on=['time','src_comp'],how='outer')

    master_1m_df=master_1m_df.fillna(0)
    master_1m_df = master_1m_df.replace(-999, 0)

    resampled_df = master_1m_df.groupby('time').agg({'redteam_event':'sum',
                                                   'dns_count':'mean',
                                                   'newresolved_count':'mean',
                                                   'fail_count':'mean',
                                                   'LoggedOn':'mean',
                                                   'dur':'mean',
                                                   'pkt_cnt':'mean',
                                                   'byte_cnt':'mean',
                                                   'avg_pkt_size':'mean',
                                                   'proc_total':'mean',
                                                   'proc_exec_total':'mean',
                                                   'newexecute_count':'mean',
                                                   'Proc_run':'mean'} )
    #resampled_df.index = pd.to_datetime(resampled_df.index)
    if switch == 0:
        master_1m_df.to_csv(file)
        switch = 1 
    else:
        master_1m_df.to_csv(file,mode='a',header=False)
    print('Done with round ', (n+1),' of ',(splits), flush=True)    

    print("--- %s seconds ---" % (time.time() - start_time))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Done with round  1  of  15
--- 3423.0238881111145 seconds ---
Done with round  2  of  15
--- 7271.482785224915 seconds ---
Done with round  3  of  15
--- 11304.832357645035 seconds ---
Done with round  4  of  15
--- 15332.420239210129 seconds ---
Done with round  5  of  15
--- 18188.518383979797 seconds ---
Done with round  6  of  15
--- 22211.44558262825 seconds ---


In [74]:
# start_time = time.time()
# file = '../Dataset/Output/complete_5m_df.csv'

# splits = 15

# pd_dns_df['time'] = pd.to_datetime(pd_dns_df['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
# pd_flows_df['time'] = pd.to_datetime(pd_flows_df['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
# pd_auth_df['time'] = pd.to_datetime(pd_auth_df['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
# pd_redteam_df['time'] = pd.to_datetime(pd_redteam_df['time'], unit='s').apply(lambda dt: dt.replace(year=2017))
# pd_proc_df['time'] = pd.to_datetime(pd_proc_df['time'], unit='s').apply(lambda dt: dt.replace(year=2017))

# resample_pd_dns_df = pd_dns_df.set_index('time').groupby('src_comp').resample('5min').agg({
#                                                                        'dns_count':'sum',
#                                                                        'newresolved_count':'sum'} ).reset_index().fillna(-999)
# resample_pd_flows_df = pd_flows_df.set_index('time').groupby('src_comp').resample('5min').agg({
#                                                                        'dur':'sum',
#                                                                        'pkt_cnt':'sum',
#                                                                        'byte_cnt':'sum',
#                                                                        'avg_pkt_size':'mean'} ).reset_index().fillna(-999)
# resample_pd_auth_df = pd_auth_df.set_index('time').groupby('src_comp').resample('5min').agg({
#                                                                        'fail_count':'sum',
#                                                                        'LoggedOn':'sum'} ).reset_index().fillna(-999)
# resample_pd_redteam_df = pd_redteam_df.set_index('time').groupby('src_comp').resample('5min').agg({'redteam_event':'sum'
#                                                                     } ).reset_index().fillna(-999)
# resample_pd_proc_df = pd_proc_df.set_index('time').groupby('src_comp').resample('5min').agg({
#                                                                        'proc_total':'sum',
#                                                                        'proc_exec_total':'sum',
#                                                                        'newexecute_count':'sum',
#                                                                        'Proc_run':'sum'} ).reset_index().fillna(-999)

# master_5m_df = resample_pd_auth_df.merge(resample_pd_dns_df,on=['time','src_comp'],how='outer')
# master_5m_df = master_5m_df.merge(resample_pd_flows_df,on=['time','src_comp'],how='outer')
# master_5m_df = master_5m_df.merge(resample_pd_redteam_df,on=['time','src_comp'],how='outer')
# master_5m_df = master_5m_df.merge(resample_pd_proc_df,on=['time','src_comp'],how='outer')

# master_5m_df=master_5m_df.fillna(-999)

# master_5m_df.to_csv(file, index=False)
# print("--- %s seconds ---" % (time.time() - start_time))

--- 5004.983376026154 seconds ---


# Stop running up to here

# Create Classification model

In [None]:
cv = StratifiedKFold(n_splits=6)
classifier = xgb.XGBClassifier()

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
for train, test in cv.split(X, y):
    print('round ',i+1)
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()