In [408]:
import re
import pandas as pd
import datetime
import operator
import plotly.io as pio
import numpy as np

In [341]:
# Upload CSV file
data = pd.read_csv('output.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')

In [342]:
# Time Manipulation to Dataframe
data['timestamp'] = pd.to_datetime(data['ts'], unit='s')
data['just_date'] = data['timestamp'].dt.date
data['just_time'] = data['timestamp'].dt.time
data['diff'] = data.sort_values(['id.orig_h','timestamp']).groupby('id.orig_h')['timestamp'].diff()
data.columns

Index(['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
       'trans_depth', 'method', 'host', 'uri', 'referrer', 'user_agent',
       'request_ body_len', 'response_ body_len', 'status_code', 'status_msg',
       'info_code', 'info_msg', 'filename', 'tags', 'username', 'password',
       'proxied', 'orig_fuids', 'orig_mime_types', 'resp_fuids',
       'resp_mime_types', 'timestamp', 'just_date', 'just_time', 'diff'],
      dtype='object')

In [379]:
# Preparing Dataset for Non Success Code
# Focusing on non sucess code as they might be searching for available sites and hitting errors most of the time
def getNonSuccessCode(data):
    data_response_p = data[data['status_code']>='299']
    return data_response_p

data_response_p = data[data['status_code']>='299']
top_ip = dict(data_response_p['id.orig_h'].value_counts())

#Set List for IPs that might be doing recon
potential_recon = []

In [381]:
# Plot Top IP for Non Success Code
count_unsuccessful_ip=dict(data_response_p.sort_values(['id.orig_h','just_date'],ascending=False).groupby(['id.orig_h','just_date']).size())
cleaned_count_unsuccessful_ip={}
for item in count_unsuccessful_ip.keys():
    cleaned_count_unsuccessful_ip[item[0]] = count_unsuccessful_ip[item]
    
# cleaned_count_unsuccessful_ip = sorted(cleaned_count_unsuccessful_ip.items(),key=operator.itemgetter(1),reverse=True)

potential_recon+=list(cleaned_count_unsuccessful_ip.keys())

cleaned_count_unsuccessful_ip
fig = dict({
    "data": [{"type": "bar",
              "x": list(cleaned_count_unsuccessful_ip.keys()),
              "y": list(cleaned_count_unsuccessful_ip.values())}],
    "layout": {"title": {"text": "Number of Unsuccesful Status Codes Per IP Address"}}
})

pio.show(fig)


In [386]:
# Keyword Detector from URI
def pattern_searcher(search_str:str, search_list:str,isBoolean:bool):

    search_obj = re.search(search_list, search_str)
    if search_obj :
        return_str = search_str[search_obj.start(): search_obj.end()]
    else:
        if(bool):
            return_str = 'False'
        else:
            return_str = 'NA'
    return return_str
# Edit your key words that you want to detect

directory_words = ['pwd', 'etc', 'passwd','manager','login','admin','lists','mail']
pattern = '|'.join(directory_words)
data['matched_directory'] = data['uri'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern,isBoolean=False))
data_response_p = getNonSuccessCode(data)
data_uri_directory = data_response_p[data_response_p['matched_directory'] != 'False']
ip_search_directory = dict(data_uri_directory['id.orig_h'].value_counts())
ip_search_directory = dict(sorted(ip_search_directory.items(), key=operator.itemgetter(1),reverse=True))


# Set top N to investigate
number_of_top_list_n = 5
ip_search_directory_top_n = {k: ip_search_directory[k] for k in list(ip_search_directory)[:number_of_top_list_n]}
potential_recon+=list(ip_search_directory_top_n.keys())



{'192.168.202.79': 2290, '192.168.202.96': 3}

In [401]:
# Check for vunerable ports and list IP accessing non-verified ports with non success code
valid_ports = ['80','20','21','22','23','25','53','67','68','110','143','443']
pattern = '|'.join(valid_ports)

data['is_valid'] = data['id.resp_p'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern,isBoolean=True))

data_response_p = getNonSuccessCode(data)
data_vunerable_ports = data_response_p[data_response_p['is_valid'] == 'False']
potential_recon+=list(data_vunerable_ports['id.orig_h'].unique())



In [440]:
#data_time_diff = data.dropna()

# data_time_diff_int=pd.to_numeric(data_time_diff['diff']).groupby(data_time_diff['id.orig_h'])

#data_time_diff.groupby('id.orig_h')['diff'].avg()
#pd.to_numeric(data_time_diff['diff']).groupby(['id.orig_h']).agg({'diff': {'avg': np.mean}})
#dtf=data_time_diff.groupby(['id.orig_h','just_date']).size()
#dtf
#data_time_diff['timediff_secs'] = pd.to_numeric(data_time_diff['diff'])
#data_time_diff


In [448]:
# Print out potential recon list
potental_recon_list = ' , '.join(list(set(potential_recon)))
print(potental_recon_list)

192.168.202.76 , 192.168.202.96 , 192.168.202.102 , 192.168.203.61 , 192.168.202.79 , 192.168.202.73 , 192.168.203.62 , 192.168.204.70 , 192.168.202.90
