### network traffic

https://www.kaggle.com/crawford/computer-network-traffic

Each row consists of four columns:

date: yyyy-mm-dd (from 2006-07-01 through 2006-09-30)
l_ipn: local IP (coded as an integer from 0-9)
r_asn: remote ASN (an integer which identifies the remote ISP)
f: flows (count of connections for that day)

Compromises were detected on the dates below, but may have occurred earlier.
Date : IP
08-24 : 1
09-04 : 5
09-18 : 4
09-26 : 3 6

Local IPs 1, 3, 4, 5 and 6 were compromised; the other five (0, 2, 7, 8 and 9) were not.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the raw flow records: one row per (date, local IP, remote ASN) with a flow count.
csv_path = '~/datasets/ComputerNetworkTraffic/cs448b_ipasn.csv'
df = pd.read_csv(csv_path)

In [None]:
def set_ts_index(df):
    """Return a copy of ``df`` indexed by its parsed ``date`` column.

    The string column ``date`` (format ``YYYY-MM-DD``) is parsed strictly,
    so any malformed value raises instead of becoming ``NaT``. The parsed
    values become a ``DatetimeIndex`` (named ``date``) and the original
    string column is removed. The input frame is not modified.
    """
    # strict parse: errors='raise' surfaces bad rows immediately
    parsed_dates = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='raise')

    # drop the string column first, then attach the datetime index;
    # both calls return new frames, so the result must be returned
    without_date_column = df.drop(columns=['date'])
    return without_date_column.set_index(pd.DatetimeIndex(parsed_dates))

In [None]:
# Guarded so this cell can be re-run safely: set_ts_index drops the 'date'
# column, so a second unguarded call would raise KeyError on df['date'].
if 'date' in df.columns:
    df = set_ts_index(df)

In [None]:
# Sanity check: first rows of the date-indexed frame.
df.head()

In [None]:
# Summary statistics of the numeric columns.
# NOTE(review): l_ipn and r_asn are identifiers, so only the stats for f are meaningful.
df.describe()

In [None]:
# Index type, column dtypes and non-null counts for the full frame.
df.info()

### ASNs contacted by compromised IPs up to the detection date

In [None]:
# Detection date for each compromised local IP (taken from the table in the
# notebook header). Rows are kept up to and including the detection date.
detected_on = {
    1: '2006-08-24',
    3: '2006-09-26',
    6: '2006-09-26',
    4: '2006-09-18',
    5: '2006-09-04',
}

# Build the row mask one IP at a time: traffic of that IP on or before its detection date.
pre_detection = pd.Series(False, index=df.index, dtype=bool)
for compromised_ip, detection_day in detected_on.items():
    pre_detection = pre_detection | ((df['l_ipn'] == compromised_ip) & (df.index <= detection_day))

df1 = df[pre_detection]

In [None]:
# Summary statistics for the pre-detection traffic of the compromised IPs.
df1.describe()

In [None]:
# Row count and dtypes of the pre-detection subset.
df1.info()

In [None]:
# First rows of the pre-detection subset.
df1.head()

In [None]:
# Subset summary, followed by the number of distinct remote ASNs it contains.
df1.info()
df1['r_asn'].nunique(dropna=False)

In [None]:
# Same figures for the full data set, for comparison with the subset.
df.info()
df['r_asn'].nunique(dropna=False)

### local ips with no breach

In [None]:
# All traffic of the local IPs that were never flagged as compromised,
# i.e. every row whose l_ipn is not one of the five compromised IPs.
compromised_ips = [1, 3, 6, 4, 5]
df2 = df[~df['l_ipn'].isin(compromised_ips)]

In [None]:
# Summary of the non-compromised subset and its number of distinct remote ASNs.
df2.info()
df2['r_asn'].nunique(dropna=False)

In [None]:
# The distinct remote ASNs contacted by the non-compromised IPs.
df2['r_asn'].unique()

In [None]:
import numpy as np

# ASNs seen in `df2` (non-compromised IPs) that never appear in `df1`
# (compromised IPs, up to detection). setdiff1d returns them sorted and unique.
asns_clean_ips = df2['r_asn'].unique()
asns_breached_ips = df1['r_asn'].unique()
innocent_asn = np.setdiff1d(asns_clean_ips, asns_breached_ips)
len(innocent_asn)

In [None]:
# ASNs seen in `df1` (compromised IPs, up to detection) that never appear
# in `df2` (non-compromised IPs): the candidates worth a closer look.
asns_seen_by_breached = df1['r_asn'].unique()
asns_seen_by_clean = df2['r_asn'].unique()
maybe_guilty_asn = np.setdiff1d(asns_seen_by_breached, asns_seen_by_clean)
len(maybe_guilty_asn)

In [None]:
# ASNs contacted by both groups (present in `df1` and in `df2`).
breached_side = df1['r_asn'].unique()
clean_side = df2['r_asn'].unique()
common_asn = np.intersect1d(breached_side, clean_side)
len(common_asn)

### Adapted from

https://www.kaggle.com/ashutoshmaheshwari/bot-detection-prophet-and-luminol

In [None]:
# (detection date, local IP) pairs for the recorded compromises.
# A list of pairs is used instead of a dict keyed by date: IPs 3 and 6 were
# both detected on 2006-09-26, and a dict literal with a repeated key keeps
# only the last value, which would silently drop IP 3.
anomaly_records = [
    ('2006-08-24', 1),
    ('2006-09-04', 5),
    ('2006-09-18', 4),
    ('2006-09-26', 3),
    ('2006-09-26', 6),
]
marked_anomalies = pd.DataFrame(anomaly_records, columns=['date', 'l_ipn'])
print(marked_anomalies)

### Aggregating daily connections by date 


In [None]:
# Total flows per calendar day across all local IPs and ASNs.
# 'date' is the index name here; reset_index turns it back into a column for plotting.
daily_aggregate = df.groupby('date')[['f']].sum().reset_index()
daily_aggregate

In [None]:
# Mean of the daily totals, rounded to two decimals (used as a reference line in the plot).
mean_daily_flows = daily_aggregate['f'].mean()
daily_mean = round(mean_daily_flows, 2)


In [None]:
plt.figure(figsize=(15, 5))

day_axis = daily_aggregate['date']
flows_per_day = daily_aggregate['f']

# raw daily totals
plt.plot(day_axis, flows_per_day)

# one red vertical marker per recorded compromise
anomaly_pairs = marked_anomalies[['date', 'l_ipn']].itertuples(index=False, name=None)
for detection_date, local_ip in anomaly_pairs:
    plt.axvline(
        x=pd.to_datetime(detection_date),
        color='r',
        label='Recorded Anomaly {}'.format(local_ip),
    )

# overall mean as a horizontal reference
plt.axhline(y=daily_mean, color='g', label='Mean Connections')

# 7-day rolling mean (the first six points are NaN and are not drawn)
plt.plot(day_axis, flows_per_day.rolling(7).mean(), label='7 days Rolling average')

# tick every second day, log-scaled y axis, labels and title
plt.xticks(day_axis[::2], rotation='vertical')
plt.yscale('log')
plt.xlabel('date')
plt.ylabel('Connection')
plt.title('Daily Aggregate Connections')

# shade the area under the daily totals
plt.fill_between(day_axis, flows_per_day, color='aqua')
plt.legend()
plt.show()

### Aggregating daily connections by date and local IP

In [None]:
# Total flows per (local IP, day); both grouping keys come back as ordinary columns.
daily_aggregate_l_ipn = df.groupby(['l_ipn', 'date'])[['f']].sum().reset_index()
daily_aggregate_l_ipn

In [None]:
import matplotlib.dates as mdates

# One panel per local IP actually present in the data, two panels per row.
# Iterating the real IP values (rather than range(n)) avoids assuming the
# codes are exactly 0..n-1, and the grid is sized from the number of IPs
# instead of being fixed at 5x2.
local_ips = sorted(daily_aggregate_l_ipn['l_ipn'].unique())
n_cols = 2
n_rows = max(1, -(-len(local_ips) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(20, 3 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, i in zip(panels, local_ips):
    temp = daily_aggregate_l_ipn[daily_aggregate_l_ipn['l_ipn'] == i]
    panel_dates = pd.to_datetime(temp['date'])

    ax.set_title(i)
    ax.set_xlabel('date')
    ax.set_ylabel('connections')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    ax.plot(panel_dates, temp['f'], color='salmon')
    ax.fill_between(panel_dates, temp['f'], color='peachpuff')

    # NOTE(review): the x axis is hidden, so the xlabel and date formatter set
    # above have no visible effect; remove this line to show the dates.
    ax.get_xaxis().set_visible(False)

# blank any panels left over when the number of IPs is odd
for unused_ax in panels[len(local_ips):]:
    unused_ax.set_visible(False)

plt.show()

### aggregate flows by ASN

In [None]:
# Total flows per remote ASN over the whole period (despite the name, this is
# not a per-day aggregate: the grouping key is r_asn only).
daily_aggregate_r_asn = df.groupby('r_asn')[['f']].sum().reset_index()

In [None]:
# Total connections per remote ASN over the whole observation period.
plt.figure(figsize=(10, 5))
# Explicit title: this figure is not tied to any single local IP, so it must
# not borrow a loop variable from another cell.
plt.title('Total Connections per Remote ASN')
plt.xlabel('r_asn')
plt.ylabel('connections')
plt.xticks(rotation='vertical')
# NOTE(review): r_asn is an identifier, not a continuous quantity; the line
# joining neighbouring ASN numbers carries no meaning. Consider a bar/stem plot.
plt.plot(daily_aggregate_r_asn['r_asn'], daily_aggregate_r_asn['f'], color='salmon')
plt.show()