## Getting Frequency Distribution of Paths

For all pairs. All the Paths and the paths which were taken more than once.


In [3]:
import certifi
from elasticsearch import Elasticsearch

import r_utils as ut

import time
import csv 
import pandas as pd
import numpy as np
from collections import Counter
import multiprocessing as mp


#### Connecting to Kibana using ElasticSearch Python API

This object will be shared through the entire notebook

In [4]:
with open("../creds.key") as f:
    usrname = f.readline().strip()
    passwd = f.readline().strip()

# Connecting to Kibana
es = Elasticsearch(['atlas-kibana.mwt2.org:9200'], timeout=120, http_auth=(usrname, passwd), scheme= 'ssl')


#### Function Definitions

* getSourceDestinationPairs(): Returns the pairs of sources and destinations (IPv4/v6 Addresses).
* getPathCounts(): Returns the number of dictionary of path counts between the given source and destination in the specified time range.
* topk(): Returns a tuple of total number of paths and paths taken more than once.

In [5]:
def getSourceDestinationPairs(to_date, from_date):
    """
    Get all source and destination pairs
    present in the given time range 
    
    Args:
        to_date:  epoch_millis
        from_date: epoch_millis
    
    Returns:
        Datafame of all source destination pairs
    """
    
    query = {
        "size":0,
        "query":{
            "bool":{
                "must":[
                    {
                        "range":{
                            "timestamp":{
                                "gte":from_date,
                                "lte":to_date,
                                "format":"epoch_millis"
                            }
                        }
                    },
                    {
                        "term":{
                            "dest_production":{
                                "value":"true"
                            }
                        }
                    },
                    {
                        "term":{
                            "src_production":{
                                "value":"true"
                            }
                        }
                    }
                ]
            }
        },
        "aggs":{
            "sources":{
                "terms":{
                    "field":"src",
                    "size":9999
                },
                "aggs":{
                    "destinations":{
                        "terms":{
                            "field":"dest",
                            "size":9999
                        }
                    }
                }
            }
        }
    }

    data = es.search('ps_trace', body=query)
    
    sources = []
    destinations = []

    for source in data['aggregations']['sources']['buckets']:
        src = source['key']
        for destination in source['destinations']['buckets']:
            sources.append(src)
            destinations.append(destination['key'])
    
    return pd.DataFrame({"Source":sources,
                        "Destinations":destinations})

def getPathCounts(src_ip, dest_ip):
    """
    Returns a list of Counts of Paths taken from given source and destination

    Args:
        src_ip: Source IP, String [ex: "192.168.1.1"]
        dest_ip: Destination IP, String [ex: "192.168.1.5"]
    
    Returns:
        A list of dictionaries. The dictionary looks as follows:
        {
            'key':HASH VALUE,
            'doc_count': # of times path taken
        }
    """
    to_date = ut.getDateFormat()
    from_date = ut.getDateFormat(delta=90)

    query = {
        "size":0,
        "query":{
            "bool":{
                "must":[
                    {
                        "range":{
                            "timestamp":{
                                "gte":from_date,
                                "lte":to_date,
                                "format":"epoch_millis"
                            }
                        }
                    },
                    {
                        "term":{
                            "src":{
                                "value":src_ip
                            }
                        }
                    },
                    {
                        "term":{
                            "dest":{
                                "value":dest_ip
                            }
                        }
                    },
                    {
                        "term":{
                            "src_production":{
                                "value":"true"
                            }
                        }
                    },
                    {
                        "term":{
                            "dest_production":{
                                "value":"true"
                            }
                        }
                    }
                ]
            }
        },
        "aggs":{
            "HashCounts":{
                "terms":{
                    "field":"hash",
                    "size":4999
                }
            }
        }
    }

    try:
        data = es.search('ps_trace', body=query)
    except Exception as e:
        print(e, src_ip, dest_ip)
        return -1
    paths = data["aggregations"]["HashCounts"]["buckets"]
    
    if len(paths) == 0:
        return -1 
    else:
        return paths

def topk(src_ip, dest_ip,k=1):
    """
    Returns a  tuple of total paths and paths taken more than k times

    Args:
        src_ip: Source IP, String [ex: "192.168.1.1"]
        dest_ip: Destination IP, String [ex: "192.168.1.5"]


    Returns:
        A tuple of total unique paths and total of paths taken more than k times
        If there are no paths between soure and destination, None is returned
    """
    paths = getPathCounts(src_ip, dest_ip)

    if paths == -1:
        return None

    totalPaths = 0
    moreKPaths = 0

    totalPaths = len(paths)

    for item in paths:
        if item['doc_count'] > k:
            moreKPaths += 1
    
    return (totalPaths, moreKPaths)


#### Getting The Source and Detinatin Pairs and setting up the dataframe to store results as well.

In [6]:
src_dest_pairs = getSourceDestinationPairs(ut.getDateFormat(), ut.getDateFormat(delta=90))
src_dest_pairs['Total'] = np.zeros(src_dest_pairs.shape[0])
src_dest_pairs['MoreThanOne'] = np.zeros(src_dest_pairs.shape[0])
src_dest_pairs['Processed'] = np.zeros(src_dest_pairs.shape[0])
src_dest_pairs.head()


Unnamed: 0,Source,Destinations,Total,MoreThanOne,Processed
0,202.140.191.90,206.12.9.1,0.0,0.0,0.0
1,202.140.191.90,109.105.124.88,0.0,0.0,0.0
2,202.140.191.90,131.154.254.11,0.0,0.0,0.0
3,202.140.191.90,193.109.172.187,0.0,0.0,0.0
4,202.140.191.90,193.48.99.77,0.0,0.0,0.0


#### Getting Path Numbers for each source and pair destination using the function below.

It accepts a dataframe on which it performs the operations and stores the results.

In [21]:
def getPaths(args):
    id = args[0]
    df = args[1]
    time.sleep(id*0.01)
    start_time = time.time()
    av_time = 0
    print("Thread: {} Processing {} Pairs".format(id, df.shape[0]))
    for i in range(df.shape[0]):    
        av_st = time.time()
        if df.iloc[i,4] != 1:
            result = topk(df.iloc[i,0], df.iloc[i,1])
            if result is not None:
                df.iat[i,2] = result[0]
                df.iat[i,3] = result[1]

            df.iat[i,4] = 1
        av_time += time.time() - av_st 
        
        if i % 75 == 0:
            print("Processed: {:4d} by Thread: {:2d} | Average Time Per Pair: {:1.5f}s | Approx ETA: {:5.3f}s".format(i,id, av_time/75,(av_time/75)*(df.shape[0]-i)))
            av_time = 0

    print("Finished Thread:{:2d} | Time Taken:{} ".format(id, time.time() - start_time))
    return df


#### Processing the Pairs in Parallel to make pocessing faster. 

In [12]:
n_threads = 7

df_pieces = np.array_split(src_dest_pairs, n_threads)

pool = mp.Pool(n_threads)

arguments = []

for i in range(n_threads):
    arguments.append(i)
    arguments.append(df_pieces[i])

results = pool.map(getPaths, [[i, df_pieces[i]] for i in range(n_threads)])

pool.close()

Thread: 0 Processing 1267 Pairs
Thread: 1 Processing 1267 Pairs
Thread: 2 Processing 1267 Pairs
Thread: 4 Processing 1267 Pairs
Thread: 3 Processing 1267 Pairs
Thread: 5 Processing 1267 Pairs
Thread: 6 Processing 1266 Pairs
Processed:   0 by Thread: 3 | Average Time Per Pair:0.00721s | Approx ETA:9.136s
Processed:   0 by Thread: 4 | Average Time Per Pair:0.02769s | Approx ETA:35.084s
Processed:   0 by Thread: 5 | Average Time Per Pair:0.02754s | Approx ETA:34.890s
Processed:   0 by Thread: 6 | Average Time Per Pair:0.02754s | Approx ETA:34.863s
Processed:  75 by Thread: 5 | Average Time Per Pair:0.40815s | Approx ETA:486.515s
Processed:  75 by Thread: 6 | Average Time Per Pair:0.40814s | Approx ETA:486.099s
Processed:  75 by Thread: 4 | Average Time Per Pair:0.41361s | Approx ETA:493.018s
Processed: 150 by Thread: 5 | Average Time Per Pair:0.42110s | Approx ETA:470.367s
Processed: 150 by Thread: 6 | Average Time Per Pair:0.42595s | Approx ETA:475.361s
Processed: 150 by Thread: 4 | Aver

In [13]:
src_dest_results = pd.concat(results)
src_dest_results.head()

Unnamed: 0,Source,Destinations,Total,MoreThanOne,Processed
0,202.140.191.90,206.12.9.1,0.0,0.0,1.0
1,202.140.191.90,109.105.124.88,30.0,19.0,1.0
2,202.140.191.90,131.154.254.11,42.0,21.0,1.0
3,202.140.191.90,193.109.172.187,28.0,19.0,1.0
4,202.140.191.90,193.48.99.77,27.0,13.0,1.0


#### Looking for TimedOutPairs

In [27]:
print("Number of TimedOut Pairs: ", src_dest_results[src_dest_results['Total'] == 0].shape[0])

Number of TimedOut Pairs:  0


#### Fixing the TimedOut Pairs

In [22]:
for index, row in src_dest_results[src_dest_results['Total'] == 0].iterrows():
    result = topk(src_dest_results.iloc[index,0], src_dest_results.iloc[index,1])
    src_dest_results.iloc[index,2] = result[0]
    src_dest_results.iloc[index,3] = result[1]

print("Number of TimedOut Pairs: ", src_dest_results[src_dest_results['Total'] == 0].shape[0])

Unnamed: 0,Source,Destinations,Total,MoreThanOne,Processed


#### Adding a Column Containing the Number of Paths taken only once.

In [23]:
src_dest_results['Once'] = src_dest_results['Total'] - src_dest_results['MoreThanOne']

#### Saving data to file

In [32]:
src_dest_results.to_csv("../Results/SourceDestianonPairs.csv",index=False)