# SETUP

### Install and Upgrade `pip` packages if needed

In [None]:
!pip install --upgrade --user pip
!pip install gitpython
!pip install pyvis

### Import modules in this Jupyter Notebook

In [69]:
#standard modules to use and manipulate dataframes
import numpy as np
import pandas as pd
#git to import needed repository
import git
#to substitute git
import requests, zipfile, io
#pathlib to run code against locations on disk
import pathlib
#re for regular expresion based extracts
import re
#module to copy a value a in a dataframe, didnt find an easier way
from itertools import cycle
#below modules to create the graphs and computations on the graphs
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
#for the select box
import ipywidgets as widgets
from ipywidgets import interact#, interact_manual

### Dataframe Display Formatting

In [None]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

# INGEST AND MANIPULATE DATA

### Git clone OSQuery repository

In [70]:
url = "https://github.com/osquery/osquery.git"
#Set dir
dir_git = re.search( r'^.*/(.+).git$', url, re.M|re.I).group(1)
p = pathlib.Path(dir_git)

In [71]:
if p.exists():
    print ("Git folder already exists. Skipping...")
else:
    print ("Git cloning folder...")
    git.Git(".").clone(url)


Git folder already exists. Skipping...


## TODO: need to work on this file structure needs to be removed

In [None]:
#Based on https://twitter.com/curi0usJack/status/1255702362225811457?s=20
#get the API data of the latest release
url_github_latest = "https://api.github.com/repos/osquery/osquery/releases/latest"
response = requests.get(url_github_latest).json() 
  
#response was in json and put in dict so can be called easily
url_github_dl = response['zipball_url']

#get zipped content and unzip
github_content = requests.get(url_github_dl, stream=True)
zippedcontent = zipfile.ZipFile(io.BytesIO(github_content.content))
#zippedcontent.namelist()

listOfFileNames = zippedcontent.namelist()
# Iterate over the file names
for fileName in listOfFileNames:
   # Check filename endswith csv
   if fileName.endswith('.table'):
       # Extract a single file from zip
       zippedcontent.extract(fileName, 'osquery-tables')
#zippedcontent.extractall()

### Extract table names and columns from the OSQuery table files

In [72]:
def osquery_table_extract(dir_git):
    table_columns = []
    for path in p.rglob("*.table"):
        if path.is_file() and 'example' not in path.stem:
            cf = open(path, "r", encoding="utf-8").read()
            tline = re.findall(r'table_name\(\"(\w+)\".*\)',cf)
            #below line is used to find all columns that do not have attribute hidden=True
            clines = re.findall(r'Column\(\"(\w+?)\".+?(\n)?.+? (?!hidden=True)\S+\),$',cf,re.M)
            #regex returns tuples because of the multiline matching so with list comprehesion turning it back in a list
            clines = [i[0] for i in clines]
            tcList =  list(zip(cycle(tline),clines))
        table_columns.append(tcList)
    return(table_columns)

Get all data from function and put in DF.

In [73]:
extract = osquery_table_extract(dir_git)
extract_df = pd.DataFrame([t for lst in extract for t in lst], columns = ['Table','Column'])

#Check count of tables to be sure all have been processed.
#Difference in count table names against osquery website, I filtered out example.table as it is just an example table.
#print('Tables', extract_df.Table.nunique())
extract_df

Unnamed: 0,Table,Column
0,arp_cache,address
1,arp_cache,mac
2,arp_cache,interface
3,arp_cache,permanent
4,atom_packages,name
...,...,...
2127,yara_events,matches
2128,yara_events,count
2129,yara_events,strings
2130,yara_events,tags


### Add OS to table based on cMakelists file

Keep working on crashes vs windows_crashes issue otherwise workaround change the data manually

In [74]:
def osquery_table_os(tname):
    tname_cmake = 0
    with open(str(dir_git)+'/specs/cMakelists.txt', 'r') as read_obj:
        for line in read_obj:
            if tname in line:
                if '/' in line:
                    #used to delete part of string that can give double matches bases on tables names that have same start or end.
                    tname_cmake = re.search( r'^.*/(.*)$', line, re.M|re.I).group(1)
                else:
                    tname_cmake = line.strip()
    return(tname_cmake)

In [75]:
tname_list = []
for tname in extract_df['Table']:
    table_os = osquery_table_os(tname)
    if ':' in str(table_os):
        t_os = re.search(r'^.+?:(.+?)"$', table_os, re.M|re.I).group(1)
        tname_list.append(t_os)
    elif '0' in str(table_os):
        #not in the cmake list file but on the website so manually add the OS
        t_os = 'no_os'
        tname_list.append(t_os)
    else:
        t_os = 'linux,macos,freebsd,windows'
        tname_list.append(t_os)
    
extract_df_os = extract_df
extract_df_os['OS'] = tname_list 

Workaround for issue that some tables did not get the correct OS assignment, fix this later in the re.search in the osquery_table_os function.  
Tables that have no_os need manual assignment also

In [76]:
extract_df_os.loc[extract_df_os.Table == 'crashes', 'OS'] = 'macos'
extract_df_os.loc[extract_df_os.Table == 'azure_instance_metadata', 'OS'] = 'linux,macos,freebsd,windows'
extract_df_os.loc[extract_df_os.Table == 'azure_instance_tags', 'OS'] = 'linux,macos,freebsd,windows'
extract_df_os.loc[extract_df_os.Table == 'wifi_survey', 'OS'] = 'macos'

#to check if there are any no_os from the CmakeLists file, if so make the changes manually based on osquery schema website
extract_df_os.loc[extract_df_os.OS == 'no_os']

Unnamed: 0,Table,Column,OS


### Create separate dataframes based on OS

In [104]:
column_edge = 'Column'
column_ID = 'Table'

#Windows
windows_df = extract_df_os.loc[pd.np.where(extract_df_os.OS.str.contains("windows"))]
#print(len(windows_df))

#Linux
linux_df = extract_df_os.loc[pd.np.where(extract_df_os.OS.str.contains("linux"))]
#print(len(linux_df))

#MacOS
macos_df = extract_df_os.loc[pd.np.where(extract_df_os.OS.str.contains("macos"))]
#print(len(macos_df))

#FreeBSD
freebsd_df = extract_df_os.loc[pd.np.where(extract_df_os.OS.str.contains("freebsd"))]
#print(len(freebsd_df))

#windows_df.tail(10)

# GRAPHS

Check the most common `Columns` to filter out common names that will not be able to JOIN.  
Used below to start creating the ignore_list, finished it visually checking the paths.  

In [85]:
#ignore_list to filter out Columns of no interest later on before creating the graph
ignore_list = ['name','path','type','version','size','version','description','status','state','label','class','source','device','mode','value','result','hardware_model','manufacturer','query','model','device_id','action','script_text','time','enabled',
               'date','caption','publisher','active','autoupdate','flags','comment','data','registry','author','directory','license','summary','permissions'] 
#key? Op dit moment uitgefilterd tot alleen >= 10x, misschien toch path erbij maar voor nu nog niet gedaan veel ruis ,'path' voorbeeld http://www.osdfcon.org/presentations/2016/Facebook-osquery.pdf

### Determine shortest paths from A to B

In [64]:
def shortest_path(df_OS,s,t):
    P = nx.from_pandas_edgelist(df=df_OS, source='Table', target='Column')
    path_list = []
    if nx.has_path(P, s, t):
        path_list = nx.shortest_path(P, source=s, target=t)
    else:
        print('No path available.')
    
    Q = P.subgraph(path_list)
    return (Q, path_list)

In [67]:
# TODO: add other OS filtered tables, maybe create a dropdown for OS
windows_df_filtered_tb = windows_df[~windows_df['Column'].isin(ignore_list)]

@interact
def corr_graph(Source=list(windows_df_filtered_tb.Table.unique()), Destination=list(windows_df_filtered_tb.Table.unique())):
    #print(f"testing", column1)
    sp_graph, sp_list = shortest_path(windows_df_filtered_tb,Source,Destination) #sp_graph, 
    if sp_list is not None:
        #print([p for p in sp_list])
        sp_gr=Network(notebook=True, bgcolor="#222222", font_color="white")
        sp_gr.add_nodes(sp_list, title=sp_list)# , value='1', color='b'
        sp_gr.barnes_hut()
        sp_gr.from_nx(sp_graph)
        return(sp_gr.show("sp_graph.html"))

interactive(children=(Dropdown(description='Source', options=('arp_cache', 'azure_instance_metadata', 'azure_i…

### Function to create the graph and all its properties 

In [83]:
def create_OS_graph(df_OS):
    #create nx node graph form dataframe
    G = nx.from_pandas_edgelist(df=df_OS, source='Table', target='Column')
    #initiliaze lists to use for append
    colors = []
    sizes = []
    selected_nodes_list = []
    selected_nodes_list_H = []
    
    #calculate all degrees of separation for the nodes, so how many connection does each node have
    degree = nx.degree(G)

    #iterate through nodes, if node in Table column than check if node has connections, if so add to list, if no connections discard.
    #if node not in Table list than it would be in the Column table and if has more than 1 connection than at to list
    for node in G:
        if node in df_OS.Table.values:
            if (degree(node) > 0):
                #print('Table ' + node + ' ' + str(degree(node)))
                selected_nodes_list.append(node)
        else: 
            #column has always at least connection to it's own table that is why need to check for more than 1 connection
            if (degree(node) > 1):
                #print('Column ' + node + ' ' + str(degree(node)))
                selected_nodes_list.append(node)

    #create subgraph based on filtering above
    H = G.subgraph(selected_nodes_list)
    degree_H = nx.degree(H)
    
    #run the same logic as above to filter out tables and columns that were left after removals above
    #also add color dependent on column and size dependent on how many connections
    for node in H:
        if node in df_OS.Table.values:
            if (degree_H(node) == 1):
                #print('Table ' + node + ' ' + str(degree_H(node)))
                selected_nodes_list_H.append(node)
                colors.append("orange") #lightblue
                sizes.append(300)
            elif (degree_H(node) > 1):
                selected_nodes_list_H.append(node)
                colors.append("lightgreen")
                sizes.append(H.degree(node) * 700)
        else: 
            if (degree_H(node) > 1):
                #print('Column ' + node + ' ' + str(degree_H(node)))
                selected_nodes_list_H.append(node)
                colors.append("red")
                sizes.append(H.degree(node) * 1000)

    I = H.subgraph(selected_nodes_list_H)
    return(I, colors, sizes, selected_nodes_list_H)

In [89]:
column_count = extract_df['Column'].value_counts()
column_for_joins = column_count[column_count > 1]
column_for_joins.head(10)

name           90
path           73
type           44
version        43
size           28
uid            28
pid            25
time           22
description    22
key            21
Name: Column, dtype: int64

### ALL GRAPH

In [81]:
OS_graph, colors, sizes, nodelist = create_OS_graph(extract_df_os)

#print(OS_graph.number_of_nodes())
#print(OS_graph.number_of_edges())

gr=Network(height=800, width=1200, notebook=True, bgcolor="#222222", font_color="white")
gr.add_nodes(nodelist, value=sizes, title=nodelist, color=colors)
gr.barnes_hut()
gr.from_nx(OS_graph)
gr.show("osquery_tables_OS_ALL_graph.html")

### Windows Graph

In [92]:
windows_df_filtered = windows_df[~windows_df['Column'].isin(ignore_list)]
OS_graph, colors, sizes, nodelist = create_OS_graph(windows_df_filtered)

print('Nodes:', OS_graph.number_of_nodes(), 'Edges:', OS_graph.number_of_edges())

gr=Network(height=800, width=1200, notebook=True, bgcolor="#222222", font_color="white")
gr.add_nodes(nodelist, value=sizes, title=nodelist, color=colors)
gr.barnes_hut()
gr.from_nx(OS_graph)
gr.show("osquery_tables_OS_win_graph.html")

Nodes: 84 Edges: 98


### Linux Graph

In [None]:
linux_df_filtered = linux_df[~linux_df['Column'].isin(ignore_list)]
#linux_df_uid=linux_df[linux_df.Column.str.contains('uid')]
OS_graph, colors, sizes, nodelist = create_OS_graph(linux_df_filtered)
#nx.draw_networkx(OS_graph,with_labels=True,node_color=colors,node_size=sizes)

print('Nodes:', OS_graph.number_of_nodes(), 'Edges:', OS_graph.number_of_edges())

gr=Network(height=800, width=1000, notebook=True, bgcolor="#222222", font_color="white")
gr.add_nodes(nodelist, value=sizes, title=nodelist, color=colors)
gr.barnes_hut()
gr.from_nx(OS_graph)
gr.show("osquery_tables_OS_lin_graph.html")

### MacOS Graph

In [None]:
macos_df_filtered = macos_df[~macos_df['Column'].isin(ignore_list)]
#macos_df_uid=macos_df[macos_df.Column.str.contains('uid')]

OS_graph, colors, sizes, nodelist = create_OS_graph(macos_df_filtered)
#nx.draw_networkx(OS_graph,with_labels=True,node_color=colors,node_size=sizes)

print('Nodes:', OS_graph.number_of_nodes(), 'Edges:', OS_graph.number_of_edges())
#print(OS_graph.number_of_edges())

gr=Network(height=800, width=1000, notebook=True, bgcolor="#222222", font_color="white")
gr.add_nodes(nodelist, value=sizes, title=nodelist, color=colors)
gr.barnes_hut()
gr.from_nx(OS_graph)
gr.show("osquery_tables_OS_mac_graph.html")

### FreeBSD Graph

In [None]:
freebsd_df_filtered = freebsd_df[~freebsd_df['Column'].isin(ignore_list)]
#freebsd_df_uid=freebsd_df[freebsd_df.Column.str.contains('uid')]

OS_graph, colors, sizes, nodelist = create_OS_graph(freebsd_df_filtered)

print('Nodes:', OS_graph.number_of_nodes(), 'Edges:', OS_graph.number_of_edges())

gr=Network(height=800, width=1000, notebook=True, bgcolor="#222222", font_color="white")
gr.add_nodes(nodelist, value=sizes, title=nodelist, color=colors)
gr.barnes_hut()
gr.from_nx(OS_graph)
gr.show("osquery_tables_OS_freebsd_graph.html")

### All shortest paths WIP

In [112]:
def shortest_path_source(df_OS,s):
    P = nx.from_pandas_edgelist(df=df_OS, source='Table', target='Column')
    path_list_s = []
    #if nx.has_path(P, s, t):
    path_list_s = nx.single_source_shortest_path(P, source=s, cutoff=2)
    #else:
    #   print('No path available.')
    
    Q = P.subgraph(path_list_s)
    return (Q, path_list_s)

In [113]:
# TODO: add other OS filtered tables, maybe create a dropdown for OS
windows_df_filtered_tb = windows_df[~windows_df['Column'].isin(ignore_list)]

@interact
def corr_graph_source(Source=list(windows_df_filtered_tb.Table.unique())):
    #print(f"testing", column1)
    sp_graph_s, sp_list_s = shortest_path_source(windows_df_filtered_tb,Source) #sp_graph, 
    #if sp_list is not None:
        #print([p for p in sp_list])
    sp_gr_s=Network(notebook=True, bgcolor="#222222", font_color="white")
    #sp_gr_s.add_nodes(sp_list_s, title=sp_list_s)# , value='1', color='b'
    sp_gr_s.barnes_hut()
    sp_gr_s.from_nx(sp_graph_s)
    return(sp_gr_s.show("sp_graph_s.html"))

interactive(children=(Dropdown(description='Source', options=('arp_cache', 'azure_instance_metadata', 'azure_i…

In [None]:
#Source:
#https://towardsdatascience.com/getting-started-with-graph-analysis-in-python-with-pandas-and-networkx-5e2d2f82f18e
#https://stackoverflow.com/questions/55342586/assign-color-to-networkx-node-based-on-column-name
#https://pyvis.readthedocs.io/en/latest/tutorial.html