## Run Flawfinder

In [5]:
import pathlib
import subprocess
import pandas as pd
from io import StringIO
from os.path import join
import re
from tqdm import tqdm

#
#
files_root = '/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection'

files = []
for ext in ('*.c', '*.cpp', '*.cxx'):
    files.extend(pathlib.Path(files_root).rglob(ext))

ff_command = 'flawfinder --falsepositive --minlevel=3 --dataonly --quiet --csv '
ff = pd.DataFrame(columns=['path', 'line', 'cwes', 'context'])

for path in tqdm(files):
    # run flawfinder and catch csv output
    output = StringIO( subprocess.run(ff_command + str(path), shell=True, capture_output=True).stdout.decode('utf-8') )
    output = pd.read_csv(output)
    
    output.rename(columns={'File':'path', 'Line':'line', 'CWEs':'cwes', 'Context':'context'}, inplace=True)
    output = output[['path', 'line', 'cwes', 'context']]
    
    
    # skip if flawfinder finds nothing
    output = output[ output.cwes.notnull() ]
    if output.empty:
        continue
    
    # split findings of multiple CWEs
    #for cwes in output.cwes.str.split(', |/'):
    #    for cwe in cwes:
    #        output[cwe] = 1
    
    ff = pd.concat([ff, output], sort=False)

ff = ff.reset_index(drop=True)
ff = ff[ff.cwes.notnull()]
ff


  0%|          | 0/781 [00:00<?, ?it/s][A
  0%|          | 2/781 [00:00<01:13, 10.67it/s][A
  1%|          | 4/781 [00:00<01:12, 10.74it/s][A
  1%|          | 6/781 [00:00<01:11, 10.77it/s][A
  1%|          | 8/781 [00:00<01:12, 10.67it/s][A
  1%|▏         | 10/781 [00:00<01:11, 10.79it/s][A
  2%|▏         | 12/781 [00:01<01:11, 10.80it/s][A
  2%|▏         | 14/781 [00:01<01:10, 10.89it/s][A
  2%|▏         | 16/781 [00:01<01:10, 10.83it/s][A
  2%|▏         | 18/781 [00:01<01:10, 10.76it/s][A
  3%|▎         | 20/781 [00:01<01:10, 10.74it/s][A
  3%|▎         | 22/781 [00:02<01:10, 10.78it/s][A
  3%|▎         | 24/781 [00:02<01:09, 10.93it/s][A
  3%|▎         | 26/781 [00:02<01:09, 10.83it/s][A

KeyboardInterrupt: 

## Split found CWEs into separate label columns

In [6]:
import re

pd.set_option('max_colwidth', 250)
df = ff.copy()[['path', 'line', 'cwes', 'context']]

def extract_cwes(row):
    cwes = re.split(', |/', row.cwes)
    cwe_series =pd.Series()
    for cwe in cwes:
        #print(cwe)
        cwe = cwe.replace('!', '')
        cwe_series[cwe] = 1
    
    return cwe_series
    
pd.set_option('max_colwidth', 100)

df = pd.concat([df, df.apply(extract_cwes, axis='columns') ], axis='columns')
df

  


Unnamed: 0,path,line,cwes,context,CWE-327
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,265,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,209,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,91,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,201,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,265,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,274,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,269,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,100,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,267,CWE-327,srand( (unsigned)time(NULL) );,1
0,/mnt/md0/user/scheuererra68323/testset_jtt/C/testcases/CWE90_LDAP_Injection/CWE90_LDAP_Injection...,207,CWE-327,srand( (unsigned)time(NULL) );,1


## Results

In [7]:
print('flawfinder found {} faulty functions in {} files'.format(len(ff.index), len(files)))
print('found CWEs: {}'.format(len(df.columns) - 4))

flawfinder found 19 faulty functions in 781 files
found CWEs: 1



  3%|▎         | 26/781 [00:20<01:09, 10.83it/s][A

## Store labels

In [8]:
df.to_hdf('/mnt/md0/user/scheuererra68323/testset_jtt/JTT_Flawfinder_Labels.h5', key='JTT_Flawfinder_Labels')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->Index(['path', 'line', 'cwes', 'context'], dtype='object')]

  encoding=encoding,
