In [1]:
import pandas as pd
import os

In [11]:
# Root directory for every input of this notebook; the combined output file
# will end up here as well. It defaults to the original author's local
# checkout -- set the STAT_ANALYSIS_DIR environment variable to run the
# notebook elsewhere without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# the paths below are built by plain string concatenation.
path_stem = os.path.join(
    os.environ.get("STAT_ANALYSIS_DIR") or "/Users/nandini/Desktop/CS230/CS230/stat_analysis/",
    "",
)

# Directory containing pylint output - download from https://drive.google.com/open?id=1DLiYPP594fUsGQsNAM5lU_b_-xQ2xxvg
pylint_path = path_stem + "Pylint_output/"

# Directory containing bandit output - download from https://drive.google.com/open?id=1lxTNIE0wMLc0Ixb6HhcQxgLReG_jTpA6
bandit_path = path_stem + "Bandit_output/"

# I. Build Pylint DataFrame

In [4]:
# Column sums of every pylint report, keyed by benchmark name.
pylint_totals = {}

for entry in os.listdir(os.fsencode(pylint_path)):
    report_name = os.fsdecode(entry)
    # The benchmark name is whatever precedes the "_pylint.txt" suffix.
    benchmark = report_name.split('_pylint.txt')[0]
    report = pd.read_csv(pylint_path + report_name)
    pylint_totals[benchmark] = report.sum()

# from_dict puts benchmarks in columns; transpose so each benchmark is a row.
pylint_df = pd.DataFrame.from_dict(pylint_totals).T
# Row-wise total over all summed columns.
pylint_df['TOTAL_PYLINT'] = pylint_df.sum(axis=1)

In [7]:
# Number of rows, i.e. benchmarks with a pylint report.
pylint_df.shape[0]

385

In [8]:
# Preview the first five benchmarks.
pylint_df.head(5)

Unnamed: 0,I,R,C,W,E,F,TOTAL_PYLINT
Radicale-1.1.1,0.0,39.0,56.0,69.0,78.0,0.0,242.0
trio-websockets-0.2,0.0,0.0,44.0,9.0,41.0,0.0,94.0
zhmcclient-0.20.0,0.0,556.0,1521.0,1071.0,116.0,0.0,3264.0
ddtrace-0.10.1,0.0,55.0,600.0,191.0,69.0,0.0,915.0
ZopeSkel-2.10,0.0,30.0,389.0,97.0,17.0,0.0,533.0


# II. Build Bandit DataFrame

In [9]:
# Bandit test ids, taken from https://github.com/PyCQA/bandit.
# Each pair is an inclusive (first, last) range of test numbers; the ids are
# generated in ascending order within each range, ranges in the order listed.
test_ids = [
    "B%d" % number
    for first, last in (
        (101, 112),
        (201, 201),
        (301, 325),
        (401, 413),
        (501, 507),
        (601, 611),
        (701, 703),
    )
    for number in range(first, last + 1)
]

In [15]:
# Build one row of bandit statistics per benchmark: total issue count,
# per-severity counts, and per-test-id counts (low-confidence issues excluded).
directory = os.fsencode(bandit_path)
dic = dict()

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)  # progress trace: one line per report processed
    # The benchmark name is whatever precedes the "_bandit.txt" suffix.
    benchmark_name = filename.split('_bandit.txt')[0]
    full_path = bandit_path + filename
    df = pd.read_csv(full_path)
    inner_dic = dict()

    # Filter out the low confidence issues
    df = df[df['issue_confidence'] != 'LOW']
    TOTAL_BANDIT = len(df)
    inner_dic['TOTAL_BANDIT'] = TOTAL_BANDIT

    # Count number of issues in each severity category
    HIGH_SEVERITY = len(df[df['issue_severity'] == 'HIGH'])
    MED_SEVERITY = len(df[df['issue_severity'] == 'MEDIUM'])
    LOW_SEVERITY = len(df[df['issue_severity'] == 'LOW'])
    inner_dic['HIGH_SEVERITY'] = HIGH_SEVERITY
    inner_dic['MED_SEVERITY'] = MED_SEVERITY
    inner_dic['LOW_SEVERITY'] = LOW_SEVERITY

    # Count the number of issues in each test category. A single
    # value_counts() pass replaces one boolean filter of the frame per id;
    # ids that never occur are absent from the result, hence the 0 default.
    id_counts = df['test_id'].value_counts()
    id_sum = 0
    for test_id in test_ids:
        id_count = int(id_counts.get(test_id, 0))
        inner_dic[test_id] = id_count
        id_sum += id_count

    # Sanity check: every remaining issue must belong to a known test id.
    # The counts are ints, so they are converted explicitly -- concatenating
    # them directly to a str raises TypeError on this very diagnostic path.
    if id_sum != TOTAL_BANDIT:
        print("ERROR: Script didn't catch all Bandit tests in " + benchmark_name)
        print("Number of issues caught: " + str(id_sum))
        print("Number of issues expected: " + str(TOTAL_BANDIT))
        print("==================================================================")

    # Append to dictionary of files
    dic[benchmark_name] = inner_dic

# from_dict puts benchmarks in columns; transpose so each benchmark is a row.
bandit_df = pd.DataFrame.from_dict(dic).T

In [17]:
# Number of rows, i.e. benchmarks with a bandit report.
bandit_df.shape[0]

491

In [16]:
# Preview the first five benchmarks.
bandit_df.head(5)

Unnamed: 0,B101,B102,B103,B104,B105,B106,B107,B108,B109,B110,...,B609,B610,B611,B701,B702,B703,HIGH_SEVERITY,LOW_SEVERITY,MED_SEVERITY,TOTAL_BANDIT
restkit-4.2.2,30,2,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,45,7,53
streamsx.objectstorage-1.1.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
recurly-2.6.2,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,9,15,24
peewee-2.9.2,4,1,0,0,4,0,0,0,0,1,...,0,0,0,0,0,0,4,13,8,25
drf-tracking-1.2.0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,7,0,7


# III. Combine Pylint and Bandit DataFrames

In [61]:
# Join the bandit columns onto the pylint rows, matching on the
# benchmark-name index, then order the rows by that index.
combined_df = (
    pylint_df
    .join(bandit_df)
    .sort_index()
)
combined_df.head(5)

Unnamed: 0,I,R,C,W,E,F,TOTAL_PYLINT,B101,B102,B103,...,B609,B610,B611,B701,B702,B703,HIGH_SEVERITY,LOW_SEVERITY,MED_SEVERITY,TOTAL_BANDIT
AuthBWC-0.1.3,0.0,27.0,628.0,65.0,147.0,0.0,867.0,253,0,0,...,0,0,0,0,0,0,0,258,0,258
Beaker-0.9.3,0.0,34.0,443.0,49.0,48.0,0.0,574.0,1,0,0,...,0,0,0,0,0,0,1,9,5,15
CLAM-0.9.9.1,31.0,130.0,2990.0,1665.0,88.0,0.0,4904.0,32,2,1,...,0,0,0,0,0,0,24,65,81,170
CairoSVG-1.0.20,0.0,33.0,250.0,20.0,4.0,0.0,307.0,0,0,0,...,0,0,0,0,0,0,0,3,5,8
CherryMusic-0.35.2,0.0,185.0,1494.0,942.0,280.0,0.0,2901.0,130,0,2,...,0,0,0,0,0,0,3,148,12,163


In [62]:
# Row count after combining the pylint and bandit tables.
combined_df.shape[0]

385

# IV. Add a column for benchmark size

In [63]:
# The size file has no header row: column 0 becomes the index and column 1
# is given the name BENCHMARK_SIZE.
size_table = pd.read_csv(path_stem + "benchmark_size.csv", header=None)
size_table = size_table.set_index(0)
benchmark_size = size_table.rename({1: 'BENCHMARK_SIZE'}, axis='columns')
benchmark_size.head(5)

Unnamed: 0_level_0,BENCHMARK_SIZE
0,Unnamed: 1_level_1
aiida-core-0.12.2,9256
aiocouchdb-0.5.0,412
aiohttp-0.16.2,8316
aioli-0.0.4,72
aldryn-django-1.6.11.1,128


In [65]:
# Attach the size column(s), matching rows on the benchmark-name index.
# Columns left over from a previous run of this cell are dropped first:
# otherwise re-running the cell makes join() raise on the overlapping
# column name, because combined_df is reassigned in place.
combined_df = (
    combined_df
    .drop(columns=benchmark_size.columns, errors='ignore')
    .join(benchmark_size)
)
combined_df.head()

Unnamed: 0,I,R,C,W,E,F,TOTAL_PYLINT,B101,B102,B103,...,B610,B611,B701,B702,B703,HIGH_SEVERITY,LOW_SEVERITY,MED_SEVERITY,TOTAL_BANDIT,BENCHMARK_SIZE
AuthBWC-0.1.3,0.0,27.0,628.0,65.0,147.0,0.0,867.0,253,0,0,...,0,0,0,0,0,0,258,0,258,336
Beaker-0.9.3,0.0,34.0,443.0,49.0,48.0,0.0,574.0,1,0,0,...,0,0,0,0,0,1,9,5,15,192
CLAM-0.9.9.1,31.0,130.0,2990.0,1665.0,88.0,0.0,4904.0,32,2,1,...,0,0,0,0,0,24,65,81,170,1844
CairoSVG-1.0.20,0.0,33.0,250.0,20.0,4.0,0.0,307.0,0,0,0,...,0,0,0,0,0,0,3,5,8,200
CherryMusic-0.35.2,0.0,185.0,1494.0,942.0,280.0,0.0,2901.0,130,0,2,...,0,0,0,0,0,3,148,12,163,3392


In [66]:
# Row count after adding the size column.
combined_df.shape[0]

385

# V. Export the Combined DataFrame

In [67]:
# Write the combined table as CSV into the path_stem directory.
output_path = path_stem + "combined_df.txt"
combined_df.to_csv(output_path)