# Create dataset from real bugs extracted from GitHub
Here we go through the binary operations extracted from the commits of real bugs on GitHub. The goal is to create
a dataset in the format expected by DeepBugs.

In [None]:
import os
from pathlib import Path
import codecs
import json
from typing import List, Dict, Any
import pandas as pd
from multiprocessing import Pool, cpu_count
from tqdm.notebook import trange, tqdm
# Relative path of the directory that holds the benchmark data.
benchmarks_dir = '../benchmarks'

# Directory searched for the extracted JSON files, and the pickle file used
# to cache the dataset built from them (same base name plus '.pkl').
real_bugs_dataset_dir = os.path.join(benchmarks_dir, 'binops_real_bugs')
real_bugs_dataset_file_path = f'{real_bugs_dataset_dir}.pkl'

In [None]:
def read_json_file(json_file_path)->Dict:
    """Read and parse a UTF-8 encoded JSON file.

    Reading is best-effort: failures are swallowed so that one bad file does
    not abort a bulk read.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, or an empty dict when the file is missing,
        unreadable or does not contain valid JSON.
    """
    try:
        # The context manager closes the handle even when reading or parsing
        # fails, so no file descriptor is leaked.
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        print(
            "Please provide a correct file p. Eg. ./results/validated-conflicts.json")
        return {}
    except Exception:
        # Empty JSON file most likely due to abrupt killing of the process while writing
        return {}

def read_dataset_given_files(extracted_data_files: List) -> pd.DataFrame:
    """Parse the given JSON files in parallel and gather them into a DataFrame.

    Every file is read with ``read_json_file`` in a pool of ``cpu_count()``
    worker processes. The content of each file is appended to one flat list
    via ``extend`` (so each file is presumably a list of records; confirm
    against the extractor output). Results are consumed as they complete, so
    the row order does not follow the order of ``extracted_data_files``.

    Args:
        extracted_data_files: Paths of the JSON files to read.

    Returns:
        A DataFrame built from the collected entries.
    """
    records = []
    with Pool(cpu_count()) as pool, tqdm(total=len(extracted_data_files)) as progress:
        progress.set_description_str(
            desc="Reading dataset from files", refresh=False)
        # Files are handed to the workers in chunks of 20.
        parsed_files = pool.imap_unordered(read_json_file, extracted_data_files, 20)
        for file_entries in parsed_files:
            progress.update()
            records.extend(file_entries)
        pool.close()
        pool.join()
    return pd.DataFrame(records)

def file_path_to_dataset(dataset_file_path, dir_path):
    """Load the dataset from its pickle cache, building the cache if needed.

    When ``dataset_file_path`` is not an existing file, every ``*.json`` file
    found recursively under ``dir_path`` is read into a DataFrame, which is
    then saved to ``dataset_file_path`` as a gzip-compressed pickle.
    Otherwise the cached pickle is loaded directly.

    Args:
        dataset_file_path: Path of the gzip-compressed pickle cache.
        dir_path: Directory searched recursively for JSON files when the
            cache does not exist.

    Returns:
        The dataset as a pandas DataFrame.
    """
    if Path(dataset_file_path).is_file():
        print(f'Reading from {dataset_file_path}')
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced locally by this notebook.
        dataset = pd.read_pickle(dataset_file_path, compression='gzip')
    else:
        file_paths = list(Path(dir_path).rglob('*.json'))
        print(f"Number of files={len(file_paths)}")
        dataset = read_dataset_given_files(extracted_data_files=file_paths)
        print(f"Saving {dataset_file_path}")
        # `compression` is passed by keyword: positional use is deprecated in
        # recent pandas releases.
        dataset.to_pickle(dataset_file_path, compression='gzip')
    print(f"Dataset contains {len(dataset)} examples")
    return dataset

In [None]:
def get_file_loc(row):
    """Build a key that identifies a binary operation by file name and range.

    The ``benchmarks/real_bugs_github/buggy_`` prefix is removed from the
    ``src`` value when present; otherwise the
    ``benchmarks/real_bugs_github/correct_`` prefix is removed. The buggy and
    the correct version of the same file therefore yield the same key for the
    same range.

    Args:
        row: Object with a ``to_dict()`` method (e.g. a DataFrame row) whose
            dict has the keys ``src`` and ``range``.

    Returns:
        The stripped file name, an underscore and ``str(range)``.
    """
    record = row.to_dict()
    src = record['src']
    buggy_prefix = 'benchmarks/real_bugs_github/buggy_'
    correct_prefix = 'benchmarks/real_bugs_github/correct_'
    prefix = buggy_prefix if buggy_prefix in src else correct_prefix
    file_name = src.replace(prefix, '')
    # Named `location_range` so the builtin `range` is not shadowed.
    location_range = str(record['range'])
    return f'{file_name}_{location_range}'

In [None]:
# Load the dataset (or build and cache it), then tag every row with the key
# returned by get_file_loc so rows can later be matched on it.
dataset = file_path_to_dataset(
    dataset_file_path=real_bugs_dataset_file_path,
    dir_path=real_bugs_dataset_dir,
)
row_iter = [row for _, row in dataset.iterrows()]
locations = [get_file_loc(row) for row in tqdm(row_iter)]
dataset['filename_loc'] = locations

In [None]:
# Show the dataset as the cell output.
dataset

In [None]:
# Split the rows by whether their source path contains 'correct_' or 'buggy_'.
src_column = dataset['src']
is_correct = src_column.apply(lambda src: 'correct_' in src)
is_buggy = src_column.apply(lambda src: 'buggy_' in src)
correct_dataset = dataset[is_correct]
buggy_dataset = dataset[is_buggy]

In [None]:
# Pair each correct row with the buggy row that shares its 'filename_loc' key;
# overlapping columns get the '_CORRECT' / '_BUGGY' suffixes.
merged = pd.merge(
    correct_dataset,
    buggy_dataset,
    on='filename_loc',
    suffixes=('_CORRECT', '_BUGGY'),
)
merged

In [None]:
def get_buggy_non_buggy_data(row):
    """Split a merged row into its correct and buggy binary operations.

    Columns whose name contains ``_CORRECT`` form the correct entry and
    columns whose name contains ``_BUGGY`` form the buggy entry, with the
    marker removed from the key. Each entry also gets a
    ``probability_that_incorrect`` label: 0 for correct, 1 for buggy.

    Args:
        row: Object with a ``to_dict()`` method, e.g. a row of the merged
            DataFrame.

    Returns:
        ``[correct, buggy]`` when the two entries differ in ``left`` or
        ``right`` and share the same ``op``; otherwise an empty list.
    """
    columns = row.to_dict()

    def entry_for(marker, label):
        # Collect the columns carrying this marker, keyed without it.
        entry = {
            name.replace(marker, ''): value
            for name, value in columns.items()
            if marker in name
        }
        entry['probability_that_incorrect'] = label
        return entry

    correct = entry_for('_CORRECT', 0)
    buggy = entry_for('_BUGGY', 1)

    operands_differ = (
        correct['left'] != buggy['left'] or correct['right'] != buggy['right']
    )
    if not operands_differ:
        return []
    same_operator = correct['op'] == buggy['op']
    return [correct, buggy] if same_operator else []

In [None]:
# Keep only the merged rows that yield a [correct, buggy] pair, then expose
# the correct and buggy halves as separate lists.
x_y_pair_given = [
    pair
    for pair in (
        get_buggy_non_buggy_data(row)
        for _, row in tqdm(list(merged.iterrows()), desc='Get lines')
    )
    if pair
]
correct_bin_ops = [pair[0] for pair in x_y_pair_given]
buggy_bin_ops = [pair[1] for pair in x_y_pair_given]
print(f'Number of buggy/correct binOps extracted are {len(correct_bin_ops)}')

In [None]:
# Drop every pair in which either binary operation has an AwaitExpression as
# its parent or grandparent; print the pair count before and after.
print(len(x_y_pair_given))
filtered_x_y_pair = [
    pr
    for pr in x_y_pair_given
    if not (
        pr[0]['parent'] == 'AwaitExpression'
        or pr[0]['grandParent'] == 'AwaitExpression'
        or pr[1]['parent'] == 'AwaitExpression'
        or pr[1]['grandParent'] == 'AwaitExpression'
    )
]
x_y_pair_given = filtered_x_y_pair
print(len(x_y_pair_given))

We give the buggy lines as input to a trained model in DeepBugs and check how many are actually classified as buggy.
Then we confirm them with the correct extracted binops.

In [None]:
def write_json(content, out_file):
    """Serialize ``content`` as JSON and write it to ``out_file``.

    The content is serialized in memory before the file is opened, so a
    serialization failure neither truncates an existing file nor leaves a
    partially written one behind.

    Args:
        content: JSON-serializable object to store.
        out_file: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``content`` contains a value that is not
            JSON-serializable.
    """
    serialized = json.dumps(content)
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(serialized)

# write_json(correct_bin_ops, os.path.join(benchmarks_dir, 'correct_real_binops.json'))
# write_json(buggy_bin_ops, os.path.join(benchmarks_dir, 'buggy_real_binops.json'))
# Store the remaining [correct, buggy] pairs in the benchmarks directory.
correct_buggy_out_file = os.path.join(benchmarks_dir, 'correct_buggy_real_binops.json')
write_json(x_y_pair_given, correct_buggy_out_file)