# Create Dataset for DeepBugs wrong binary operand
---

In [None]:
import pandas as pd
from pathlib import Path
from multiprocessing import Pool, cpu_count
from typing import List, Dict, Union
from tqdm.notebook import trange, tqdm
import json
import codecs
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import random
import numpy as np
import os
from collections import Counter

# Root directory holding all benchmark inputs and outputs used by this notebook
benchmark_dir = '../benchmarks'

# Path to the dataset files where bugs were seeded
# (data_path is the pickled cache, data_dir the directory searched for '*.json' files)
data_path = os.path.join(benchmark_dir,'binOps_data.pkl')
data_dir = os.path.join(benchmark_dir, 'binOps_data')

# Path to the files after seeding the bugs
# (same layout: pickled cache plus directory of '*.json' files)
wrong_binary_operand_path = os.path.join(benchmark_dir,'binOps_wrong_operand.pkl')
wrong_binary_operand_dir = os.path.join(benchmark_dir, 'binOps_wrong_operand')

In [None]:
def read_json_file(json_file_path)->Dict:
    """Load a UTF-8 encoded JSON file and return its parsed content.

    This is a best-effort reader: it never raises.

    :param json_file_path: path (str or Path) of the JSON file to read
    :return: the parsed JSON content, or an empty dict when the file is
        missing, empty, or cannot be decoded/parsed
    """
    try:
        # The context manager guarantees the handle is closed, even if
        # reading or parsing fails (the handle used to be leaked).
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            return json.loads(json_file.read())
    except FileNotFoundError:
        print(f"File not found: {json_file_path}. Please provide a correct file path.")
        return {}
    except Exception:
        # Empty JSON file most likely due to abrupt killing of the process while writing
        return {}

def read_dataset_given_files(extracted_data_files: List) -> pd.DataFrame:
    """Read many JSON files in parallel and collect their records in one DataFrame.

    Every file is parsed by ``read_json_file`` in a worker process; the parsed
    content of each file is appended with ``list.extend``, so results arrive
    in completion order rather than input order.

    :param extracted_data_files: paths of the JSON files to read
    :return: DataFrame built from all collected records
    """
    records = []
    with Pool(cpu_count()) as pool, tqdm(total=len(extracted_data_files)) as progress:
        progress.set_description_str(
            desc="Reading dataset from files", refresh=False)
        # Chunks of 20 paths are handed to each worker at a time
        parsed_files = pool.imap_unordered(read_json_file, extracted_data_files, 20)
        for file_records in parsed_files:
            progress.update()
            records.extend(file_records)
        pool.close()
        pool.join()
    return pd.DataFrame(records)

def file_path_to_dataset(dataset_file_path, dir_path):
    """Return the dataset cached at *dataset_file_path*, building the cache if needed.

    When the gzip-compressed pickle does not exist yet, every ``*.json`` file
    found recursively under *dir_path* is read, combined into a DataFrame and
    saved to *dataset_file_path*. Otherwise the pickle is loaded directly.

    :param dataset_file_path: path of the gzip-compressed pickle cache
    :param dir_path: directory searched recursively for ``*.json`` files
    :return: the dataset as a pandas DataFrame
    """
    if Path(dataset_file_path).is_file():
        print(f'Reading from {dataset_file_path}')
        # NOTE(review): unpickling executes arbitrary code; only load caches
        # that were produced by this notebook.
        dataset = pd.read_pickle(dataset_file_path, compression='gzip')
    else:
        file_paths = list(Path(dir_path).rglob('*.json'))
        print(f"Number of files={len(file_paths)}")
        dataset = read_dataset_given_files(extracted_data_files=file_paths)
        print(f"Saving {dataset_file_path}")
        # `compression` is passed by keyword: positional use is deprecated in pandas
        dataset.to_pickle(dataset_file_path, compression='gzip')
    print(f"Dataset contains {len(dataset)} examples")
    return dataset

In [None]:
# Seeded-bug dataset: read from the pickle cache if it exists, otherwise built from the JSON files
wrongbinOpndSeeded = file_path_to_dataset(dataset_file_path=wrong_binary_operand_path, dir_path=wrong_binary_operand_dir)
# Keep the original 'src' value under the name 'file'; a new 'src' column is assigned further below
wrongbinOpndSeeded.rename(columns={"src": "file"}, inplace=True)
# Dataset extracted from the files in which the bugs were seeded
binopData = file_path_to_dataset(dataset_file_path=data_path, dir_path=data_dir)

In [None]:
# Display the dataset loaded from data_path for visual inspection
binopData

#### Process the seeded bugs to extract the location of seeding

In [None]:
# Cache for the seeded-bug dataset enriched with locations: same path with a '_withloc.pkl' suffix
wrong_binary_operand_loc_path = wrong_binary_operand_path.replace('.pkl','_withloc.pkl')

In [None]:
def read_file_content(file_path: Path) -> Union[List, Dict]:
    """Parse the UTF-8 JSON file at *file_path*.

    :param file_path: location of the JSON file (converted with ``str``)
    :return: the parsed JSON value; an empty list if the file does not exist
        (a message is printed in that case) or if reading/parsing raises a
        ``ValueError``
    """
    try:
        with codecs.open(str(file_path), 'r', encoding='utf-8') as handle:
            return json.loads(handle.read())
    except FileNotFoundError:
        print(f'Not found {file_path} ')
    except ValueError:
        # Malformed content is deliberately treated like an empty result
        pass
    return []

def get_location_of_seeded(row):
    """Look up where, in the original file, the bug of *row* was seeded.

    The metadata file is located by taking the part of ``row['file']`` before
    ``' :'``, prefixing ``'../'`` and appending ``'on'``
    (presumably turning a ``.js`` path into the matching ``.json`` -- confirm).

    :param row: a row of the seeded-bug dataset; only ``row['file']`` is read
    :return: tuple ``(location, range)`` where *location* is
        ``'<file_name_where_intended> : <line> - <line>'`` and *range* is the
        ``target_line_range['range']`` entry of the metadata
    """
    seeded_file = row['file'].split(' :')[0]
    metadata = read_file_content('../' + seeded_file + 'on')

    intended_file = metadata['file_name_where_intended']
    target = metadata['target_line_range']

    # '6-6' becomes '6 - 6'
    line_span = ' - '.join(target['line'].split('-'))

    # Represents the range of the source and not of the seeded bug
    source_range = target['range']

    return intended_file + ' : ' + line_span, source_range

# Compute the location of every seeded bug once and cache the result on disk.
if not Path(wrong_binary_operand_loc_path).is_file():
    rows_iter = (row for _, row in wrongbinOpndSeeded.iterrows())
    locations = []
    ranges_source = []
    with Pool(cpu_count()) as p:
        with tqdm(total=len(wrongbinOpndSeeded)) as pbar:
            pbar.set_description_str(
                    desc="Getting locations", refresh=False)
            # imap keeps the input order (required, since results are assigned
            # back as columns) but yields results as they become available, so
            # the progress bar advances while the work is running. Pool.map
            # only returns once everything has finished.
            for loc, ranges_src in p.imap(get_location_of_seeded, rows_iter, 10):
                locations.append(loc)
                ranges_source.append(ranges_src)
                pbar.update()
            p.close()
            p.join()
    # 'src' now points to the location in the original file
    wrongbinOpndSeeded['src'] = locations
    wrongbinOpndSeeded['range'] = ranges_source
    print(f'Saving to {wrong_binary_operand_loc_path}')
    # `compression` is passed by keyword: positional use is deprecated in pandas
    wrongbinOpndSeeded.to_pickle(wrong_binary_operand_loc_path, compression='gzip')

In [None]:
# Always reload from the cache so that this cell also works when the cache already existed
wrongbinOpndSeeded_loc=pd.read_pickle(wrong_binary_operand_loc_path, 'gzip')

In [None]:
# Display the seeded-bug dataset with locations for visual inspection
wrongbinOpndSeeded_loc

## Create the dataset for DeepBugs

Map the location from seeded bugs to the original files.
First remove the duplicates from both datasets and then merge.

In [None]:
# Output path of the merged (non-buggy + buggy) dataset pickle
merged_out_path = os.path.join(benchmark_dir,'dataset_for_deepbugs_binOpnd4.pkl')

In [None]:
# String form of 'range' so that the two datasets can be compared with plain equality
binopData['range_str'] = binopData['range'].map(str)
wrongbinOpndSeeded_loc['range_str'] = wrongbinOpndSeeded_loc['range'].map(str)

In [None]:
# wrongbinOpndSeeded_loc = wrongbinOpndSeeded_loc.drop(columns=['file'])

### Get the corresponding row from the target files where the bug was seeded

For each seeded bug, find the exact line and range in the extracted data.
Unfortunately, we cannot always find the exact location because of formatting issues.
We have tried every available way of formatting the files, but in some cases an identical formatting cannot be obtained.
This offsets the line and range in one of the files, which makes it impossible to pinpoint the exact location where
the bug was seeded. As a result, we lose a lot of seeded bugs.

In [None]:
def get_correspondig_buggy_row(row):
    """Find the row of ``binopData`` from which the seeded bug *row* was derived.

    A candidate must have the same ``src`` and ``range_str`` as *row*, the
    same operator, and a different (left, right) operand pair.

    :param row: a row of the seeded-bug dataset; reads ``src``, ``range_str``,
        ``left``, ``right`` and ``op``
    :return: the index label of the first matching ``binopData`` row, or -1
        when there is no match
    """
    same_locs = binopData[binopData['src']==row['src']]
    for _, data_row in same_locs.iterrows():
        if data_row['range_str'] != row['range_str']:
            continue
        # Compare the operands as a pair. Concatenating them into one string
        # is ambiguous ('a' + 'bc' == 'ab' + 'c') and could hide a real
        # operand difference.
        seeded_operands = (str(row['left']), str(row['right']))
        original_operands = (str(data_row['left']), str(data_row['right']))
        if seeded_operands != original_operands and row['op']==data_row['op']:
            return data_row.name
    return -1

In [None]:
# For every seeded bug, the index label of its original row in binopData
# (-1 when not found). The result is cached as JSON because it is slow to compute.
corresponding_row_file_path = os.path.join(benchmark_dir, 'binOpnd4_correct_rows.json')
cor_row = []

if not Path(corresponding_row_file_path).is_file():
    rows_iter = [row for _, row in wrongbinOpndSeeded_loc.iterrows()]

    # Only a third of the cores is used, but never fewer than one worker:
    # Pool(0) raises ValueError on machines with fewer than three CPUs.
    n_workers = max(1, cpu_count() // 3)
    with Pool(n_workers) as p:
        with tqdm(total=len(rows_iter)) as pbar:
            pbar.set_description_str(desc="Extracting location", refresh=False)
            # imap preserves the input order (cor_row is assigned back as a
            # column) and yields incrementally, so the progress bar advances
            # during the computation instead of after it.
            for rw_num in p.imap(get_correspondig_buggy_row, rows_iter, 10):
                cor_row.append(rw_num)
                pbar.update()
            p.close()
            p.join()

    with open(corresponding_row_file_path, 'w+') as f:
        json.dump(cor_row, f)
else:
    with open(corresponding_row_file_path, 'r') as f:
        cor_row = json.load(f)

In [None]:
# Attach the matching binopData row label (or -1) to every seeded bug;
# cor_row must contain exactly one entry per row
wrongbinOpndSeeded_loc['corrsp_row'] = cor_row

In [None]:
# Discard seeded bugs for which no corresponding original row was found (marked with -1)
wrongbinOpndSeeded_loc = wrongbinOpndSeeded_loc[wrongbinOpndSeeded_loc['corrsp_row']!=-1]

In [None]:
# 'range_str' was only a helper for matching and is not part of the final dataset
wrongbinOpndSeeded_loc = wrongbinOpndSeeded_loc.drop(columns=['range_str'])

In [None]:
# Label every seeded example as buggy
wrongbinOpndSeeded_loc['probability_that_incorrect'] = 1

In [None]:
# Collect, for every seeded bug, its original (non-buggy) counterpart from binopData.
non_buggy_rows = []
rows_iter = [row for _, row in wrongbinOpndSeeded_loc.iterrows()]
for rw in tqdm(rows_iter):
    dr = rw['corrsp_row']
    # 'corrsp_row' stores index *labels* (taken from `data_row.name`), so the
    # lookup must be label-based. A positional `.iloc` lookup is only correct
    # while binopData has a default 0..n-1 index.
    r = binopData.loc[dr].to_dict()
    r['probability_that_incorrect'] = 0
    # Remember which seeded file this original example is paired with
    r['file'] = rw['file']
    non_buggy_rows.append(r)

In [None]:
# Display the remaining seeded bugs (those with a matching original row)
wrongbinOpndSeeded_loc

In [None]:
# merged_dataset = pd.read_pickle(merged_out_path,'gzip')

In [None]:
# `merged_dataset` is not defined anywhere earlier in this notebook (the cell
# that would load it is commented out), so a bare reference raises NameError.
# Show the merged dataset written by a previous run when it is available.
merged_dataset = (
    pd.read_pickle(merged_out_path, compression='gzip')
    if Path(merged_out_path).is_file()
    else None
)
merged_dataset

In [None]:
# Build the non-buggy frame and remove the helper column that was only needed for matching
non_buggy = pd.DataFrame(non_buggy_rows).drop(columns=['range_str'])
non_buggy

In [None]:
# 'corrsp_row' is bookkeeping only and must not end up in the final dataset
buggy = wrongbinOpndSeeded_loc.drop(columns= ['corrsp_row'])
# Non-buggy rows first, then buggy rows; both parts are in the same order
merged = pd.concat([non_buggy, buggy], ignore_index=True)
print(f"Size of dataset={len(merged)}")
print(f"Writing to {merged_out_path}")
# `compression` is passed by keyword: positional use is deprecated in pandas
merged.to_pickle(merged_out_path, compression='gzip')

#### Now create training and validation datasets

DeepBugs expects the training and validation datasets to be '.json' files. First split the merged dataset and then create the required datasets.

The dataset format looks like the following
```js
[
    [
        { // non-buggy
          "left": "ID:g",
          "right": "LIT:67",
          "op": ">",
          "leftType": "unknown",
          "rightType": "number",
          "parent": "IfStatement",
          "grandParent": "BlockStatement",
          "src": "benchmarks/data/data/1.js : 6 - 6",
          "probability_that_incorrect": 0
        },
        { // buggy
          "left": "ID:g",
          "right": "LIT:67",
          "op": ">=",
          "leftType": "unknown",
          "rightType": "number",
          "parent": "IfStatement",
          "grandParent": "BlockStatement",
          "src": "benchmarks/js_benchmark_seeded_bugs/1_SEMSEED_MUTATED_1.js : 6 - 6",
          "probability_that_incorrect": 1
        }
    ],
    [  ]
 ]
```

In [None]:
# Reload the merged dataset and split it by label (1 = buggy, 0 = non-buggy)
merged = pd.read_pickle(merged_out_path,'gzip')
buggy = merged[merged['probability_that_incorrect']==1]
non_buggy = merged[merged['probability_that_incorrect']==0]

In [None]:
# Report the number of examples per label
print(f'Buggy={len(buggy)}, Non-buggy={len(non_buggy)}')


In [None]:
# Spot check of one buggy example; the hard-coded position raises IndexError on smaller datasets
buggy.iloc[12220]

In [None]:
# Spot check of the non-buggy example at the same position; raises IndexError on smaller datasets
non_buggy.iloc[12220]

In [None]:
# Materialise the rows of both frames; they are paired up by position below
buggy_iter = [entry[1] for entry in buggy.iterrows()]
nbuggy_iter = [entry[1] for entry in non_buggy.iterrows()]

# Each dataset item is a two-element list: [buggy example, non-buggy example]
paired_rows = tqdm(zip(buggy_iter, nbuggy_iter), desc='creating dataset', total=len(buggy_iter))
dataset = [[bg.to_dict(), nbg.to_dict()] for bg, nbg in paired_rows]

In [None]:
# Spot check of one pair; the hard-coded position raises IndexError on smaller datasets
dataset[12220]

Split into training and validation dataset

# We only use training change pattern during seeding, so no need to filter

In [None]:
def write_json(content, out_file):
    """Serialise *content* as JSON into *out_file*, replacing any previous content.

    The target file name is printed before writing.

    :param content: JSON-serialisable object
    :param out_file: path of the file to write
    """
    with open(out_file, 'w+') as sink:
        print(f'Writing to {sink.name}')
        json.dump(content, sink)

In [None]:
# Persist all [buggy, non-buggy] pairs as a single JSON file
print(f'Size of dataset={len(dataset)}')
write_json(dataset,os.path.join(benchmark_dir, 'full_dataset_wrong_binopnd4.json'))

In [None]:
# Flatten the pairs into one row per example and overwrite the merged pickle with it
dataset_df = pd.DataFrame([b_o_n for td in dataset for b_o_n in td])
# `compression` is passed by keyword: positional use is deprecated in pandas
dataset_df.to_pickle(merged_out_path, compression='gzip')

# Select only those seeded bugs that are present in the training patterns

Use only the 'training' change patterns as mentioned in the paper. First read all
change patterns and then split it 80-20.

Next, select only those that conform to our change pattern selection.

In [None]:
def get_only_idf_lit_containing_patterns(all_changes):
    """Keep the change patterns that mention at least one identifier or literal.

    Not every bug-fix pattern can be used to seed bugs. A pattern is kept when
    the text ``'Idf_'`` or ``'Lit_'`` occurs in its space-joined ``'fix'`` or
    ``'buggy'`` tokens; every other pattern is discarded.

    :param all_changes: iterable of patterns, each with ``'fix'`` and
        ``'buggy'`` sequences of strings
    :return: list of the retained patterns, in their original order
    """
    def _mentions(pattern, marker):
        # Checks 'fix' first, then 'buggy', stopping at the first hit
        return any(marker in ' '.join(pattern[side]) for side in ('fix', 'buggy'))

    return [
        pattern
        for pattern in all_changes
        if _mentions(pattern, 'Idf_') or _mentions(pattern, 'Lit_')
    ]

In [None]:
# Load all change patterns; read_json_file returns an empty dict on failure, which yields 0 patterns here
all_change_patterns = read_json_file(os.path.join(benchmark_dir, 'bug_seeding_patterns_for_semantic_seeding.json'))
# Keep only the patterns that contain at least one identifier or literal
all_change_patterns = get_only_idf_lit_containing_patterns(all_change_patterns)
print(f'Found {len(all_change_patterns)} patterns')

In [None]:
# 80/20 split by position: the first 80% (rounded down) are training patterns, the rest validation patterns
l_len = len(all_change_patterns)*80 // 100
tr_patterns, val_patterns = all_change_patterns[:l_len], all_change_patterns[l_len:]
print(f'Number of training patterns = {len(tr_patterns)}, Number of validation patterns = {len(val_patterns)}')

We do not use the validation patterns here. Rather we will use them as examples of real bugs that *DeepBugs*
will try to find.

So now select only those seeded bugs that have been seeded using a *url* present in the training patterns.

### Create dataset where DeepBugs will seed artificial bugs


The previous dataset includes both the correct and the seeded bugs. Now, we discard the seeded bugs.

In [None]:
# One plain dict per row of binopData, i.e. the dataset without any seeded bug
complete_dataset_no_seeded_included = [
    rw.to_dict()
    for _, rw in tqdm(binopData.iterrows(), total=len(binopData))
]

In [None]:
# Persist the dataset that contains no seeded bugs
write_json(complete_dataset_no_seeded_included,os.path.join(benchmark_dir, 'full_dataset_wrong_binOpnd_no_seeded_included.json'))

## Create a combined dataset

In [None]:
# Start from a copy of the examples without seeded bugs ...
combined_dataset = [*complete_dataset_no_seeded_included]
# ... and append only the first element of every pair; the second element
# (d[1]) is intentionally left out.
combined_dataset.extend(
    pair[0] for pair in tqdm(dataset, desc='Creating combined dataset')
)

In [None]:
# Persist the combined dataset
print(f'Length of combined = {len(combined_dataset)}')
write_json(combined_dataset,os.path.join(benchmark_dir, 'full_dataset_wrong_binOpnd_combined.json'))

In [None]:
# Summary of the datasets built above. The previous message referenced
# tr_df_no_seeded_included, vl_df_no_seeded_included and
# dataset_no_seeded_included, none of which is defined in this notebook
# (NameError); it now reports the collections that actually exist.
print(f"No of examples without seeded bugs {len(complete_dataset_no_seeded_included)}, "
      f"seeded pairs {len(dataset)} and combined dataset {len(combined_dataset)}")

In [None]:
# Draw 10 random rows for a quick look (no seed is set, so the rows differ between runs)
sample = binopData.sample(10)
sample

In [None]:
# File part of 'src': everything before the first ':' with surrounding whitespace removed
sample['src'].apply(lambda location: location.split(':')[0].strip())