# Create dataset for DeepBugs for wrong assignment bugs
---

In [None]:
import pandas as pd
from pathlib import Path
from multiprocessing import Pool, cpu_count
from typing import List, Dict, Union
from tqdm.notebook import trange, tqdm
import json
import codecs
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
import os
from collections import Counter

# Root directory that holds every benchmark artefact used in this notebook
benchmark_dir = '../benchmarks'

# Dataset files where bugs were seeded: the JSON directory and the pickle
# stored next to it under the same base name
data_dir = os.path.join(benchmark_dir, 'assignments_data')
data_path = data_dir + '.pkl'

# Files obtained after seeding the bugs: the JSON directory and its pickle
wrong_assignment_seeded_dir = os.path.join(benchmark_dir, 'assignments_wrong')
wrong_assignment_seeded_path = wrong_assignment_seeded_dir + '.pkl'

In [None]:
def read_json_file(json_file_path)->Dict:
    """Read a UTF-8 encoded JSON file and return its parsed content.

    The function is deliberately best-effort: it never raises.

    :param json_file_path: path of the JSON file to read
    :return: the parsed top-level JSON value, or an empty dict when the file
        does not exist (a message is printed) or cannot be read/parsed
    """
    try:
        # The context manager closes the handle even when reading or parsing
        # fails; this matters because the function is called for many files
        with codecs.open(json_file_path, 'r', encoding='utf-8') as json_file:
            return json.loads(json_file.read())
    except FileNotFoundError:
        print(
            f"{json_file_path} not found, provide a correct file path")
        return {}
    except Exception:
        # Empty JSON file most likely due to abrupt killing of the process while writing.
        # Kept broad on purpose so one damaged file does not abort a bulk read.
        return {}

def read_dataset_given_files(extracted_data_files: List) -> pd.DataFrame:
    """Parse the given JSON files in parallel and gather their entries in a DataFrame.

    Each file is handed to ``read_json_file`` in a worker process; whatever
    iterable it returns is appended to one flat list of records. Files are
    consumed in completion order, so the row order is not tied to the input order.

    :param extracted_data_files: paths of the JSON files to read
    :return: a DataFrame built from all collected records
    """
    records = []
    with Pool(cpu_count()) as pool:
        with tqdm(total=len(extracted_data_files)) as progress:
            progress.set_description_str(
                desc="Reading dataset from files", refresh=False)
            # Paths are dispatched to the workers in chunks of 20
            parsed_files = pool.imap_unordered(read_json_file, extracted_data_files, 20)
            for file_records in parsed_files:
                progress.update()
                records.extend(file_records)
            pool.close()
            pool.join()
    return pd.DataFrame(records)

def file_path_to_dataset(dataset_file_path, dir_path):
    """Return the dataset stored at ``dataset_file_path``, building it first if needed.

    When the gzip-compressed pickle does not exist yet, every ``*.json`` file
    found recursively under ``dir_path`` is read, combined into a DataFrame
    and saved to ``dataset_file_path``. Otherwise the pickle is loaded.

    :param dataset_file_path: path of the gzip-compressed pickle
    :param dir_path: directory searched recursively for JSON files when the
        pickle is missing
    :return: the dataset as a pandas DataFrame
    """
    if not Path(dataset_file_path).is_file():
        file_paths = list(Path(dir_path).rglob('*.json'))
        # Debug
        print(f"Number of files={len(file_paths)}")
        dataset = read_dataset_given_files(extracted_data_files=file_paths)
        print(f"Saving {dataset_file_path}")
        # Compression is passed by keyword: newer pandas releases deprecate
        # positional arguments other than the path for to_pickle
        dataset.to_pickle(dataset_file_path, compression='gzip')
    else:
        print(f'Reading from {dataset_file_path}')
        dataset = pd.read_pickle(dataset_file_path, compression='gzip')
    print(f"Dataset contains {len(dataset)} examples")
    return dataset


In [None]:
# Assignments extracted after bug seeding; their 'src' column is exposed as
# 'file' from here on
wrong_assignment_seeded = file_path_to_dataset(
    dataset_file_path=wrong_assignment_seeded_path,
    dir_path=wrong_assignment_seeded_dir,
).rename(columns={"src": "file"})

# Assignments extracted from the dataset files where bugs were seeded
assignments_data = file_path_to_dataset(
    dataset_file_path=data_path,
    dir_path=data_dir,
)


In [None]:
# Pickle path for the seeded dataset once the seeding locations have been
# added: same file name with '.pkl' replaced by '_withloc.pkl'
wrong_assignment_seeded_loc_path = wrong_assignment_seeded_path.replace('.pkl','_withloc.pkl')

In [None]:
def read_file_content(file_path: Path) -> Union[List, Dict]:
    """Return the parsed JSON content of a UTF-8 encoded file.

    :param file_path: location of the JSON file (converted with ``str``)
    :return: the decoded JSON value, or an empty list when the file does not
        exist or its content raises ``ValueError`` while being read or parsed
    """
    try:
        with codecs.open(str(file_path), 'r', encoding='utf-8') as handle:
            return json.loads(handle.read())
    except (FileNotFoundError, ValueError):
        # Missing or malformed files are reported as "no content"
        return []

def get_location_of_seeded(row):
    """Look up where a seeded bug lives in the file it was intended for.

    :param row: a dataset row whose 'file' entry starts with a path followed by ' :'
    :return: a tuple ``(location, range)`` where ``location`` is
        '<file_name_where_intended> : <line>' with every '-' in the line
        padded to ' - ', and ``range`` is taken from the seeding metadata
    """
    # The metadata path is the part of row['file'] before ' :' with 'on' appended
    # (presumably turning a '.js' suffix into '.json' — confirm against the data)
    metadata_path = '../' + row['file'].split(' :')[0] + 'on'
    bug_seeding_metadata = read_file_content(metadata_path)

    file_name = bug_seeding_metadata['file_name_where_intended']
    target = bug_seeding_metadata['target_line_range']
    spaced_line = target['line'].replace('-', ' - ')

    # The returned range represents the range of the source and not of the seeded bug
    return file_name + ' : ' + spaced_line, target['range']

# Compute the location and source range of every seeded bug, unless the
# '_withloc' pickle already exists from a previous run
if not Path(wrong_assignment_seeded_loc_path).is_file():
    rows_iter = (row for _, row in wrong_assignment_seeded.iterrows())
    locations = []
    ranges_source = []
    with Pool(cpu_count()) as p:
        with tqdm(total=len(wrong_assignment_seeded)) as pbar:
            pbar.set_description_str(
                    desc="Getting locations", refresh=False)
            # imap yields results one by one while preserving the input order,
            # so the bar advances as work completes; map would only return
            # after every row had been processed. Rows go out in chunks of 10.
            for loc, ranges_src in p.imap(get_location_of_seeded, rows_iter, 10):
                locations.append(loc)
                ranges_source.append(ranges_src)
                pbar.update()
            p.close()
            p.join()
    # Results are in row order, so they can be assigned as columns directly
    wrong_assignment_seeded['src'] = locations
    wrong_assignment_seeded['range'] = ranges_source
    print(f'Saving to {wrong_assignment_seeded_loc_path}')
    # Keyword argument: newer pandas releases deprecate positional arguments
    # other than the path for to_pickle
    wrong_assignment_seeded.to_pickle(wrong_assignment_seeded_loc_path, compression='gzip')


In [None]:
# Load the seeded dataset that carries the seeding locations
wrong_assignment_seeded_loc = pd.read_pickle(
    wrong_assignment_seeded_loc_path,
    compression='gzip',
)
print(f'Size is {len(wrong_assignment_seeded_loc)}')


In [None]:
# Spot-check: print the values of row 10 at column positions 6 and 8
# NOTE(review): positional column indices depend on the column order of the
# loaded pickle — confirm which columns are meant
print(wrong_assignment_seeded_loc.iloc[10,6])
print(wrong_assignment_seeded_loc.iloc[10,8])

## Create the dataset for DeepBugs
Map the location from seeded bugs to the original files.

In [None]:
# Pickle path of the merged (non-buggy + buggy) dataset for DeepBugs
merged_out_path = os.path.join(benchmark_dir,'dataset_for_deepbugs_wrong_assignments.pkl')

In [None]:
# String form of 'range' so that ranges can be compared with a plain equality test
assignments_data['range_str'] = assignments_data['range'].apply(str)
wrong_assignment_seeded_loc['range_str'] = wrong_assignment_seeded_loc['range'].apply(str)

In [None]:
def get_correspondig_buggy_row(row):
    """Find the row of ``assignments_data`` that a seeded row was derived from.

    A candidate must share the seeded row's 'src' and 'range_str' while its
    concatenated 'lhs' + 'rhs' text differs from the seeded row's.

    :param row: a seeded row providing 'src', 'range_str', 'lhs' and 'rhs'
    :return: the index label of the first matching row, or -1 if none matches
    """
    seeded_signature = str(row['lhs']) + str(row['rhs'])
    candidates = assignments_data[assignments_data['src'] == row['src']]
    for row_label, candidate in candidates.iterrows():
        if candidate['range_str'] != row['range_str']:
            continue
        if str(candidate['lhs']) + str(candidate['rhs']) != seeded_signature:
            return row_label
    return -1


In [None]:
# JSON cache of, per seeded row, the index of its matching row in
# assignments_data (-1 when there is none)
corresponding_row_file_path = os.path.join(benchmark_dir, 'wrong_assignment_correct_rows.json')
cor_row = []

if not Path(corresponding_row_file_path).is_file():
    rows_iter = [row for _, row in wrong_assignment_seeded_loc.iterrows()]

    # Use half of the cores but never fewer than one worker: on a single-core
    # machine cpu_count()//2 is 0 and Pool(0) raises ValueError
    with Pool(max(1, cpu_count() // 2)) as p:
        with tqdm(total=len(rows_iter)) as pbar:
            pbar.set_description_str(desc="Extracting location", refresh=False)
            # imap keeps the input order but yields results as they complete,
            # so the bar shows real progress; map would return only once
            # everything was done
            for rw_num in p.imap(get_correspondig_buggy_row, rows_iter, 10):
                cor_row.append(rw_num)
                pbar.update()
            p.close()
            p.join()

    with open(corresponding_row_file_path, 'w+') as f:
        json.dump(cor_row, f)
else:
    with open(corresponding_row_file_path, 'r') as f:
        print(f'Reading {f.name}')
        cor_row = json.load(f)


In [None]:
# Attach the matching row of assignments_data to every seeded row and keep
# only the seeded rows for which a match was found (-1 marks "no match")
wrong_assignment_seeded_loc['corrsp_row'] = cor_row
has_match = wrong_assignment_seeded_loc['corrsp_row'] != -1
wrong_assignment_seeded_loc = wrong_assignment_seeded_loc[has_match]
len(wrong_assignment_seeded_loc)

In [None]:
# Remove the helper column 'range_str' (used above for row matching) from the seeded rows
wrong_assignment_seeded_loc = wrong_assignment_seeded_loc.drop(columns=['range_str'])

In [None]:
# Label every seeded row with probability_that_incorrect = 1 (these become the 'buggy' examples)
wrong_assignment_seeded_loc['probability_that_incorrect'] = 1

In [None]:
# Build the non-buggy counterpart of every seeded row from assignments_data
non_buggy_rows = []
for _, rw in tqdm(wrong_assignment_seeded_loc.iterrows(), total=len(wrong_assignment_seeded_loc)):
    # 'corrsp_row' stores the index *label* of the matching row (it comes from
    # Series.name in get_correspondig_buggy_row), so look it up by label with
    # .loc; positional .iloc is only equivalent while the index is 0..n-1
    r = assignments_data.loc[rw['corrsp_row']].to_dict()
    r['probability_that_incorrect'] = 0
    # Keep the seeded row's 'file' so both rows of a pair refer to the same seeding
    r['file'] = rw['file']
    non_buggy_rows.append(r)
non_buggy = pd.DataFrame(non_buggy_rows)


In [None]:
# Buggy examples: the seeded rows without the bookkeeping column 'corrsp_row'
buggy = wrong_assignment_seeded_loc.drop(columns= ['corrsp_row'])

In [None]:
# Stack the non-buggy rows followed by the buggy rows under a fresh index
merged = pd.concat([non_buggy, buggy], ignore_index=True)
print(f"Size of dataset={len(merged)}")
print(f"Writing to {merged_out_path}")
# Keyword argument: newer pandas releases deprecate positional arguments
# other than the path for to_pickle
merged.to_pickle(merged_out_path, compression='gzip')

In [None]:
# Reload the merged dataset and split it again by label
merged = pd.read_pickle(merged_out_path, compression='gzip')
labels = merged['probability_that_incorrect']
buggy = merged[labels == 1]
non_buggy = merged[labels == 0]

In [None]:
# Spot-check: show the row at position 10 of the non-buggy and of the buggy examples
print(non_buggy.iloc[10])
print(buggy.iloc[10])

In [None]:
# Materialise both halves as lists of rows; they are paired by position
buggy_iter = [pair[1] for pair in buggy.iterrows()]
nbuggy_iter = [pair[1] for pair in non_buggy.iterrows()]

# One [buggy, non-buggy] pair of plain dicts per position
dataset = [
    [bg.to_dict(), nbg.to_dict()]
    for bg, nbg in tqdm(zip(buggy_iter, nbuggy_iter), desc='creating dataset', total=len(buggy_iter))
]

In [None]:
# Flatten the pairs (buggy first, then its non-buggy counterpart) into one
# frame and remove the helper column 'range_str'
dataset_df = pd.DataFrame(
    [example for pair in dataset for example in pair]
).drop(columns=['range_str'])
dataset_df

In [None]:
# For every example, read 'seeding_pattern_url' from the JSON file whose path
# is the part of 'file' before the first ':' (stripped) with 'on' appended
dataset_df['seeding_url'] = dataset_df['file'].apply(
    lambda location: read_json_file('../' + location.split(':')[0].strip() + 'on')['seeding_pattern_url']
)

In [None]:
# Replace the pickle at merged_out_path with the pair-ordered frame that also
# carries 'seeding_url'. Compression is passed by keyword because newer pandas
# releases deprecate positional arguments other than the path for to_pickle.
dataset_df.to_pickle(merged_out_path, compression='gzip')

In [None]:
def write_json(content, out_file):
    """Serialise ``content`` as JSON into ``out_file``, replacing any previous content.

    :param content: JSON-serialisable object to store
    :param out_file: path of the file to write
    """
    with open(out_file, 'w+') as sink:
        # Announce the target once the file has been opened successfully
        print(f'Writing to {sink.name}')
        json.dump(content, sink)

In [None]:
# Store all [buggy, non-buggy] pairs as JSON
write_json(dataset,os.path.join(benchmark_dir, 'full_dataset_wrong_assignment.json'))

# Select only those seeded bugs that are present in the training patterns

Use only the 'training' change patterns as mentioned in the paper. First read all
change patterns and then split them 80-20.

Next, select only those that conform to our change pattern selection.

In [None]:
# Load every bug seeding change pattern
# NOTE(review): read_json_file returns {} when the file is missing or cannot
# be parsed, in which case 0 patterns are reported here — check the path then
all_change_patterns = read_json_file(os.path.join(benchmark_dir, 'bug_seeding_patterns_for_semantic_seeding.json'))
print(f'Found {len(all_change_patterns)} patterns')

In [None]:
# The first 80% of the patterns (rounded down) are used for training, the rest for validation
l_len = (len(all_change_patterns) * 80) // 100
tr_patterns = all_change_patterns[:l_len]
val_patterns = all_change_patterns[l_len:]
print(f'Number of training patterns = {len(tr_patterns)}, Number of validation patterns = {len(val_patterns)}')

We do not use the validation patterns here. Rather we will use them as examples of real bugs that *DeepBugs*
will try to find.

So now select only those seeded bugs that have been seeded using a *url* present in the training patterns.

In [None]:
# Turn the training patterns into a DataFrame and collect the distinct values
# of its 'url' column; the set is used below for membership tests
tr_patterns = pd.DataFrame(tr_patterns)
tr_urls = set(tr_patterns['url'])
print(f'There exists {len(tr_urls)} unique urls')

In [None]:
# Reload the stored dataset and split it by label
merged = pd.read_pickle(merged_out_path, compression='gzip')
labels = merged['probability_that_incorrect']
buggy = merged[labels == 1]
non_buggy = merged[labels == 0]

# Rows of both halves are paired by position
buggy_iter = [pair[1] for pair in buggy.iterrows()]
nbuggy_iter = [pair[1] for pair in non_buggy.iterrows()]

# Keep a pair only when both of its rows were seeded with a training url
dataset = [
    [bg.to_dict(), nbg.to_dict()]
    for bg, nbg in tqdm(zip(buggy_iter, nbuggy_iter), desc='creating dataset only from training', total=len(buggy_iter))
    if bg['seeding_url'] in tr_urls and nbg['seeding_url'] in tr_urls
]

Now write the filtered dataset

In [None]:
# Every entry of `dataset` is a [buggy, non-buggy] pair, hence the factor 2
# NOTE(review): this writes to the same 'full_dataset_wrong_assignment.json'
# that was written earlier with the unfiltered pairs, replacing it — confirm
# that this is intended
print(f'There contains {len(dataset)*2} examples after filtering. The original contained {len(dataset_df)} examples')
write_json(dataset,os.path.join(benchmark_dir, 'full_dataset_wrong_assignment.json'))

### For artificial seeding during DeepBugs training, we use the complete dataset

In [None]:
# Every row of assignments_data as a plain dict, in row order
complete_dataset_no_seeded_included = [
    rw.to_dict()
    for _, rw in tqdm(assignments_data.iterrows(), total=len(assignments_data))
]

In [None]:
# Store the complete list of assignments_data rows as JSON
write_json(complete_dataset_no_seeded_included,os.path.join(benchmark_dir, 'full_dataset_wrong_assignment_no_seeded_included.json'))

## Estimate the number of files where bugs were not seeded
This is only needed if I need to re-run bug seeding

In [None]:
# File part of every 'src' entry: the text before the first ':' without surrounding whitespace
files_containing_assignments = {
    location.split(':')[0].strip() for location in assignments_data['src']
}
files_already_seeded = {
    location.split(':')[0].strip() for location in dataset_df['src']
}

# Files that contain assignments but do not occur in the seeded dataset
files_not_seeded = list(files_containing_assignments.difference(files_already_seeded))
not_seeded_out_path = os.path.join(benchmark_dir,'files_containing_assignments_not_seeded.json')
with open(not_seeded_out_path,'w+') as f:
    print(f'Writing to {f.name}')
    json.dump(files_not_seeded,f)

In [None]:
# Report the sizes of the three file collections computed above
print(f'Files containing assignments {len(files_containing_assignments)}')
print(f'Files already seeded {len(files_already_seeded)}')
print(f'Files not yet seeded {len(files_not_seeded)}')