In [11]:
"""

Created on 21-April-2020
@author Jibesh Patra

"""
import argparse
import os
import sys
from multiprocessing import cpu_count
from pathlib import Path
import torch
from torch.utils.data import DataLoader

### This Jupyter Notebook Specific Setup

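The following configuration is meant only for running this Jupyter notebook. One may use _run_classification.py_ to run the same steps from the command line, in which case the settings are read from command-line arguments instead.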

In [12]:
# Decide where the run-time settings come from.
#   * Inside IPython/Jupyter `get_ipython` exists, so the settings below are hard-coded.
#   * As a plain script the name is undefined (NameError) and the settings are read
#     from the command line through `command_line_args.get_parsed_args`.
running_as_notebook = False
root_dir = './'
try:
    cfg = get_ipython().config
    running_as_notebook = True
except NameError:
    pass

if running_as_notebook:
    from collections import namedtuple

    # The project root is taken to be two directory levels above the current working
    # directory. `os.getcwd()` replaces the IPython-only `!pwd` shell escape so that the
    # cell is plain Python and does not depend on a `pwd` executable being available.
    cur_dir = os.getcwd()
    root_dir = Path(cur_dir).parent.parent.as_posix()

    notebook_args = {
        'batch_size': 128,
        'num_epochs': 15,
        'train': True,
        'pos_dataset': f'{root_dir}/results/positive_examples.pkl',
        'neg_dataset': f'{root_dir}/results/negative_examples.pkl',
        'test': False,
        'test_dataset': f'{root_dir}/results/test_examples.pkl',
        'saved_model': f'{root_dir}/results/saved_models/VarValueClassifierRNN_all_types_17-11-2020--19:06:51_0.89.pt',
        'name': 'nalin',
        'ablation': []  # Possible values --> 'value_as_one_hot', 'var', 'type', 'len', 'shape'
    }
    results_dir = f'{root_dir}/results'
    token_embedding_path = f'{root_dir}/benchmark/python_embeddings.bin'
    positive_examples_dir = f'{root_dir}/results/dynamic_analysis_outputs'
    list_of_types_in_dataset_out_file = f'{root_dir}/results/list_of_types_in_dataset.json'

    # Expose the settings through attribute access (args.batch_size, ...), i.e. the same
    # way the rest of the notebook reads the parsed command-line arguments.
    Args = namedtuple('Args', list(notebook_args))
    args = Args(**notebook_args)
else:
    from command_line_args import get_parsed_args

    args = get_parsed_args(argparse=argparse)
    # Script mode: all paths are relative to the directory the script is started from.
    positive_examples_dir = 'results/dynamic_analysis_outputs'
    token_embedding_path = 'benchmark/python_embeddings.bin'
    list_of_types_in_dataset_out_file = 'results/list_of_types_in_dataset.json'
    results_dir = 'results'

### Dataset utilities

In [13]:
from dataset_utils.data_transformers.AblationTransformer import AblationTransformer
from dataset_utils.data_transformers.ResizeData import ResizeData
from dataset_utils.data_transformers.ValueToCharSequence import ValueToCharSequence
from dataset_utils.data_transformers.fastTextEmbeddingOfVarName import fastTextEmbeddingOfVarName
from dataset_utils.data_transformers.RepresentLen import RepresentLen
from dataset_utils.data_transformers.RepresentShape import RepresentShape
from dataset_utils.data_transformers.OneHotEncodingOfTypes import OneHotEncodingOfType
from dataset_utils.pre_process_dataset import process, write_types_and_frequencies
from read_dataset import get_training_val_dataset, get_test_dataset

### Models

In [14]:
from models.VarValueClassifierRNN import VarValueClassifierRNN

### Configurations

In [15]:
# ---------------------------------------------------------------------------
# Run configuration: derives every setting used by the later cells from `args`
# and from the paths chosen in the setup cell.
# ---------------------------------------------------------------------------
train, test = args.train, args.test
if not (train or test):
    print('Either "training" or "testing" is required')
    sys.exit(1)

batch_size = args.batch_size
num_epochs = args.num_epochs
max_num_of_chars_in_value = 100  # Number of characters in the value part of the assignment
print(f"-- Resizing the values to {max_num_of_chars_in_value} characters during training")

# You may specify your name; it becomes the suffix of the model name.
model_name_suffix = args.name or 'Nalin'
model_name = f'RNNClassifier_{model_name_suffix}'

pos_dataset_file_path = args.pos_dataset
neg_dataset_file_path = args.neg_dataset
test_dataset_file_path = args.test_dataset

# Heuristics for generating negative examples:
#     1. use_dimension: computes various properties on the positive examples and then uses
#        them to generate the negative examples (code adapted from the initial code by MP).
#        Note that it is not among the options selectable in the list below.
#     2. random: only useful when the data contains a single type (e.g. string). The approach
#        simply randomizes the values. The idea is to check whether certain identifiers such
#        as URL are only assigned values having certain properties.
#     3. weighted_random: the default strategy. Refer to the code where it is implemented
#        for further details.
heuristics_for_generating_negative_examples = ['random', 'weighted_random'][1]

# Pre-process the dataset and record the types with their frequency in the dataset.
# This is a one-time task ==>
#     - Remove empty/malformed extracted data
#     - Create negative examples
#     - Create labels for the extracted data (label -> probability of buggy)
# It is skipped whenever testing is requested.
if not test:
    process(positive_examples_dir=positive_examples_dir,
            positive_example_out_file_path=pos_dataset_file_path,
            negative_example_out_file_path=neg_dataset_file_path,
            test_example_out_file_path=test_dataset_file_path,
            heuristics_for_generating_negative_examples=heuristics_for_generating_negative_examples)
    write_types_and_frequencies(positive_example_out_file_path=pos_dataset_file_path,
                                list_of_types_in_dataset_out_file=list_of_types_in_dataset_out_file)

# Embeddings have been learned from ALL python files in the benchmark (~1M files). We could
# successfully extract assignments from some of these python files.
if not os.path.exists(token_embedding_path):
    print(f'Could not read from {token_embedding_path}. \nNeed an embedding path to continue')
    sys.exit(1)

# Anchored at `results_dir` like every other results path, so that it also points into the
# project's results folder when running as a notebook (where `results_dir` is absolute).
# In script mode this still evaluates to 'results/test_examples'.
test_examples_dir = f'{results_dir}/test_examples'

# Testing needs a previously saved model; training does not.
saved_model_path = None
if args.test:
    if not args.saved_model:
        print("A saved model path is needed")
        sys.exit(1)
    saved_model_path = args.saved_model

embedding_dim = 0  # placeholder; overwritten once the embedding transformer has been created
features_to_ablate = args.ablation

device = torch.device(
    'cuda:0' if torch.cuda.is_available() else 'cpu')

# Workaround for debugging on a laptop. Change with the cpu_count of your machine if required
# for debugging data loading, else leave it alone.
num_workers_for_data_loading = cpu_count() if cpu_count() > 20 else 0
# Pinned host memory only helps host-to-GPU copies, so request it only when CUDA is in use.
config = {"num_workers": num_workers_for_data_loading, "pin_memory": device.type == 'cuda'}

# Initialize model and model specific dataset data_transformers
print(f"\n{'-' * 20} Using model '{model_name}' {'-' * 20}")

-- Resizing the values to 100 characters during training
Reading '/home/jibesh/nalin/results/positive_examples.pkl'
Writing to /home/jibesh/nalin/results/list_of_types_in_dataset.json the types and corresponding frequencies

-------------------- Using model 'RNNClassifier_nalin' --------------------


### Data Transformations

In [16]:
# Transformers that work on the value part of an assignment; both are configured with the
# same number of characters.
resize_data = ResizeData(len_of_value=max_num_of_chars_in_value)
value_to_one_hot = ValueToCharSequence(len_of_value=max_num_of_chars_in_value)

# We select only the top 10 types found in the types/frequencies file.
one_hot_encoding_of_type = OneHotEncodingOfType(
    types_in_dataset_file_path=list_of_types_in_dataset_out_file,
    max_types_to_select=10,
)
size_of_type_encoding = len(one_hot_encoding_of_type.one_hot_init)

# Embedding of the variable name; its dimensionality is handed to the model below.
var_name_fastText_embd = fastTextEmbeddingOfVarName(embedding_path=token_embedding_path)
embedding_dim = var_name_fastText_embd.embedding_dim

len_repr = RepresentLen()
shape_repr = RepresentShape()

# Order in which the transformers are listed for the dataset readers.
data_transformations = [
    resize_data,  # must be always the first transformation
    var_name_fastText_embd,
    value_to_one_hot,
    one_hot_encoding_of_type,
    len_repr,
    shape_repr,
]

model = VarValueClassifierRNN(
    model_name=model_name,
    embedding_dim=embedding_dim,
    size_of_value=resize_data.len_of_value,
    num_of_characters_in_alphabet=value_to_one_hot.nbs_chars,
)

assert model is not None, "Initialize a model to run training/testing"
# Kept as the last expression so that the notebook displays the model architecture.
model.to(device)



VarValueClassifierRNN(
  (criterion): BCELoss()
  (RNN_over_value): GRU(101, 100, batch_first=True, bidirectional=True)
  (convNetVal): Sequential(
    (0): Conv1d(101, 100, kernel_size=(100,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=612, out_features=150, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=150, out_features=1, bias=True)
  )
  (sigmoid): Sigmoid()
)

### Ablation

In [17]:
# Ablation study: when at least one feature is listed, an AblationTransformer is appended
# after all other transformations.
if len(features_to_ablate) > 0:
    ablation_transformer = AblationTransformer(
        features_to_ablate=features_to_ablate,
    )
    print(f"## Not using features --> {features_to_ablate} ##")
    data_transformations += [ablation_transformer]

### Training

In [18]:
if train:
    print(f"{'-' * 15} Reading dataset for training {'-' * 15}")

    # Read the positive and negative examples and split them into training/validation sets.
    training_dataset, validation_dataset = get_training_val_dataset(
        positive_examples_dataset_file_path=pos_dataset_file_path,
        negative_examples_dataset_file_path=neg_dataset_file_path,
        all_transformations=data_transformations,
        nb_examples=-1,
    )

    # Both loaders are built with identical options: shuffled batches, with a trailing
    # batch smaller than `batch_size` dropped.
    loader_options = dict(batch_size=batch_size, shuffle=True, drop_last=True, **config)
    train_data = DataLoader(dataset=training_dataset, **loader_options)
    validation_data = DataLoader(dataset=validation_dataset, **loader_options)

    model.run_epochs(
        training_data=train_data,
        validation_data=validation_data,
        num_epochs=num_epochs,
        results_dir=results_dir,
    )

--------------- Reading dataset for training ---------------
Few values from the dataset -->


Unnamed: 0,file,var,value,line,type,len,shape,p_buggy,orig_type
0,nb_921790,seq1,AACC,49,str,4,-1,0.0,str
1,nb_921790,seq1,100,49,int,-1,-1,1.0,str
2,nb_893796,total,0,19,int,-1,-1,0.0,int
3,nb_893796,total,[-0.08338161 -0.07472355 -0.0661398 -0.057629...,19,ndarray,21,"(21,)",1.0,int
4,nb_401647,colormap,['red' 'lime' 'black'],145,ndarray,3,"(3,)",0.0,ndarray
...,...,...,...,...,...,...,...,...,...
980659,nb_326689,ourlist,timeseries/time-scales/,387,str,23,-1,1.0,list
980660,nb_668102,data,[[ 0. 0. 5. 13. 9. 1. 0. 0. 0. 0.]\n [...,55,ndarray,1797,"(1797, 64)",0.0,ndarray
980661,nb_668102,data,62.38339475586176,55,float,-1,-1,1.0,ndarray
980662,nb_707227,varX,0,81,int,-1,-1,0.0,int



Using 980664 examples for training and validation which contains 490332 positive examples & 490332 negative examples



HBox(children=(FloatProgress(value=0.0, description='Running Epochs', max=15.0, style=ProgressStyle(descriptio…

Batch  2400/ 6129 |  t_loss        0.52 | 
Batch  4800/ 6129 |  t_loss        0.49 | 
Batch  1200/ 1532 |  v_loss        0.37 | 
--------------------------------------------------------------------------------------------------------------
Epoch      1/  15 | Training loss= 0.48 | Validation loss= 0.37 | lr=1.00 | fscore=0.84 | took=231.75 seconds
--------------------------------------------------------------------------------------------------------------
Batch  2400/ 6129 |  t_loss        0.45 | 
Batch  4800/ 6129 |  t_loss        0.45 | 
Batch  1200/ 1532 |  v_loss        0.36 | 
--------------------------------------------------------------------------------------------------------------
Epoch      2/  15 | Training loss= 0.44 | Validation loss= 0.36 | lr=1.00 | fscore=0.83 | took=231.49 seconds
--------------------------------------------------------------------------------------------------------------
Batch  2400/ 6129 |  t_loss        0.43 | 


Exception in thread Thread-13:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jibesh/.local/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "/home/jibesh/.local/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/local/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/usr/local/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.c




KeyboardInterrupt: 

### Testing

In [None]:
if test:
    print(f"{'-' * 15} Reading dataset for testing {'-' * 15}")
    test_dataset = get_test_dataset(
        test_examples_dir=test_examples_dir,
        results_dir=results_dir,
        all_transformations=data_transformations,
        dataset_out_file=test_dataset_file_path)
    # Order is preserved (no shuffling, no dropped batch) so that the predictions can be
    # attached to the rows of `test_dataset.data` one-to-one below.
    batched_test_dataset = DataLoader(
        dataset=test_dataset, batch_size=batch_size, shuffle=False, drop_last=False, **config)
    model.load_model(path_to_saved_model=saved_model_path)
    predictions = model.run_testing(data=batched_test_dataset)

    # NOTE: this is the dataset's own data object, so the new column and the sorting below
    # are also visible through `test_dataset.data`.
    test_data_with_predictions = test_dataset.data
    test_data_with_predictions['predicted_p_buggy'] = predictions

    predicted_outfile_path = os.path.join(results_dir,
                                          f'prediction_results/{Path(test_dataset_file_path).stem}_predictions.pkl')
    # The output folder is not created anywhere else in this notebook; without it the
    # write below fails on a fresh results directory.
    os.makedirs(os.path.dirname(predicted_outfile_path), exist_ok=True)
    print(f"Writing to '{predicted_outfile_path}'")

    # Most suspicious assignments (highest predicted probability of being buggy) first.
    test_data_with_predictions.sort_values('predicted_p_buggy', ascending=False, inplace=True)
    test_data_with_predictions.reset_index(drop=True, inplace=True)

    # NOTE(review): the file is gzip-compressed although it ends in '.pkl'; readers
    # presumably need to pass compression='gzip' explicitly when loading it - confirm.
    test_data_with_predictions.to_pickle(path=predicted_outfile_path, compression='gzip')