In [1]:
!mkdir -p /scratch/sagarsj42/torch-cache
!mkdir -p /scratch/sagarsj42/transformers

import os
os.chdir('/scratch/sagarsj42')
os.environ['TORCH_HOME'] = '/scratch/sagarsj42/torch-cache'
os.environ['TRANSFORMERS_CACHE'] = '/scratch/sagarsj42/transformers'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [30]:
import re
from functools import partial

import numpy as np
from datasets import DatasetDict, load_dataset, load_metric
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

In [3]:
TRAIN_DATA_FILE = 'irse.train.csv'
DEV_DATA_FILE = 'irse.dev.csv'
TEST_DATA_FILE = 'irse.test.csv'
GIVEN_TEST_FILE = 'irse.given-test.csv'

REMOVE_CHARS = False

In [4]:
data_files = {
    'train': TRAIN_DATA_FILE,
    'dev': DEV_DATA_FILE,
    'test': TEST_DATA_FILE,
    'giventest': GIVEN_TEST_FILE
}
ds = load_dataset('csv', data_files=data_files)

ds

Using custom data configuration default-8718147502a67e46
Found cached dataset csv (/home2/sagarsj42/.cache/huggingface/datasets/csv/default-8718147502a67e46/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 4/4 [00:00<00:00, 380.89it/s]


DatasetDict({
    train: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 5354
    })
    dev: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 595
    })
    test: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 678
    })
    giventest: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label'],
        num_rows: 1001
    })
})

In [5]:
def preprocess_data(sample, remove_chars):
    comment = sample['Comments']
    context = sample['Surrounding Code Context']
    context = context.replace(comment, ' ')
    if remove_chars:
        comment = re.sub('\W+', ' ' ,comment)
        context = re.sub('\W+', ' ' ,context)
    
    return_dict = {
        'comment': comment,
        'context': context
    }

    return return_dict

In [6]:
preprocess_partial = partial(preprocess_data, remove_chars=REMOVE_CHARS)
ds_pp = dict()

ds_pp['train'] = ds['train'].map(preprocess_partial, batched=False)
ds_pp['dev'] = ds['dev'].map(preprocess_partial, batched=False)
ds_pp['test'] = ds['test'].map(preprocess_partial, batched=False)
ds_pp['giventest'] = ds['giventest'].map(preprocess_partial, batched=False)
ds_pp = DatasetDict(ds_pp)

ds_pp

Loading cached processed dataset at /home2/sagarsj42/.cache/huggingface/datasets/csv/default-8718147502a67e46/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-eaf13875048cf4cc.arrow
Loading cached processed dataset at /home2/sagarsj42/.cache/huggingface/datasets/csv/default-8718147502a67e46/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-c92fa421578ddeeb.arrow
Loading cached processed dataset at /home2/sagarsj42/.cache/huggingface/datasets/csv/default-8718147502a67e46/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-b9578dff5d840a1e.arrow
Loading cached processed dataset at /home2/sagarsj42/.cache/huggingface/datasets/csv/default-8718147502a67e46/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-dc5317f3a1302c86.arrow


DatasetDict({
    train: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'comment', 'context'],
        num_rows: 5354
    })
    dev: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'comment', 'context'],
        num_rows: 595
    })
    test: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'comment', 'context'],
        num_rows: 678
    })
    giventest: Dataset({
        features: ['Comments', 'Surrounding Code Context', 'Class', 'label', 'comment', 'context'],
        num_rows: 1001
    })
})

In [7]:
def compute_metrics(predictions, labels, scores=None):
    acc_metric = load_metric('accuracy')
    f1_metric = load_metric('f1')
    mcc_metric = load_metric('matthews_correlation')

    metrics_dict = {
        'accuracy': acc_metric.compute(predictions=predictions, references=labels)['accuracy'],
        'f1_score': f1_metric.compute(predictions=predictions, references=labels)['f1'],
        'matthews_cc': mcc_metric.compute(predictions=predictions, references=labels)['matthews_correlation'],
    }

    if scores is not None:
        roc_metric = load_metric('roc_auc')
        metrics_dict['roc_auc_score'] = roc_metric.compute(prediction_scores=scores, references=labels)['roc_auc']

    return metrics_dict

In [8]:
comment_vectorizer = TfidfVectorizer()
context_vectorizer = TfidfVectorizer()

comment_vectorizer, context_vectorizer

(TfidfVectorizer(), TfidfVectorizer())

In [9]:
comment_train = comment_vectorizer.fit_transform(ds_pp['train']['comment']).toarray()
context_train = context_vectorizer.fit_transform(ds_pp['train']['context']).toarray()
x_train = np.concatenate([comment_train, context_train], axis=1)
y_train = np.array(ds_pp['train']['label'])

comment_train.shape, context_train.shape, x_train.shape, y_train.shape

((5354, 5968), (5354, 12310), (5354, 18278), (5354,))

In [10]:
comment_dev = comment_vectorizer.transform(ds_pp['dev']['comment']).toarray()
context_dev = context_vectorizer.transform(ds_pp['dev']['context']).toarray()
x_dev = np.concatenate([comment_dev, context_dev], axis=1)
y_dev = np.array(ds_pp['dev']['label'])

comment_dev.shape, context_dev.shape, x_dev.shape, y_dev.shape

((595, 5968), (595, 12310), (595, 18278), (595,))

In [11]:
comment_test = comment_vectorizer.transform(ds_pp['test']['comment']).toarray()
context_test = context_vectorizer.transform(ds_pp['test']['context']).toarray()
x_test = np.concatenate([comment_test, context_test], axis=1)
y_test = np.array(ds_pp['test']['label'])

comment_test.shape, context_test.shape, x_test.shape, y_test.shape

((678, 5968), (678, 12310), (678, 18278), (678,))

In [12]:
random_forest = RandomForestClassifier(n_estimators=25)

random_forest

In [13]:
random_forest.fit(x_train, y_train)
o_train = random_forest.predict(x_train)
o_dev = random_forest.predict(x_dev)
o_test = random_forest.predict(x_test)

o_train.shape, o_dev.shape, o_test.shape

((5354,), (595,), (678,))

In [14]:
s_train = random_forest.predict_proba(x_train)[:, -1]
s_dev = random_forest.predict_proba(x_dev)[:, -1]
s_test = random_forest.predict_proba(x_test)[:, -1]

s_train.shape, s_dev.shape, s_test.shape

((5354,), (595,), (678,))

In [15]:
random_forest_metrics = {
    'train': compute_metrics(o_train, y_train, s_train),
    'dev': compute_metrics(o_dev, y_dev, s_dev),
    'test': compute_metrics(o_test, y_test, s_test)
}

random_forest_metrics

  acc_metric = load_metric('accuracy')


{'train': {'accuracy': 0.921180425849832,
  'f1_score': 0.9313374552554506,
  'matthews_cc': 0.8390293338388803,
  'roc_auc_score': 0.9861706718815253},
 'dev': {'accuracy': 0.680672268907563,
  'f1_score': 0.7246376811594203,
  'matthews_cc': 0.34541230407456724,
  'roc_auc_score': 0.6978794925517464},
 'test': {'accuracy': 0.8539823008849557,
  'f1_score': 0.8,
  'matthews_cc': 0.6960400682267076,
  'roc_auc_score': 0.9257721702166146}}

In [50]:
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
svc = SVC(kernel='linear', C=20.0, max_iter=1000, probability=True)
svc_pipe = Pipeline([
    ('scaler', scaler),
    ('svc', svc)
])

svc_pipe

In [51]:
svc_pipe.fit(x_train, y_train)
o_train = svc_pipe.predict(x_train)
o_dev = svc_pipe.predict(x_dev)
o_test = svc_pipe.predict(x_test)

o_train.shape, o_dev.shape, o_test.shape



((5354,), (595,), (678,))

In [52]:
s_train = svc.predict_proba(x_train)[:, 1]
s_dev = svc.predict_proba(x_dev)[:, 1]
s_test = svc.predict_proba(x_test)[:, 1]

s_train.shape, s_dev.shape, s_test.shape

((5354,), (595,), (678,))

In [53]:
svc_metrics = {
    'train': compute_metrics(o_train, y_train, s_train),
    'dev': compute_metrics(o_dev, y_dev, s_dev),
    'test': compute_metrics(o_test, y_test, s_test)
}

svc_metrics

{'train': {'accuracy': 0.6462457975345536,
  'f1_score': 0.5928632846087704,
  'matthews_cc': 0.3829736301722036,
  'roc_auc_score': 0.7060991999053223},
 'dev': {'accuracy': 0.5361344537815126,
  'f1_score': 0.44129554655870445,
  'matthews_cc': 0.15722945406777053,
  'roc_auc_score': 0.6739978817949486},
 'test': {'accuracy': 0.6386430678466076,
  'f1_score': 0.20195439739413681,
  'matthews_cc': 0.23852223686155372,
  'roc_auc_score': 0.8466332021887578}}

In [54]:
mlp = MLPClassifier(hidden_layer_sizes=[1000, 100], activation='relu', 
    solver='adam', learning_rate='constant', learning_rate_init=1e-4, 
    max_iter=10)

mlp

In [55]:
mlp.fit(x_train, y_train)
o_train = mlp.predict(x_train)
o_dev = mlp.predict(x_dev)
o_test = mlp.predict(x_test)

o_train.shape, o_dev.shape, o_test.shape



((5354,), (595,), (678,))

In [62]:
s_train = mlp.predict_proba(x_train)[:, 1]
s_dev = mlp.predict_proba(x_dev)[:, 1]
s_test = mlp.predict_proba(x_test)[:, 1]

s_train.shape, s_dev.shape, s_test.shape

((5354,), (595,), (678,))

In [63]:
mlp_metrics = {
    'train': compute_metrics(o_train, y_train, s_train),
    'dev': compute_metrics(o_dev, y_dev, s_dev),
    'test': compute_metrics(o_test, y_test, s_test)
}

mlp_metrics

{'train': {'accuracy': 0.9133358236832275,
  'f1_score': 0.9224598930481284,
  'matthews_cc': 0.8250006345148544,
  'roc_auc_score': 0.9823797951912385},
 'dev': {'accuracy': 0.6823529411764706,
  'f1_score': 0.7303851640513553,
  'matthews_cc': 0.34637839995966363,
  'roc_auc_score': 0.7686724380079664},
 'test': {'accuracy': 0.7256637168141593,
  'f1_score': 0.5507246376811594,
  'matthews_cc': 0.42404797926599896,
  'roc_auc_score': 0.8178356622801067}}