In [1]:
import os
from os import path
import pandas as pd
import numpy as np
import re
import math
import random

In [2]:
DATA_DIR = path.join('..', 'data', 'snippets')

In [3]:
def load_set(name, sample=1.0):
    labels = {}
    not_letter = re.compile('[^a-zA-Z]')
    with open(path.join(DATA_DIR, f'{name}_metrics.csv'), 'r') as f:
        f.readline()
        for line in f:
            sample_id, _, values = line.split(',', maxsplit=2)
            values = list(map(float, values.split(',')))
            if any(math.isnan(x) or math.isinf(x) for x in values):
                continue
            labels[sample_id] = np.array(values)
    snippets = []
    tokens = []
    metrics = []
    for sample_id, values in labels.items():
        p = path.join(DATA_DIR, name, sample_id + '.java')
        if path.exists(p) and random.random() < sample:
            with open(p, 'r', encoding='utf-8') as f:
                content = f.read()
            snippets.append(content)
            metrics.append(values)
            tokens.append(not_letter.sub(' ', content).lower().split())
    return snippets, tokens, metrics

In [4]:
val_snippets, val_tokens, val_metrics = load_set('val')

In [5]:
test_snippets, test_tokens, test_metrics = load_set('test')

In [6]:
train_snippets, train_tokens, train_metrics = load_set('train', sample=0.2)

In [7]:
from collections import Counter
import numpy as np

In [8]:
res = Counter()
for token_list in train_tokens:
    for token in token_list:
        res[token] += 1

In [9]:
ids_dict = {p[0]:i for i, p in enumerate(res.most_common(2000))}

In [10]:
val_bows = np.zeros((len(val_tokens), len(ids_dict)))
test_bows = np.zeros((len(test_tokens), len(ids_dict)))
train_bows = np.zeros((len(train_tokens), len(ids_dict)))

In [11]:
for i, tokens_list in enumerate(train_tokens):
    for token in tokens_list:
        if token in ids_dict:
            train_bows[i][ids_dict[token]] += 1

In [12]:
for i, tokens_list in enumerate(test_tokens):
    for token in tokens_list:
        if token in ids_dict:
            test_bows[i][ids_dict[token]] += 1

In [13]:
for i, tokens_list in enumerate(val_tokens):
    for token in tokens_list:
        if token in ids_dict:
            val_bows[i][ids_dict[token]] += 1

In [14]:
operators = {'{': 0, '}': 1, '(': 2, ')': 3, '+': 4, '-': 5, '*': 6, '/': 7, '=': 8, ';': 9}
val_operators = np.zeros((len(val_tokens), len(operators))) 
test_operators = np.zeros((len(test_tokens), len(operators))) 
train_operators = np.zeros((len(train_tokens), len(operators))) 

In [15]:
for i, snippet in enumerate(train_snippets):
    for op, op_id in operators.items():
        train_operators[i][op_id] = snippet.count(op)

In [16]:
for i, snippet in enumerate(val_snippets):
    for op, op_id in operators.items():
        val_operators[i][op_id] = snippet.count(op)

In [17]:
for i, snippet in enumerate(test_snippets):
    for op, op_id in operators.items():
        test_operators[i][op_id] = snippet.count(op)

In [18]:
train_X = np.concatenate([train_operators, train_bows], axis=1)
val_X = np.concatenate([val_operators, val_bows], axis=1)
test_X = np.concatenate([test_operators, test_bows], axis=1)

In [19]:
train_Y = np.stack(train_metrics)
val_Y = np.stack(val_metrics)
test_Y = np.stack(test_metrics)

In [20]:
train_Y_norm = (train_Y - train_Y.mean(axis=0)) / train_Y.std(axis=0)
val_Y_norm = (val_Y - val_Y.mean(axis=0)) / val_Y.std(axis=0)
test_Y_norm = (test_Y - test_Y.mean(axis=0)) / test_Y.std(axis=0)

In [21]:
train_lens = np.array(list(map(len, train_snippets))).reshape(-1, 1)
val_lens = np.array(list(map(len, val_snippets))).reshape(-1, 1)
test_lens = np.array(list(map(len, test_snippets))).reshape(-1, 1)

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

In [23]:
def evaluate(model, train_X, train_Y, test_X, test_Y, metric):
    print('Train...')
    model.fit(train_X, train_Y)
    print('Predict...')
    test_predictions = model.predict(test_X)
    print('Calculate score...')
    return metric(test_Y, test_predictions), metric(test_Y, test_predictions, multioutput='raw_values')

In [24]:
evaluate(LinearRegression(), train_lens, train_Y_norm, test_lens, test_Y_norm, r2_score)

Train...
Predict...
Calculate score...


(0.44208609104798957,
 array([7.61068762e-02, 6.94005355e-01, 2.80624462e-01, 3.34203495e-01,
        1.40746686e-02, 8.95268345e-01, 2.43567901e-01, 2.14855650e-01,
        2.65480009e-01, 2.51361498e-01, 2.58665038e-02, 1.49099668e-01,
        1.25137254e-01, 5.27925782e-01, 3.30161177e-01, 7.70330842e-04,
        1.51678772e-01, 5.21410517e-01, 6.64248664e-01, 6.98648390e-01,
        7.38339741e-01, 5.41101590e-01, 7.00244957e-01, 3.28643120e-01,
        8.18382852e-01, 6.60202653e-01, 8.33354291e-01, 5.79048775e-01,
        3.35614765e-01, 6.30580112e-02, 8.18712988e-01, 8.48579991e-01,
        8.59061951e-01]))

In [27]:
evaluate(Lasso(), train_X, train_Y_norm, test_X, test_Y_norm, r2_score)

Train...
Predict...
Calculate score...


(0.5104502598142987,
 array([0.0884226 , 0.82122059, 0.29244093, 0.49088157, 0.01375514,
        0.89570249, 0.29213439, 0.38420851, 0.30441484, 0.30778025,
        0.02112639, 0.18652014, 0.5963828 , 0.5976969 , 0.50491976,
        0.00254533, 0.16826764, 0.57438667, 0.70823153, 0.72488866,
        0.74878846, 0.59321856, 0.70444302, 0.59358835, 0.82424484,
        0.72234906, 0.85329169, 0.64943751, 0.53277937, 0.06461932,
        0.84175017, 0.86670749, 0.8737136 ]))

In [26]:
evaluate(RandomForestRegressor(n_jobs=-1), train_X, train_Y_norm, test_X, test_Y_norm, r2_score)

Train...
Predict...
Calculate score...


(0.7598032800360006,
 array([0.6963369 , 0.91246132, 0.64200707, 0.71207062, 0.29515852,
        0.9245143 , 0.80669274, 0.71795378, 0.85827794, 0.53334605,
        0.41138909, 0.3589164 , 0.70638757, 0.86520222, 0.65000299,
        0.13660258, 0.92235492, 0.84291463, 0.85484911, 0.86680625,
        0.8869722 , 0.80884575, 0.86673574, 0.7814857 , 0.89373093,
        0.83395654, 0.91541879, 0.89652533, 0.74700369, 0.96306838,
        0.9149091 , 0.92246966, 0.92814145]))

In [25]:
evaluate(DecisionTreeRegressor(), train_X, train_Y_norm, test_X, test_Y_norm, r2_score)

Train...
Predict...
Calculate score...


(0.6217559487830768,
 array([ 0.62787301,  0.84426504,  0.40664615,  0.47806797, -0.31294398,
         0.88671239,  0.59336868,  0.41877129,  0.71912917,  0.28573883,
        -0.00213365, -0.15643778,  0.67081743,  0.72573648,  0.4586186 ,
         0.11504872,  0.83552408,  0.70696054,  0.76622208,  0.78740209,
         0.82016121,  0.71739325,  0.78260562,  0.79495244,  0.81686929,
         0.81561499,  0.85921613,  0.79389082,  0.70924798,  0.911963  ,
         0.88137491,  0.8732654 ,  0.88600411]))