In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

import multiprocessing as mp
from pathos.multiprocessing import ProcessingPool
from pathlib import Path
from sklearn.metrics import r2_score as r2, mean_squared_error as mse
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

import torch
import torch.nn as nn
from torch.utils.data import IterableDataset, Dataset, DataLoader
from torch.optim import Adam, SGD
from torch.nn import functional as F
import random
import os

In [2]:
DATASET_PATH = Path('../samples')
EMBEDDINGS_PATH =  DATASET_PATH / 'embeddings'
TARGETS_PATH = Path('../target_metrics/snippets')

train_metrics = pd.read_csv(TARGETS_PATH / 'train_metrics.csv')
#train = pd.read_csv(TARGETS_PATH / 'train.csv')

target_metrics = set(train_metrics.columns) - set(['id', 'shortMethodName'])
print('Target metrics', target_metrics)
print('Total targets', len(target_metrics))

Target metrics {'methodParenthesizedExpsQty', 'QCPReliability', 'HalsteadVocabularyMethod', 'methodSubClassesQty', 'methodVariablesQty', 'ControlDensity', 'methodComparisonsQty', 'HalsteadVolumeMethod', 'HalsteadEffortMethod', 'methodMathOperationsQty', 'methodRfc', 'HalsteadLengthMethod', 'methodNumbersQty', 'methodUniqueWordsQty', 'QCPMaintainability', 'methodLoc', 'methodWmc', 'DesignComplexity', 'methodLambdasQty', 'methodStringLiteralsQty', 'methodAssignmentsQty', 'methodLoopQty', 'HalsteadDifficultyMethod', 'methodCbo', 'methodParametersQty', 'methodAnonymousClassesQty', 'CyclomaticComplexity', 'QCPCorrectness', 'EssentialCyclomaticComplexity', 'methodMaxNestedBlocks', 'HalsteadBugsMethod', 'methodTryCatchQty', 'methodReturnQty'}
Total targets 33


In [4]:
def get_raw_data(data_type):
    emb_path = EMBEDDINGS_PATH / data_type
    chunks = np.array_split(os.listdir(emb_path), mp.cpu_count())
    
    def process_chunk(files):
        data = {}
        
        for file in files:
            file_path = emb_path / file
            with open(file_path, 'r') as input_file:
                lines = input_file.readlines()

            raw_string = ''.join(lines).replace('\n', '').replace('[', '').replace(']', '')

            if 'None' not in raw_string:   
                embedding = np.fromstring(raw_string, dtype=np.float, sep='   ')
                #embedding = torch.FloatTensor(embedding)
            else:
                embedding = None
                
            data[file] = embedding
            
        return data

    with ProcessingPool(mp.cpu_count()) as pool:
        proc_chunks = list(pool.map(process_chunk, chunks))
    
    merged = {}
    for dict_chunk in proc_chunks:
        merged = {**merged, **dict_chunk}
    return merged

CPU times: user 4 µs, sys: 4 µs, total: 8 µs
Wall time: 16.7 µs


In [5]:
class FullDataset(Dataset):
    
    def __init__(self, raw_data, data_type, target_metric):
        emb_path = EMBEDDINGS_PATH / data_type
        path = Path('../samples/') / data_type
        self.file2anonfile = {}
        for idx, file in enumerate(os.listdir(path)):
            self.file2anonfile[file] = f'{data_type}_{idx}'
        
        self.data = [(file, embedding) for file, embedding in raw_data.items() if embedding is not None]
        
        target_df = pd.read_csv(TARGETS_PATH / f'{data_type}_metrics.csv')
        file_metric_df = target_df[['id', target_metric]]
        self.targets = file_metric_df.set_index('id').to_dict()[target_metric]
    
    def __getitem__(self, index):
        file, embedding = self.data[index]
        anon_file = self.file2anonfile[file]
        target = self.targets[anon_file]
        
        return embedding, target

    def __len__(self):
        return len(self.data)

In [6]:
def collect_data(dataset_full):
    X, y = [], []

    for index in range(len(dataset_full)):
        emb, target = dataset_full[index]
        
        if target != target: # nan check
            continue
        
        X.append(emb)
        y.append(target)

    X = np.array(X)
    y = np.array(y)
    y = (y - np.mean(y)) / np.std(y)
    return torch.FloatTensor(X), torch.FloatTensor(y)

In [7]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.mlp_net = nn.Sequential(
            nn.Linear(384, 1), 
            #nn.ReLU()
            #nn.Sigmoid()
        )

    def forward(self, input_embedding):
        return self.mlp_net(input_embedding)

In [8]:
train_raw_data = get_raw_data('train')
val_raw_data = get_raw_data('val')
test_raw_data = get_raw_data('test')

In [14]:
device = torch.device("cuda:1")

def evaluate(model, features, targets, log_output):
    batch_embeddings = features.to(device)
    batch_targets = targets.float() 

    output = model(batch_embeddings).view(-1).cpu().detach().numpy()
    mse_error = mse(batch_targets, output)
    r2_error = r2(batch_targets, output)
        
    print(f'MSE {mse_error}', f'R2 {r2_error}')
    with open('test_results.txt', 'a') as logs:
        logs.write(f'MLP {log_output} MSE {mse_error} R2 {r2_error}\n')
        

for target_metric in target_metrics:
    print("Evaluating", target_metric)
    with open('test_results.txt', 'a') as logs:
        logs.write(f'MLP Evaluating {target_metric}\n')
    train_full = FullDataset(train_raw_data, 'train', target_metric)
    val_full = FullDataset(val_raw_data, 'val', target_metric)
    test_full = FullDataset(test_raw_data, 'test', target_metric)
    
    X_train, y_train = collect_data(train_full)
    X_val, y_val = collect_data(val_full)
    X_test, y_test = collect_data(test_full)
    mlp = MLP()
    
    mlp.to(device)
    optimizer = SGD(mlp.parameters(), lr=1e-1) # or Adam
    start_time = datetime.now()
    total_loss = 0.
    total_epochs = 0
    for epoch in range(1000):

        batch_embeddings = X_train.to(device)
        batch_targets = y_train.float().to(device)
        output = mlp(batch_embeddings).view(-1)
        loss = F.mse_loss(output, batch_targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        total_loss += loss.item()
        total_epochs += 1

        if (epoch + 1) % 100 == 0 or epoch == 0:
            print('validating...')
            evaluate(mlp, X_val, y_val, 'validate')
            print(f'EPOCH {epoch + 1} avg mse loss', total_loss / total_epochs)
            total_loss = 0.
            total_epochs = 0
    break
    print('testing', datetime.now() - start_time)
    evaluate(mlp, X_test, y_test, 'test')
    with open('test_results.txt', 'a') as logs:
        logs.write(f'==================\n')

Evaluating methodParenthesizedExpsQty
validating...
MSE 1.0852329730987549 R2 -0.08523298852383165
EPOCH 1 avg mse loss 1.0919668674468994
validating...
MSE 1.012900710105896 R2 -0.012900664311151555
EPOCH 100 avg mse loss 1.0336980145386976
validating...
MSE 1.0065199136734009 R2 -0.006519919491435822
EPOCH 200 avg mse loss 1.010066397190094
validating...
MSE 1.0046371221542358 R2 -0.0046371677232848185
EPOCH 300 avg mse loss 1.0061129331588745
validating...
MSE 1.0037156343460083 R2 -0.0037156054949292194
EPOCH 400 avg mse loss 1.0044524836540223
validating...
MSE 1.0031365156173706 R2 -0.0031365148708417667
EPOCH 500 avg mse loss 1.0034582602977753
validating...
MSE 1.0027213096618652 R2 -0.002721328196156003
EPOCH 600 avg mse loss 1.0027616226673126
validating...
MSE 1.0024009943008423 R2 -0.002400983742018248
EPOCH 700 avg mse loss 1.0022321486473083
validating...
MSE 1.002143383026123 R2 -0.002143374929086672
EPOCH 800 avg mse loss 1.001810418367386
validating...
MSE 1.0019313097

In [15]:
x = X_train[:10].to(device)
y = y_train[:10].to(device)

print(y)
print(mlp(x))

tensor([-0.1612,  0.3412, -0.1612, -0.1612, -0.1612, -0.1612, -0.1612, -0.1612,
        -0.1612, -0.1612], device='cuda:1')
tensor([[ 0.0488],
        [ 0.1116],
        [ 0.0076],
        [-0.0244],
        [ 0.0676],
        [ 0.0484],
        [ 0.0487],
        [ 0.0037],
        [ 0.0025],
        [-0.0021]], device='cuda:1', grad_fn=<AddmmBackward>)


In [11]:
#train_raw_data = get_raw_data('train')
#val_raw_data = get_raw_data('val')
#test_raw_data = get_raw_data('test')

In [12]:
# def create_xgb(X_train, y_train):
#     final_params = {}
#     final_params['objective'] = 'reg:squarederror'
#     final_params['tree_method'] = 'gpu_hist'
#     final_params['learning_rate'] = 0.01
#     final_params['n_estimators'] = 500
#     # final_params['reg_alpha'] = 1
#     # final_params['subsample'] = 0.66
#     # final_params['colsample_bytree'] = 0.96
#     # final_params['max_depth'] = 10
#     # final_params['min_child_weight'] = 0.25

#     gbdt = xgb.XGBRegressor(**final_params)
#     gbdt.fit(X_train, y_train, verbose=True)
#     return gbdt

# def create_rf(X_train, y_train):
#     clf = RandomForestRegressor(n_estimators=500, n_jobs=-1)
#     clf.fit(X_train, y_train)
#     return clf

# def create_lasso(X_train, y_train):
#     clf = Lasso()
#     clf.fit(X_train, y_train)
#     return clf

# def evaluate_classic_model(model, str_model, features, targets, log_output):
#     output = model.predict(features)
#     mse_error = mse(targets, output)
#     r2_error = r2(targets, output)
        
#     print(f'MSE {mse_error}', f'R2 {r2_error}')
#     with open('test_results_others.txt', 'a') as logs:
#         logs.write(f'{str_model} {log_output} MSE {mse_error} R2 {r2_error}\n')

# for target_metric in target_metrics:
#     print("Evaluating", target_metric)
#     with open('test_results_others.txt', 'a') as logs:
#         logs.write(f'Evaluating {target_metric}\n')
#     train_full = FullDataset(train_raw_data, 'train', target_metric)
#     #val_full = FullDataset('val', target_metric)
#     test_full = FullDataset(test_raw_data, 'test', target_metric)
    
#     X_train, y_train = collect_data(train_full)
#     #X_val, y_val = collect_data(val_full)
#     X_test, y_test = collect_data(test_full)
#     #print('train')
#     xgb = create_xgb(X_train, y_train)
#     rf = create_rf(X_train, y_train)
#     lasso = create_lasso(X_train, y_train)
    
#     print('testing', datetime.now() - start_time)
#     for model, str_model in zip([xgb, rf, lasso], ['xgb', 'rf', 'lasso']):
#         evaluate_classic_model(model, str_model, X_test, y_test, 'test')
#     with open('test_results_others.txt', 'a') as logs:
#         logs.write(f'==================\n')