Imports needed for the Project

In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset

# Target device for tensors. NOTE(review): none of the visible cells move the model or
# the batches to this device, so changing it here alone has no effect — confirm intent.
device = torch.device('cpu')

Data Reading and Processing

In [17]:
# Seed for the row shuffle so that re-running this cell writes the same
# processed_data.csv every time (the original shuffle was unseeded).
shuffle_seed = 42

# Columns removed before training. The smell classes exceptionCatchingThrowing and
# dependentTest are removed because they have no values; 'project' and 'dataset' are
# removed so that only numerical columns remain for the neural network.
dropped_columns = [
    'commit', 'testClass', 'testMethod', 'testFilePath', 'productionFilePath',
    'relativeTestFilePath', 'relativeProductionFilePath', 'tsTestClass',
    'tsTestMethod', 'is_sampled', 'dependentTest', 'exceptionCatchingThrowing',
    'vocabulary', 'tokens_parser', 'keywords_parser', 'strings_parser',
    'string_type_parser', 'anotations_parser',
    'project', 'dataset',
]

# NOTE(review): reset_index() without drop=True inserts the old row index as an
# 'index' column. It is written to the CSV and is therefore read back as an input
# feature by the dataset classes. This is probably unintended, but dropping it would
# change the number of feature columns (and hence input_size), so it is kept here.
data = (
    pd.read_csv('sampled.csv')
    .reset_index()
    .drop(columns=dropped_columns)
    # Encode everything numerically: missing -> 0, booleans -> 1/0,
    # and the 'flaky' / 'nonflaky' strings -> 1 / 0.
    .replace(np.nan, 0)
    .replace(True, 1)
    .replace(False, 0)
    .replace('flaky', 1)
    .replace('nonflaky', 0)
)
print(data.head(5))

# Shuffle the rows (seeded for reproducibility), then persist the result for the
# dataset classes to load.
data = data.sample(frac=1, random_state=shuffle_seed)
data.to_csv("processed_data.csv", sep=",", index=False)

   index  loc  smellsCount  assertionRoulette  conditionalTestLogic  \
0      0   85            4                  1                     1   
1      1   18            4                  1                     1   
2      2   25            2                  0                     0   
3      3   21            4                  1                     0   
4      4   40            3                  1                     0   

   constructorInitialization  defaultTest  duplicateAssert  eagerTest  \
0                          0            0                0          1   
1                          0            0                0          0   
2                          0            0                0          0   
3                          0            0                0          1   
4                          0            0                1          0   

   emptyTest  ...  magicNumberTest  mysteryGuest  printStatement  \
0          0  ...                0             0               0  

Neural Network Hyper-Parameters

In [None]:
# --- Model dimensions ---
input_size = 23       # number of input features fed to the first linear layer
hidden_size = 100     # width of the hidden layer
num_classes = 6       # number of network outputs

# --- Optimisation settings ---
num_epochs = 10
batch_size = 300
learning_rate = 1e-4

DataLoader for the neural network pipeline

In [None]:
class LogDatasetTrain(Dataset):
    """Training dataset loaded from a purely numeric CSV file.

    Each row of the file is one sample. The trailing ``num_label_cols`` columns are
    the targets (``self.y``) and all preceding columns are the features (``self.x``).

    Args:
        path: CSV file to read. Its first line is skipped as a header.
        num_label_cols: how many trailing columns are targets (default 6, as before).

    Raises:
        ValueError: if ``num_label_cols`` is smaller than 1.
    """

    def __init__(self, path="./processed_data.csv", num_label_cols=6):
        if num_label_cols < 1:
            # A value of 0 would make the slice `-0:` select every column as a target.
            raise ValueError(f"num_label_cols must be >= 1, got {num_label_cols}")

        # ndmin=2 keeps a file with a single data row two-dimensional; without it
        # loadtxt returns a 1-D array and the column slicing below raises IndexError.
        xy = np.loadtxt(path, delimiter=",", dtype=np.float32, skiprows=1, ndmin=2)

        self.y = torch.from_numpy(xy[:, -num_label_cols:])  # trailing target columns
        self.x = torch.from_numpy(xy[:, :-num_label_cols])  # everything before them
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, targets)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of rows (samples) that were loaded."""
        return self.n_samples

In [None]:
class LogDatasetTest(Dataset):
    """Evaluation dataset loaded from a purely numeric CSV file.

    Each row of the file is one sample. The trailing ``num_label_cols`` columns are
    the targets (``self.y``) and all preceding columns are the features (``self.x``).

    NOTE(review): the default path points at ``./data/test_data.csv``; the cells
    visible in this notebook do not show where that file is produced — confirm it
    has the same column layout as the training CSV.

    Args:
        path: CSV file to read. Its first line is skipped as a header.
        num_label_cols: how many trailing columns are targets (default 6, as before).

    Raises:
        ValueError: if ``num_label_cols`` is smaller than 1.
    """

    def __init__(self, path="./data/test_data.csv", num_label_cols=6):
        if num_label_cols < 1:
            # A value of 0 would make the slice `-0:` select every column as a target.
            raise ValueError(f"num_label_cols must be >= 1, got {num_label_cols}")

        # ndmin=2 keeps a file with a single data row two-dimensional; without it
        # loadtxt returns a 1-D array and the column slicing below raises IndexError.
        xy = np.loadtxt(path, delimiter=",", dtype=np.float32, skiprows=1, ndmin=2)

        self.y = torch.from_numpy(xy[:, -num_label_cols:])  # trailing target columns
        self.x = torch.from_numpy(xy[:, :-num_label_cols])  # everything before them
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, targets)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of rows (samples) that were loaded."""
        return self.n_samples

Neural Network Class using PyTorch

In [None]:
class NeuralNet(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of features in each input row.
        hidden_size: number of units in the hidden layer.
        num_classes: number of values produced per input row.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the output of the second linear layer; no final activation is applied."""
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)

Loading Data to DataLoader

In [None]:
# Instantiate both datasets with their default CSV locations.
train_data = LogDatasetTrain()
test_data = LogDatasetTest()

# Training batches are shuffled; evaluation batches keep the file order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=False,
)

Creation of Neural Network

In [None]:
# Build the network from the hyper-parameters defined earlier in the notebook.
model = NeuralNet(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)

# Loss and optimizer: BCEWithLogitsLoss is applied to the network's raw outputs,
# and Adam updates all model parameters.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

Training Loop

In [None]:
# Number of batches per epoch; only used in the progress message.
n_total_steps = len(train_loader)

for epoch in range(num_epochs):
    for i, (features, targets) in enumerate(train_loader):
        # Forward pass: raw network outputs compared against float targets.
        logits = model(features)
        targets = targets.float()
        loss = criterion(logits, targets)

        # Backward pass: clear old gradients, back-propagate, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress on every 100th batch only.
        if (i + 1) % 100 != 0:
            continue
        print(f'epoch {epoch+1} / {num_epochs}, step {i+1}/{n_total_steps}, loss = {loss.item():.4f}')

Test Loop

In [None]:
# Evaluate the model on the test loader and print the accuracy in percent.
#
# A sample counts as correct when the index of the largest network output equals the
# index of the first label column whose value is exactly 1.0.
with torch.no_grad():  # no gradients are needed for evaluation
    n_correct = 0
    n_samples = 0
    for features, labels in test_loader:
        outputs = model(features)

        # Predicted class = position of the largest output in each row.
        _, predictions = torch.max(outputs, 1)

        # True class = first column holding 1.0. torch.max returns the first index
        # on ties, so on a 0/1 mask it yields the first positive column.
        is_positive = labels == 1.0
        _, true_class = torch.max(is_positive.int(), 1)

        # Rows with no 1.0 label have no true class and are counted as wrong.
        # The previous per-row if/elif chain appended nothing for such rows, which
        # shifted every later row and compared it against the wrong prediction.
        has_label = is_positive.any(dim=1)

        n_correct += int(((predictions == true_class) & has_label).sum().item())
        n_samples += labels.shape[0]

    # An empty loader previously raised ZeroDivisionError; report NaN instead.
    acc = 100.0 * n_correct / n_samples if n_samples else float('nan')
    print(f'Accuracy = {acc}')