Mount Google Drive


In [None]:
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


Get Data From CSV File

In [None]:
import pandas as pd
import numpy as np

# Dataset Length: 300201 
def get_csv_data(csv_path, feature_len = 32):
  """Load paired feature vectors and their labels from a header-less CSV file.

  Each row is read as `feature_len` left features, followed by `feature_len`
  right features, with the label taken from the last column. Rows are shuffled
  with NumPy's global RNG, so call `np.random.seed(...)` beforehand when a
  reproducible order is needed.

  Args:
    csv_path: path of the comma-separated file (no header row).
    feature_len: number of features on each side of the pair.

  Returns:
    Tuple `(left_data, right_data, labels)` of float64 arrays with shapes
    `(n_rows, feature_len)`, `(n_rows, feature_len)` and `(n_rows,)`.

  Raises:
    ValueError: if the file has fewer than `2*feature_len + 1` columns.
  """
  csv_data = pd.read_csv(csv_path, sep=',', header=None)

  # `np.float` was removed in NumPy 1.24; the builtin `float` selects the same
  # float64 dtype. `astype` returns a copy, so the in-place shuffle below never
  # touches the DataFrame's own buffer.
  total_data = csv_data.values.astype(float)

  min_columns = 2*feature_len + 1
  if total_data.shape[1] < min_columns:
    raise ValueError(
        'expected at least %d columns (2*feature_len features + label), got %d'
        % (min_columns, total_data.shape[1]))

  # Shuffle whole rows so that features and label stay aligned.
  np.random.shuffle(total_data)

  left_data = total_data[:, 0:feature_len]
  right_data = total_data[:, feature_len: 2*feature_len]
  shuffled_labels = total_data[:, -1]

  return left_data, right_data, shuffled_labels

Dataloader for the Dataset

In [None]:
import torch
import torchvision
import torch.utils.data as Data

class Dataloader(Data.Dataset):
  """Map-style dataset of (left features, right features, label) triples.

  Despite its name this is a `Dataset`; wrap it in `Data.DataLoader` to get
  batches.

  Args:
    left_arr: NumPy array of left-side features, one row per sample.
    right_arr: NumPy array of right-side features, one row per sample.
    labels: NumPy array with one label per sample.

  Raises:
    ValueError: if the three arrays do not hold the same number of samples.
  """

  def __init__(self, left_arr, right_arr, labels):
    # `super(Dataloader).__init__()` builds an *unbound* super object and never
    # reaches the base-class initialiser; the zero-argument form does.
    super().__init__()

    if not (len(left_arr) == len(right_arr) == len(labels)):
      raise ValueError(
          'left_arr, right_arr and labels must have the same length, got %d, %d and %d'
          % (len(left_arr), len(right_arr), len(labels)))

    # Features are stored as float32 and labels as int64.
    self.left_tensor = torch.from_numpy(left_arr).float()
    self.right_tensor = torch.from_numpy(right_arr).float()
    self.label = torch.from_numpy(labels).long()
    self.len = len(labels)

  def __len__(self):
    """Return the number of samples."""
    return self.len

  def __getitem__(self, idx):
    """Return the `(left, right, label)` tensors stored at position `idx`."""
    return (self.left_tensor[idx], self.right_tensor[idx], self.label[idx])

Contrastive Loss Function


In [None]:
import torch
import torch.nn
import torch.nn.functional as F
class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss on the Euclidean distance between two embeddings.

    For every pair the loss is
    `(1 - label) * d**2 + label * max(margin - d, 0)**2`, averaged over the
    batch. Pairs labelled 0 are therefore pulled together, while pairs
    labelled 1 are pushed apart until they are at least `margin` away.

    Args:
        margin: distance beyond which label-1 pairs stop contributing.
    """

    def __init__(self, margin=2.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the batch-mean contrastive loss for the given embeddings."""
        dist = F.pairwise_distance(output1, output2)

        # Squared distance term, active for label 0.
        pull_term = (1 - label) * dist.pow(2)

        # Squared hinge term, active for label 1.
        hinge = torch.clamp(self.margin - dist, min=0.0)
        push_term = label * hinge.pow(2)

        return (pull_term + push_term).mean()

Fully Connected Siamese NN Architecture

In [None]:
import torch
import torch.nn as nn
import torchvision
from torch import optim
import numpy as np

# input: [batch_size, in_channels, features]
class Late_Merge_Siamese(nn.Module):
  """Siamese encoder that returns both embeddings separately ("late merge").

  The left and right inputs go through the same Conv1d -> Conv1d -> MLP stack;
  comparing the two embeddings is left to the caller (e.g. a contrastive loss).
  Inputs are shaped [batch_size, 1, input_len].

  Args:
    hyperparam: dict providing 'cnn_1' and 'cnn_2' (conv output channels),
      'max_pool_2' (pooling size) and 'lin_1', 'lin_2', 'lin_3' (linear widths).
    activ_bool: forwarded to `nn.ReLU(inplace=...)`.
    input_len: length of each input sequence. The default of 32 with
      max_pool_2 == 2 reproduces the former hard-coded flattened size
      `14 * cnn_2`.
  """

  def __init__(self, hyperparam, activ_bool = True, input_len = 32):
    super().__init__()

    self.activ_func = nn.ReLU(inplace = activ_bool)

    # Two unpadded kernel-3 convolutions shorten the sequence by 4 in total and
    # MaxPool1d(p) floors the remaining length divided by p.
    pooled_len = (input_len - 4) // hyperparam['max_pool_2']

    self.cnn_1 = nn.Sequential(
      nn.Conv1d(1, hyperparam['cnn_1'], kernel_size = 3),
      self.activ_func,
    )

    self.cnn_2 = nn.Sequential(
      nn.Conv1d(hyperparam['cnn_1'], hyperparam['cnn_2'], kernel_size = 3),
      self.activ_func,
      # Must match the conv output channels; a literal 64 only works when
      # hyperparam['cnn_2'] happens to be 64.
      nn.BatchNorm1d(num_features = hyperparam['cnn_2']),
      nn.Dropout(0.2),
      nn.MaxPool1d(hyperparam['max_pool_2'])
    )

    self.linear = nn.Sequential(
      nn.Linear(pooled_len*hyperparam['cnn_2'], hyperparam['lin_1']),
      self.activ_func,
      nn.Linear(hyperparam['lin_1'], hyperparam['lin_2']),
      self.activ_func,
      nn.Dropout(0.2),
      nn.Linear(hyperparam['lin_2'], hyperparam['lin_3']),
    )

  def forward_once(self, x):
    """Encode one side of the pair into a `lin_3`-dimensional embedding."""
    x1 = self.cnn_1(x)
    x2 = self.cnn_2(x1)

    # Flatten channels and positions into one feature vector per sample.
    x2_out = x2.view(x2.size()[0], -1)
    out = self.linear(x2_out)

    return out

  def forward(self, left, right):
    """Return `(left_embedding, right_embedding)` from the shared encoder."""
    # The right branch is evaluated first, as before, so dropout draws and
    # batch-norm statistic updates happen in the same order.
    out2 = self.forward_once(right)
    out1 = self.forward_once(left)

    return out1, out2


Second Architecture


In [None]:
class Inter_Merge_Siamese(nn.Module):
  """Siamese classifier that merges the two branches before the output layer.

  Both inputs go through the same Conv1d -> Conv1d -> MLP encoder; the
  element-wise absolute difference of the two embeddings is then mapped to two
  class logits. Inputs are shaped [batch_size, 1, input_len].

  Args:
    hyperparam: dict providing 'cnn_1' and 'cnn_2' (conv output channels),
      'max_pool_2' (pooling size) and 'lin_1', 'lin_2', 'lin_3' (linear widths).
    activ_bool: forwarded to `nn.LeakyReLU(inplace=...)`.
    input_len: length of each input sequence. The default of 32 with
      max_pool_2 == 2 reproduces the former hard-coded flattened size
      `14 * cnn_2`.
  """

  def __init__(self, hyperparam, activ_bool = False, input_len = 32):
    super().__init__()

    self.activ_func = nn.LeakyReLU(inplace=activ_bool)

    # Two unpadded kernel-3 convolutions shorten the sequence by 4 in total and
    # MaxPool1d(p) floors the remaining length divided by p.
    pooled_len = (input_len - 4) // hyperparam['max_pool_2']

    self.cnn_1 = nn.Sequential(
      nn.Conv1d(1, hyperparam['cnn_1'], kernel_size = 3),
      self.activ_func,
    )

    self.cnn_2 = nn.Sequential(
      nn.Conv1d(hyperparam['cnn_1'], hyperparam['cnn_2'], kernel_size = 3),
      self.activ_func,
      # Must match the conv output channels; a literal 64 only works when
      # hyperparam['cnn_2'] happens to be 64.
      nn.BatchNorm1d(num_features = hyperparam['cnn_2']),
      nn.Dropout(0.2),
      nn.MaxPool1d(hyperparam['max_pool_2'])
    )

    self.linear = nn.Sequential(
      nn.Linear(pooled_len*hyperparam['cnn_2'], hyperparam['lin_1']),
      self.activ_func,
      nn.Linear(hyperparam['lin_1'], hyperparam['lin_2']),
      self.activ_func,
      nn.Dropout(0.2),
      nn.Linear(hyperparam['lin_2'], hyperparam['lin_3']),
      self.activ_func,
    )
    # Two logits, one per class.
    self.fc = nn.Linear(hyperparam['lin_3'], 2)

  def forward_once(self, x):
    """Encode one side of the pair into a `lin_3`-dimensional embedding."""
    x1 = self.cnn_1(x)
    x2 = self.cnn_2(x1)

    # Flatten channels and positions into one feature vector per sample.
    x2_out = x2.view(x2.size()[0], -1)
    lin_out = self.linear(x2_out)

    return lin_out

  def forward(self, left, right):
    """Return class logits of shape [batch_size, 2] for each (left, right) pair."""
    out1 = self.forward_once(left)
    out2 = self.forward_once(right)

    # Symmetric merge: the result does not depend on which input is "left".
    out_diff = torch.abs(out1 - out2)
    out = self.fc(out_diff)

    return out


In [None]:
class NoSiamese(nn.Module):
  """Single-branch baseline classifier over one concatenated input sequence.

  The input goes through Conv1d -> Conv1d -> MLP and a final linear layer that
  yields two class logits. Inputs are shaped [batch_size, 1, input_len].

  Args:
    hyperparam: dict providing 'cnn_1' and 'cnn_2' (conv output channels),
      'max_pool_2' (pooling size) and 'lin_1', 'lin_2', 'lin_3' (linear widths).
    activ_bool: forwarded to `nn.ReLU(inplace=...)`.
    input_len: length of the input sequence. The default of 64 with
      max_pool_2 == 2 reproduces the former hard-coded flattened size
      `30 * cnn_2`.
  """

  def __init__(self, hyperparam, activ_bool = False, input_len = 64):
    super().__init__()

    self.activ_func = nn.ReLU(inplace = activ_bool)

    # Two unpadded kernel-3 convolutions shorten the sequence by 4 in total and
    # MaxPool1d(p) floors the remaining length divided by p.
    pooled_len = (input_len - 4) // hyperparam['max_pool_2']

    self.cnn_1 = nn.Sequential(
      nn.Conv1d(1, hyperparam['cnn_1'], kernel_size = 3),
      self.activ_func,
    )

    self.cnn_2 = nn.Sequential(
      nn.Conv1d(hyperparam['cnn_1'], hyperparam['cnn_2'], kernel_size = 3),
      self.activ_func,
      # Must match the conv output channels; a literal 64 only works when
      # hyperparam['cnn_2'] happens to be 64.
      nn.BatchNorm1d(num_features = hyperparam['cnn_2']),
      nn.Dropout(0.2),
      nn.MaxPool1d(hyperparam['max_pool_2'])
    )

    self.linear = nn.Sequential(
      nn.Linear(pooled_len*hyperparam['cnn_2'], hyperparam['lin_1']),
      self.activ_func,
      nn.Linear(hyperparam['lin_1'], hyperparam['lin_2']),
      self.activ_func,
      nn.Dropout(0.2),
      nn.Linear(hyperparam['lin_2'], hyperparam['lin_3']),
      self.activ_func,
    )
    # Two logits, one per class.
    self.fc = nn.Linear(hyperparam['lin_3'], 2)


  def forward(self, x):
    """Return class logits of shape [batch_size, 2] for input `x`."""
    x1 = self.cnn_1(x)
    x2 = self.cnn_2(x1)
    # Flatten channels and positions into one feature vector per sample.
    x2_out = x2.view(x2.size()[0], -1)
    out_lin = self.linear(x2_out)
    out = self.fc(out_lin)

    return out

Install TensorboardX

In [None]:
# tensorboardX supplies the SummaryWriter that the training cell logs with.
!pip install TensorboardX

Collecting TensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/07/84/46421bd3e0e89a92682b1a38b40efc22dafb6d8e3d947e4ceefd4a5fabc7/tensorboardX-2.2-py2.py3-none-any.whl (120kB)
[K     |██▊                             | 10kB 21.0MB/s eta 0:00:01[K     |█████▍                          | 20kB 28.6MB/s eta 0:00:01[K     |████████▏                       | 30kB 22.8MB/s eta 0:00:01[K     |██████████▉                     | 40kB 18.2MB/s eta 0:00:01[K     |█████████████▋                  | 51kB 9.8MB/s eta 0:00:01[K     |████████████████▎               | 61kB 8.8MB/s eta 0:00:01[K     |███████████████████             | 71kB 10.0MB/s eta 0:00:01[K     |█████████████████████▊          | 81kB 10.9MB/s eta 0:00:01[K     |████████████████████████▌       | 92kB 11.1MB/s eta 0:00:01[K     |███████████████████████████▏    | 102kB 8.1MB/s eta 0:00:01[K     |██████████████████████████████  | 112kB 8.1MB/s eta 0:00:01[K     |████████████████████████████████| 12

Training the model

In [None]:
def threshold_sigmoid(t):
  """Binarise probabilities: 1.0 where `t` is strictly above 0.5, else 0.0.

  Returns a new float tensor with the shape of `t`; `t` is left untouched.
  """
  above_half = torch.gt(t, 0.5)
  return above_half.float()

def threshold_contrastive(input1, input2, margin = 2.0):
    """Label each pair 1.0 when its embeddings are closer than `margin`, else 0.0.

    The distance is the Euclidean norm of `input1 - input2` taken over
    dimension 1. The result is a float tensor of shape (n_pairs, 1).

    NOTE(review): ContrastiveLoss in this file pushes label-1 pairs apart
    (distance >= margin) and pulls label-0 pairs together, so "close -> 1"
    looks inverted relative to that loss -- confirm the intended label meaning.
    """
    squared_gap = (input1 - input2).pow(2).sum(dim=1)
    dist = squared_gap.sqrt()
    is_close = torch.lt(dist, margin)
    return is_close.float().view(-1, 1)

In [None]:
def count(T):
  """Return how many elements of tensor `T` are non-zero, as a Python number."""
  nonzero_total = torch.count_nonzero(T)
  return nonzero_total.item()

In [None]:
import torch
import torchvision
import torch.optim as optim
import torch.nn as nn
import time
import os
import torch.utils.data as Data
from tensorboardX import SummaryWriter
from tqdm import tqdm
from torch.autograd import Variable
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score

"""
To view the data written by tensorboardX
tensorboard --logdir <path of logs directory>
In my case, pathdir = 'logs/'
"""

#os.makedirs('/drive/MyDrive', SAVE_DIR, exist_ok=True)

# Layer sizes shared by the three model classes defined earlier in the notebook.
hyperparam = {
    'cnn_1': 32,  # output channels of the first Conv1d
    'cnn_2': 64,  # output channels of the second Conv1d
    'max_pool_1': 2,  # only referenced by a commented-out pooling layer
    'max_pool_2': 2,  # kernel size of the MaxPool1d after the second conv
    'lin_1': 512,  # width of the first linear layer
    'lin_2': 256,  # width of the second linear layer
    'lin_3': 128,      # width of the third linear layer
  }


def init_weights(model):
  """Re-draw every parameter of `model` in place from U(-0.08, 0.08)."""
  low, high = -0.08, 0.08
  for param in model.parameters():
    nn.init.uniform_(param.data, low, high)

def _run_batch(model, loss_fn, arch, left_vec, right_vec, label, contra_loss = True, predict = False):
  """Push one batch through `model` and return `(loss, preds)`.

  `preds` is a 1-D tensor of predicted labels on the model's device, or None
  when `predict` is False. Uses the notebook-level `device`.
  """
  # Add the channel dimension expected by Conv1d: [batch, 1, features].
  left_vec = torch.unsqueeze(left_vec, 1).to(device)
  right_vec = torch.unsqueeze(right_vec, 1).to(device)
  label = label.to(device)

  preds = None
  if arch == 1: # LATE_MERGE_SIAMESE
    out1, out2 = model(left_vec, right_vec)
    loss = loss_fn(out1, out2, label)
    if predict:
      if contra_loss:
        preds = threshold_contrastive(out1, out2)
      else:
        # Binarise so the result is a label rather than a raw probability.
        # NOTE(review): the sigmoid of a non-negative distance is never below
        # 0.5, so this branch can hardly separate the classes -- revisit.
        preds = threshold_sigmoid(torch.sigmoid(F.pairwise_distance(out1, out2)))
  else:
    if arch == 2: # NO_SIAMESE
      out = model(torch.cat((left_vec, right_vec), 2))
    else: # INTERMEDIATE_MERGE_SIAMESE
      out = model(left_vec, right_vec)
    loss = loss_fn(out, label)
    if predict:
      preds = torch.max(out, 1)[1]

  if preds is not None:
    # reshape(-1) keeps one dimension even for a batch of a single sample,
    # which squeeze() would collapse to a 0-d tensor.
    preds = preds.reshape(-1)
  return loss, preds


def train(left, right, labels, arch, contra_loss = True):
  """Train, validate and finally test one of the three architectures.

  The samples are split 70% / 15% / 15% (in their given order) into train,
  validation and test subsets. After training, the loss curves are plotted,
  the weights are saved to `HOME/fusional_snn.pt` and `test` is called on the
  held-out subset. Relies on the notebook-level `HOME`, `LOG_DIR`, `TIME`,
  `device` and `hyperparam`.

  Args:
    left: NumPy array of left-side features, one row per sample.
    right: NumPy array of right-side features, one row per sample.
    labels: NumPy array with one label per sample.
    arch: 1 = Late_Merge_Siamese (contrastive loss), 2 = NoSiamese,
      3 = Inter_Merge_Siamese (both cross-entropy).
    contra_loss: for arch 1, predict with `threshold_contrastive` when True,
      otherwise with a thresholded sigmoid of the embedding distance.

  Raises:
    KeyError: if `arch` is not 1, 2 or 3.

  The reported losses are per-sample means: each batch-mean loss is weighted
  by its batch size before dividing by the number of samples.
  """
  data = Dataloader(left, right, labels)
  logger = SummaryWriter(os.path.join(HOME, LOG_DIR, TIME + ': Code Clone'))
  dataset_len = len(labels)

  opt = {
      'batch_sz': 100,
      'lr': 0.0001,
      'epochs': 30,
      'train_len': int(0.70*dataset_len),
      'val_len': int(0.85*dataset_len),
      'test_len': int(dataset_len),
  }

  # Map to the classes so that only the requested model is constructed.
  architectures = {
    1: Late_Merge_Siamese,
    2: NoSiamese,
    3: Inter_Merge_Siamese,
  }
  # Follow `device` (which falls back to CPU) instead of forcing CUDA.
  model = architectures[arch](hyperparam).to(device)

  train_loader = Data.DataLoader(Data.Subset(data, range(opt['train_len'])), batch_size = opt['batch_sz'], shuffle = True)
  val_loader = Data.DataLoader(Data.Subset(data, range(opt['train_len'], opt['val_len'])), batch_size = opt['batch_sz'], shuffle = True)
  test_loader = Data.DataLoader(Data.Subset(data, range(opt['val_len'], opt['test_len'])), batch_size = opt['batch_sz'], shuffle = True)

  optimizer = optim.Adam(model.parameters(), lr = opt['lr'])

  if arch == 1:
    loss_fn = ContrastiveLoss()
  else:
    loss_fn = torch.nn.CrossEntropyLoss()

  train_loss_arr = []
  val_loss_arr = []

  print('-----------------BEGIN TRAINING-------------------')

  for epoch in range(opt['epochs']):
    model.train()
    # Snapshot of the first parameter, used below to check that it was updated.
    first_param_before = next(model.parameters()).detach().clone()
    train_loss = 0.0
    train_len = 0

    for left_vec, right_vec, label in train_loader:
      optimizer.zero_grad()
      loss, _ = _run_batch(model, loss_fn, arch, left_vec, right_vec, label, contra_loss)
      loss.backward()
      optimizer.step()

      batch_n = label.shape[0]
      train_loss += loss.item() * batch_n
      train_len += batch_n

    first_param_after = next(model.parameters()).detach()
    print('BOOL: ', torch.equal(first_param_before, first_param_after))
    epoch_train_loss = train_loss / max(train_len, 1)
    logger.add_scalar('Training_loss', epoch_train_loss, epoch+1)
    print()
    print('EPOCH: ', epoch, '---', epoch_train_loss)
    train_loss_arr.append(epoch_train_loss)

    print('---------------------------BEGIN VALIDATION---------------------------')

    val_loss = 0.0
    val_acc = 0.0
    val_len = 0
    first_batch = True

    model.eval()
    with torch.no_grad():
      for left_vec, right_vec, label in val_loader:
        # The loss is computed for every architecture, so arch 2 no longer
        # accumulates a stale loss left over from the training loop.
        loss, output_labels = _run_batch(model, loss_fn, arch, left_vec, right_vec, label, contra_loss, predict = True)

        label = label.reshape(-1)
        pred = output_labels.cpu().numpy()
        target = label.numpy()
        correct = float((pred == target).sum())

        if first_batch:
          # One-off diagnostics for the first validation batch of the epoch.
          print('OUT2: ', output_labels.shape)
          print('OUT3: ', label.shape)
          print('OUT_ONES: ', count(output_labels))
          print('OUT_LABELS: ', count(label))
          print('TORCH: ', correct)
          first_batch = False

        batch_n = label.shape[0]
        val_len += batch_n
        val_acc += correct
        val_loss += loss.item() * batch_n

    epoch_val_loss = val_loss / max(val_len, 1)
    epoch_val_acc = val_acc / max(val_len, 1)
    print(f'Epoch {epoch+0:03}: | Train Loss: {epoch_train_loss:.5f} | Val Loss: {epoch_val_loss:.5f} | Val Acc: {epoch_val_acc:.3f}')
    torch.cuda.empty_cache()
    val_loss_arr.append(epoch_val_loss)

  plt.figure(figsize=(10,5))
  plt.title("Training and Validation Loss")
  plt.plot(val_loss_arr,label="val")
  plt.plot(train_loss_arr,label="train")
  plt.xlabel("iterations")
  plt.ylabel("Loss")
  plt.legend()
  plt.show()
  logger.close()
  torch.save({'state_dict': model.state_dict()}, os.path.join(HOME, 'fusional_snn.pt'))
  print('TRAINING DONE')
  test(model, device, test_loader, arch, contra_loss)
  
      
if __name__ == '__main__':
   # Notebook-level settings read by train().
   LOG_DIR = 'logs'
   # Drive is mounted at /content/drive and the dataset is read from
   # /content/drive/MyDrive, so logs and checkpoints belong under that root;
   # '/drive/Mydrive' is a separate directory on the VM's local disk.
   HOME = '/content/drive/MyDrive'
   SAVE_DIR = 'save'
   # Timestamp that makes each run's log directory unique.
   TIME = time.strftime("%Y%m%d_%H%M%S")
   # Prefer the GPU and fall back to the CPU when CUDA is unavailable.
   device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def test(model, device, test_loader, arch, contra_loss):
  """Evaluate `model` on `test_loader` and print F1, precision, recall and accuracy.

  Args:
    model: trained network matching `arch`.
    device: torch device on which to run the evaluation.
    test_loader: iterable of `(left, right, label)` batches.
    arch: 1 = late-merge siamese, 2 = no siamese (concatenated input),
      anything else = intermediate-merge siamese.
    contra_loss: for arch 1, predict with `threshold_contrastive` when True,
      otherwise with a thresholded sigmoid of the embedding distance.
  """
  model.eval().to(device)
  y = {'Actual': [], 'Predicted': []}
  with torch.no_grad():
    for left_vec, right_vec, label in test_loader:
      # Add the channel dimension expected by Conv1d: [batch, 1, features].
      left_vec = torch.unsqueeze(left_vec, 1).to(device)
      right_vec = torch.unsqueeze(right_vec, 1).to(device)

      if arch == 1:
        out1, out2 = model(left_vec, right_vec)
        if contra_loss:
          output_labels = threshold_contrastive(out1, out2)
        else:
          # Binarise: raw sigmoid outputs are continuous, which the sklearn
          # classification metrics below reject.
          # NOTE(review): the sigmoid of a non-negative distance is never below
          # 0.5, so this branch can hardly separate the classes -- revisit.
          eucledian_distance = F.pairwise_distance(out1, out2)
          output_labels = threshold_sigmoid(torch.sigmoid(eucledian_distance))
      elif arch == 2:
        cat_vec = torch.cat((left_vec, right_vec), 2)
        out = model(cat_vec)
        output_labels = torch.max(out, 1)[1]
      else:
        out = model(left_vec, right_vec)
        output_labels = torch.max(out, 1)[1]

      # reshape(-1) keeps a 1-D result even for a batch of one sample, where
      # squeeze() would yield a scalar that list.extend() cannot consume.
      y['Actual'].extend(label.reshape(-1).tolist())
      y['Predicted'].extend(output_labels.reshape(-1).long().cpu().tolist())

  # zero_division=0 is applied to all three scores so a class that is never
  # predicted yields 0 consistently.
  print('\n f1 Score= %.4f' % f1_score(y['Actual'], y['Predicted'], zero_division=0))
  print('Precision= %.4f' % precision_score(y['Actual'], y['Predicted'], zero_division=0))
  print(' Recall= %.4f' % recall_score(y['Actual'], y['Predicted'], zero_division=0))

  print('\nAccuracy: %.4f' % accuracy_score(y['Actual'], y['Predicted']))

Main

In [None]:
if __name__ == '__main__':
  # Read the pair dataset from the mounted Drive; rows come back shuffled and
  # split into left features, right features and labels.
  csv_path = '/content/drive/MyDrive/Code Clone Detection/syntax_semantic.csv'
  left_numpy, right_numpy, labels = get_csv_data(csv_path)

In [None]:
# Make CUDA kernel launches synchronous so that device-side errors are reported
# at the call that caused them. A bare `CUDA_LAUNCH_BLOCKING=1` only binds a
# Python variable; the setting has to be placed in the process environment, and
# before the first CUDA call, to have any effect. Remove it once debugging is
# done, since synchronous launches slow training down.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

"""
Late Merge Siamese : 1
No Siamese : 2
Intermediate Merge Siamese: 3

"""

train(left_numpy, right_numpy, labels, 1)

-----------------BEGIN TRAINING-------------------


Extra: Label Distribution Check

In [None]:
def extra(labels):
  """Count the samples labelled 1 and all remaining samples.

  Args:
    labels: 1-D sequence of labels, or a 2-D array whose first column holds
      the label (the layout the element-wise `i[0]` lookup used to require).

  Returns:
    Tuple `(ones, zeros)` of Python ints; `zeros` counts every sample whose
    integer-truncated label is not 1.
  """
  arr = np.asarray(labels)
  if arr.ndim > 1:
    # Keep only the first column, matching the former `i[0]` access.
    arr = arr.reshape(arr.shape[0], -1)[:, 0]

  # astype(int) truncates towards zero, like int() did per element.
  ones = int(np.count_nonzero(arr.astype(int) == 1))
  zeros = int(arr.shape[0]) - ones
  return ones, zeros
# Report how many samples are labelled 1 versus everything else.
one, zero = extra(labels)
print('ones: ', one)
print('zeros: ', zero)

ones:  150100
zeros:  150101
