In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
Main_dir = '/content/drive/MyDrive/NLP/GAN_for_text_generation/'
Data_dir = '/content/drive/MyDrive/NLP/GAN_for_text_generation/Software_engineering_Datasets/SentimentAnalysis/'
Data_dir_replication = '/replication/dataset/'

In [4]:
Data_files_names = ['oracle.xlsx', 'BenchmarkUddinSO-ConsoliatedAspectSentiment.xls', 'github_gold.csv','AppReviews.csv', 'JIRA.csv', 'StackOverflow.csv']
Data_files = {'CR': Data_dir + Data_files_names[0],
              'API': Data_dir + Data_files_names[1],
              'SO': Data_dir + Data_dir_replication  + Data_files_names[5],
              'JIRA': Data_dir + Data_dir_replication  + Data_files_names[4],
              'AppReviews':Data_dir + Data_dir_replication  + Data_files_names[3]}


In [5]:
import os
os.chdir(Main_dir)
!pwd


/content/drive/MyDrive/NLP/GAN_for_text_generation


In [6]:
import os
import os.path
import json
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim import lr_scheduler

import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

# Load Data

In [7]:
data = pd.read_csv(Data_files['SO'])
data.shape

(1500, 3)

In [8]:
data['oracle'].value_counts()

 0    1191
-1     178
 1     131
Name: oracle, dtype: int64

In [9]:
data['oracle_new'] = data['oracle'] + 1

In [10]:
data.head()

Unnamed: 0,id,text,oracle,oracle_new
0,6,But sadly this is not working.,-1,0
1,78,"So, everything builds fine, but when we try to...",-1,0
2,90,That is what is causing your null pointer exce...,-1,0
3,139,"All attempts I've made were, in a shortcut, un...",-1,0
4,162,Don't use.,-1,0


In [11]:
data['oracle_new'].value_counts()

1    1191
0     178
2     131
Name: oracle_new, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['oracle_new'],
                                                    stratify= data['oracle_new'], 
                                                    test_size=0.25)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1125,) (375,) (1125,) (375,)


In [13]:
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([134, 893,  98]))

In [14]:
np.unique(y_test, return_counts=True)

(array([0, 1, 2]), array([ 44, 298,  33]))

# Set the hyperparameters

In [37]:
train_maxlen = 140
dev_maxlen = 140
batch_size = 20 #16
epochs = 10 # 10
bert_model = 'bert-base-uncased'
learning_rate = 3e-5

In [38]:
class Tokenize_dataset:
  """
  This class tokenizes the dataset using bert tokenizer
  """

  def __init__(self, text, targets, tokenizer, max_len):
    self.text = text
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.targets = targets

  def __len__(self):
    return len(self.targets)

  def __getitem__(self, item):
    text = str(self.text[item])
    targets = self.targets[item]
    """
    Using encode_plus instead of encode as it helps to provide additional information that we need
    """
    inputs = self.tokenizer.encode_plus(
        str(text),
        add_special_tokens = True,
        max_length = self.max_len,
        pad_to_max_length = True
    )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    return {
        "ids": torch.tensor(ids, dtype=torch.long),
        "mask": torch.tensor(mask, dtype=torch.long),
        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        "targets": torch.tensor(targets, dtype=torch.long)
    }

In [39]:
def loss_function(outputs, targets):
	"""
	This function defines the loss function we use in the model which since is multiclass is crossentropy
	"""
	return nn.CrossEntropyLoss()(outputs, targets)

In [40]:
def train_function(data_loader, model, optimizer, device):
  """
  Function defines the training that we will happen over the entire dataset
  """
  model.train()

  running_loss = 0.0
  """
  looping over the entire training dataset
  """
  for i, data in enumerate(data_loader):
    mask = data["mask"].to(device, dtype=torch.long)
    ids = data["ids"].to(device, dtype=torch.long)
    token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
    target = data["targets"].to(device, dtype=torch.long)
    optimizer.zero_grad()

    output = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    # print(output, target)
    loss = loss_function(output, target)
    loss.backward()
    optimizer.step()
    """
    calculating loss and running loss
    """
    running_loss += loss.item()
    if i % 10 == 0 and i!=0:
      temp = f'Batch index = {i}\tRunning Loss = {running_loss/10}'
      print(temp)
      running_loss = 0.0

In [41]:
def eval_function(data_loader, model, device):
  """
  This function defines the loop over the dev set.
  """
  model.eval()
  correct_labels = 0
  tot = 0
  """
  no_grad as this is evaluation set and we dont want the model to update weights
  """
  with torch.no_grad():
    for i, data in enumerate(data_loader):
      mask = data["mask"].to(device, dtype=torch.long)
      ids = data["ids"].to(device, dtype=torch.long)
      token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
      targets = data["targets"].to(device, dtype=torch.long)
      outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

      max_probs, predicted = torch.max(outputs, 1)
      tot = tot + targets.size(0)
      correct_labels = correct_labels + torch.sum(predicted==targets)

      print(f"Batch Index: {i}\tPredicted: {predicted}\tTargets: {targets}")
    """
    basic metrics for accuracy calculation
    """
    accuracy = correct_labels / tot * 100
    print(accuracy)
  return accuracy

In [42]:
class CompleteModel(nn.Module):
  """
  The model architecture is defined here which is a fully connected layer + normalization on top of a BERT model
  """

  def __init__(self, bert):
    super(CompleteModel, self).__init__()
    self.bert = BertModel.from_pretrained(bert)
    self.drop = nn.Dropout(p=0.25)
    self.out = nn.Linear(self.bert.config.hidden_size, 3) # Number of output classes = 3, positive, negative and N(none)

  def forward(self, ids, mask, token_type_ids):
    _, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
    output = self.drop(pooled_output)
    return self.out(output)

In [43]:
def run(train_text, train_target):
  # training_set_path = "/content/drive/MyDrive/NLP_disaster/train.csv"
  #   #validation_set_path = '/content/drive/MyDrive/dataset/dev/' + str(location) + '_' + str(aspect) + '.csv'
  # df_train = pd.read_csv(training_set_path)
    #df_valid = pd.read_csv(validation_set_path)
      
  # df_train['target'] = df_train['target']
   # df_valid['target'] = df_valid['target'].map(sentiment_mapping)
  # df_train = df_train.reset_index(drop=True)
   # df_valid = df_valid.reset_index(drop=True)
  tokenizer = BertTokenizer.from_pretrained(bert_model)
  train_dataset = Tokenize_dataset(
        text = train_text.values,
        targets = train_target.values,
        tokenizer = tokenizer,
        max_len = train_maxlen
  )
  class_counts = []
  unique_labels = np.unique(y_train.values)
  for i in unique_labels:
    class_counts.append(train_text.values[train_target.values==i].shape[0])
  print(f"Class Counts: {class_counts}")
      
  num_samples = sum(class_counts)
  print(num_samples)
  labels = train_target.values
  class_weights = {}
  for i in range(len(class_counts)):
      if class_counts[i] != 0:
          class_weights[unique_labels[i]] = (num_samples/class_counts[i])
      else:
          class_weights[unique_labels[i]] = 0
  print('Class weights', class_weights)
  weights = [class_weights[labels[i]] for i in range(int(num_samples))]
  sampler = torch.utils.data.sampler.WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))
  train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = batch_size,
        shuffle = False,
        sampler = sampler
    )
  #valid_dataset = Tokenize_dataset(
    #    text = df_valid['text'].values,
     #   targets = df_valid['sentiment'].values,
    #   tokenizer = tokenizer,
     #   max_len = dev_maxlen
   # )
   # valid_data_loader = torch.utils.data.DataLoader(
    #    valid_dataset,
     #   batch_size = batch_size,
   #     shuffle = False
#    )
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")
  model = CompleteModel(bert_model).to(device)
  optimizer = AdamW(model.parameters(), lr=learning_rate)
  scheduler = lr_scheduler.StepLR(
        optimizer,
        step_size = 1,
        gamma = 0.8
    )
  for epoch in range(epochs):
    train_function(data_loader=train_data_loader, model=model, optimizer=optimizer, device=device)
        #accuracy = eval_function(data_loader=valid_data_loader, model=model, device=device, location=location, aspect=aspect)
    print("\nEpoch = "+ str(epoch))
    print("\nLearning Rate = " + str(scheduler.get_lr()[0])+"\n")
    scheduler.step()
    torch.save(model, Main_dir + '/Models/'+ '/'+ str(epoch) + '.bin')
  


# Train

In [44]:
run(X_train, y_train)

Class Counts: [134, 893, 98]
1125
Class weights {0: 8.395522388059701, 1: 1.2597984322508398, 2: 11.479591836734693}
Device: cuda:0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

Batch index = 10	Running Loss = 1.1949561715126038
Batch index = 20	Running Loss = 1.0266030788421632
Batch index = 30	Running Loss = 0.9307244122028351
Batch index = 40	Running Loss = 0.7963219523429871
Batch index = 50	Running Loss = 0.6513383984565735

Epoch = 0

Learning Rate = 3e-05

Batch index = 10	Running Loss = 0.4502914726734161
Batch index = 20	Running Loss = 0.29302823543548584
Batch index = 30	Running Loss = 0.26442293301224706
Batch index = 40	Running Loss = 0.2696003369987011
Batch index = 50	Running Loss = 0.15263351425528526

Epoch = 1

Learning Rate = 1.9200000000000003e-05

Batch index = 10	Running Loss = 0.20274244770407676
Batch index = 20	Running Loss = 0.143890193849802
Batch index = 30	Running Loss = 0.10856764428317547
Batch index = 40	Running Loss = 0.0875805677846074
Batch index = 50	Running Loss = 0.08162803947925568

Epoch = 2

Learning Rate = 1.5360000000000002e-05

Batch index = 10	Running Loss = 0.08434091433882714
Batch index = 20	Running Loss = 0.07323

# Test

In [47]:
from sklearn.metrics import classification_report

In [50]:
MAX_LEN = 140

tokenizer = BertTokenizer.from_pretrained(bert_model)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# df = pd.read_csv("/content/drive/MyDrive/NLP_disaster/test.csv")
for epoch in range(epochs-1):

  result = []
  idees = []
  model = torch.load(Main_dir + '/Models/'+ '/'+ str(epoch) + '.bin')
  for i in range(len(X_test)):
    id_test = X_test.keys()[i]
    text = X_test.values[i]


    inputs = tokenizer.encode_plus(
            str(text),
            add_special_tokens = True,
            max_length = MAX_LEN,
            pad_to_max_length = True,
        )
    ids = torch.tensor(inputs["input_ids"], dtype=torch.long).unsqueeze(0)
    mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).unsqueeze(0)
    token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).unsqueeze(0)

    ids = ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)

  #model = models_set[f"{location}{aspect}"]
    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    prob_max, predicted = torch.max(outputs, 1)

    predicted = predicted.detach().cpu().numpy()

          # Add the predicted to the json only if it is not N(none)
          # Reverse mapping from numbers to sentiments
    idees.append(id_test)
    result.append(predicted[0])
    # 1 positive, 0 neutral, -1 negative 
  target_names = ['Class Negative', 'Class Neutral', 'Class Positive']
  print('epoch', epoch)
  print(classification_report(y_test, result,target_names = target_names))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


epoch 0
                precision    recall  f1-score   support

Class Negative       0.62      0.52      0.57        44
 Class Neutral       0.93      0.72      0.81       298
Class Positive       0.22      0.73      0.34        33

      accuracy                           0.70       375
     macro avg       0.59      0.66      0.57       375
  weighted avg       0.83      0.70      0.74       375

epoch 1
                precision    recall  f1-score   support

Class Negative       0.63      0.66      0.64        44
 Class Neutral       0.93      0.89      0.91       298
Class Positive       0.50      0.67      0.57        33

      accuracy                           0.84       375
     macro avg       0.69      0.74      0.71       375
  weighted avg       0.86      0.84      0.85       375

epoch 2
                precision    recall  f1-score   support

Class Negative       0.65      0.70      0.67        44
 Class Neutral       0.93      0.90      0.91       298
Class Positive   