In [1]:
import tensorflow as tf
# Getting GPU device name.
device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import torch
# If a GPU is available
if torch.cuda.is_available():    
    #set device to GPU   
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If no GPU is available
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [3]:
!pip install transformers

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy
import seaborn        as sns

import transformers
from transformers                     import  RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from sklearn.metrics                  import classification_report
from sklearn.metrics                  import confusion_matrix
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.utils                    import class_weight

from torch                            import nn, optim
from torch.utils                      import data
from sklearn.decomposition            import PCA




In [4]:
#Seeding for deterministic results
RANDOM_SEED = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED) 
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = False

CLASS_NAMES = ['0','1']
# CLASS_NAMES =['Non-ADU','ADU']


In [5]:
MAX_LENGTH = 200                                    
BATCH_SIZE = 16
EPOCHS = 5
HIDDEN_UNITS = 128

tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-large')  #Use roberta-large or roberta-base

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
train_df=pd.read_csv(F"/content/gdrive/My Drive/SAMBIT/master_thesis/data/train.tsv", sep='\t', header=None, index_col=False)
dev_df=pd.read_csv(F"/content/gdrive/My Drive/SAMBIT/master_thesis/data/dev.tsv", sep='\t', header=None, index_col=False)
test_df=pd.read_csv(F"/content/gdrive/My Drive/SAMBIT/master_thesis/data/test.tsv", sep='\t', header=None, index_col=False)

large_train_df=pd.read_csv(F"/content/gdrive/My Drive/SAMBIT/master_thesis/data/noisy_train.tsv", sep='\t', header=None, index_col=False)

In [8]:
# Preprocess

col = {0:"sentences", 1:"label0", 2:"label" }

train_df = train_df.rename(columns = col)
dev_df = dev_df.rename(columns = col)
test_df = test_df.rename(columns = col)
large_train_df = large_train_df.rename(columns = col)

train_df = train_df.drop(columns=['label0'])
dev_df = dev_df.drop(columns=['label0'])
test_df = test_df.drop(columns=['label0'])
large_train_df = large_train_df.drop(columns=['label0'])

In [9]:
#Creates a dataset which will be used to feed to RoBERTa
class BiasDataset(data.Dataset):
  def __init__(self,sentences, labels, tokenizer, max_len):

    # def __init__(self,firstSeq,sentences, labelValue,  tokenizer, max_len):
#     def __init__(self, firstSeq, secondSeq, sentences, labelValue,  tokenizer, max_len):
        # self.firstSeq    = firstSeq      #First input sequence that will be supplied to RoBERTa
        # self.secondSeq   = secondSeq     #Second input sequence that will be supplied to RoBERTa
        self.sentences = sentences   #Concatenation of reply+ previous+ src text to get features from 1 training example
        # self.Features = Features
        self.labels  = labels    #label value for each training example in the dataset
        self.tokenizer   = tokenizer     #tokenizer that will be used to tokenize input sequences (Uses BERT-tokenizer here)
        self.max_len     = max_len       #Maximum length of the tokens from the input sequence that BERT needs to attend to

  def __len__(self):
        return len(self.labels)

  def __getitem__(self, item):
        # firstSeq    = str(self.firstSeq[item])
        # secondSeq   = str(self.secondSeq[item])
        sentences = str(self.sentences[item])
        # Features = str(self.Features[item])

    #Encoding the first and the second sequence to a form accepted by RoBERTa
    #RoBERTa does not use token_type_ids to distinguish the first sequence from the second sequnece.
        encoding = tokenizer.encode_plus(
            # firstSeq,
            # secondSeq,
            sentences,
            # Features,
            max_length = self.max_len,
            add_special_tokens= True,
            truncation = True,
            pad_to_max_length = True,
            # padding=True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        return {
            # 'firstSeq' : firstSeq,
            # 'secondSeq' : secondSeq,
            'sentences': sentences,
            # 'Features' : Features,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels'  : torch.tensor(self.labels[item], dtype=torch.long)
        }


In [10]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size):
    ds = BiasDataset(
        # firstSeq    = dataframe.Topic.to_numpy(),
        # secondSeq   = dataframe.Topic.to_numpy(),
        sentences = dataframe.sentences.to_numpy(),
        # Features = dataframe.Features.to_numpy(),
        labels  = dataframe.label.to_numpy(),
        tokenizer   = tokenizer,
        max_len     = max_len
    )

    return data.DataLoader(
        ds,
        batch_size  = batch_size,
        shuffle     = True,
        num_workers = 2
    )

In [11]:
#Creating data loader for training data
trainDataLoader        = createDataLoader(train_df, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for development data
developmentDataLoader  = createDataLoader(dev_df, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for test data
testDataLoader         = createDataLoader(test_df, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [12]:
# train_df

In [12]:
#Instantiating the tf-idf vectorizer object
tfidf = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))

x_train = train_df['sentences'].tolist()
y_train = train_df['label'].tolist()

x_train_feats = tfidf.fit(x_train)
print('x_train_feats: ',x_train_feats)
print('length: ',len(x_train_feats.get_feature_names()))

x_train_transform = x_train_feats.transform(x_train)
tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(x_train_transform)).float()
print('x_train_transform.shape: ',x_train_transform.shape)

pca = PCA(n_components=128)
p = pca.fit(tfidf_transform_tensor)
# print(p.shape)
print(p)
X = p.transform(tfidf_transform_tensor)
# torch.from_numpy(X.values)
X = torch.from_numpy(X)
# tfidf_transform_tensor_pca = torch.tensor(scipy.sparse.csr_matrix.todense(X)).float()
print(X.type())
print(X.shape)
print(X)

x_train_feats:  TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2))
length:  2402




x_train_transform.shape:  (5028, 2402)
PCA(n_components=128)
torch.DoubleTensor
torch.Size([5028, 128])
tensor([[-0.0317, -0.0838, -0.0244,  ..., -0.0452, -0.0785, -0.0181],
        [-0.0253, -0.0745, -0.0191,  ..., -0.0204, -0.0470, -0.0255],
        [ 0.1365,  0.0091, -0.0550,  ...,  0.0352, -0.0274, -0.0181],
        ...,
        [ 0.0142,  0.0354,  0.0247,  ..., -0.0164, -0.0534, -0.0472],
        [-0.0094, -0.0126,  0.0431,  ...,  0.0109, -0.0463,  0.0152],
        [-0.0053, -0.0172,  0.0506,  ..., -0.0160, -0.0282,  0.0070]],
       dtype=torch.float64)


In [13]:
#This class defines the model that was used to pre-train a SNN on TF-IDF features
class Tfidf_Nn(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(len(tfidf.get_feature_names()), HIDDEN_UNITS)
        # Output layer
        self.output  =  nn.Linear(HIDDEN_UNITS, 3)
        self.dropout = nn.Dropout(0.1)
        
        # Defining tanh activation and softmax output 
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Pass the input tensor through each of our operations
        x = self.hidden(x)
        #print(x.shape)
        y = self.tanh(x)
        #print(y.shape)
        z = self.dropout(y)
        #print(z.shape)
        z = self.output(z)
        #print(z.shape)
        z = self.softmax(z)
        
        #Returning the ouputs from the hidden layer and the final output layer
        return  y, z

In [14]:
snnmodel = Tfidf_Nn()

model_save_name = 'bias_classify_roberta_tfidf.pt'
path = F"{model_save_name}"

# snnmodel.load_state_dict(torch.load(path))
snnmodel.eval()



Tfidf_Nn(
  (hidden): Linear(in_features=2402, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (tanh): Tanh()
  (softmax): Softmax(dim=1)
)

In [15]:
'''This class defines the model that will be used for 
training and testing on the dataset.

Adapted from huggingFace
This RoBERTa model from huggingface outputs the last hidden states
and the pooled output by default. Pooled output is the classification 
token (1st token of the last hidden state) further processed by a Linear
layer and a Tanh activation function.

The pre-trained RoBERTa model is used as the primary model.
This class experiments with RoBERTa and its ensemble with TF-IDF features. 
roberta-only :            No ensembling. This just fine-tunes the RoBERTa model. 
                          The pooled output is passed through a linear layer and 
                          softmax function is finally used for preictions. 

roberta-tfIdf :           This model conatenates the 1st token of last-hidden layer
                          from RoBERTa with TF-IDF features. Various ways of this 
                          concatenation was experimented (using pooled output instead
                          of 1st token of last hidden layer etc)

roberta-pcaTfidf :        This model concatenates the pooled output from
                          RoBERTa with the PCA transformed vector.

roberta-preTrainedTfIdf : This model concatenates the pooled output from
                          RoBERTa with the hidden layer output from a pre-trained
                          SNN that was trained on TF-IDF features.

Used dropout to prevent over-fitting.'''

class BiasClassifier(nn.Module):

    def __init__(self,  n_classes):
        super(BiasClassifier, self).__init__()
        self.robertaModel              = RobertaModel.from_pretrained('roberta-large')    #use roberta-large or roberta-base
        self.model_TFIDF               = snnmodel                                        #Pre-trained SNN trained with TF-IDF features

        self.drop                      = nn.Dropout(p = 0.3)

        self.output                    = nn.Linear(self.robertaModel.config.hidden_size, n_classes)

        self.input_size_tfidf_only     = self.robertaModel.config.hidden_size + len(tfidf.get_feature_names())
        self.input_size_tfidf_pca      = self.robertaModel.config.hidden_size + HIDDEN_UNITS

        self.dense                     = nn.Linear( self.input_size_tfidf_only,  self.input_size_tfidf_only)
        self.out_proj                  = nn.Linear( self.input_size_tfidf_only, n_classes)
        self.out_pca                   = nn.Linear( self.input_size_tfidf_pca, n_classes)

        self.input_size_preTrain_tfidf = self.robertaModel.config.hidden_size +  HIDDEN_UNITS 
        self.out                       = nn.Linear(self.input_size_preTrain_tfidf, n_classes)

        self.softmax                   = nn.Softmax(dim = 1)

    def forward(self, input_ids, attention_mask, inputs_tfidf_feats, pca_transformed_feats, modelType):
        roberta_output     = self.robertaModel(
            input_ids      = input_ids,               #Input sequence tokens
            attention_mask = attention_mask )         #Mask to avoid performing attention on padding tokens
    #print(roberta_output[1].shape)
        if modelType   == 'roberta-only':
            pooled_output = roberta_output[1]           #Using pooled output
            output        = self.drop(pooled_output)
            output        = self.output(output)

        elif modelType == 'roberta-tfIdf':
            soutput = roberta_output[1]#---------        experimenting with pooled output 
            #soutput = roberta_output[0][:, 0, :]        #taking <s> token (equivalent to [CLS] token in BERT)
            x       = torch.cat((soutput, inputs_tfidf_feats) , dim=1)
            x       = self.drop(x)
            output  = self.out_proj(x)

        elif modelType == 'roberta-pcaTfidf':
            soutput = roberta_output[1]
            x       = torch.cat((soutput, pca_transformed_feats) , dim=1)
            x       = self.drop(x)
            output  = self.out_pca(x)

        elif modelType == 'roberta-TrainedTfIdf':
            tfidf_hidddenLayer, tfidf_output = self.model_TFIDF(inputs_tfidf_feats)
            #print(tfidf_hidddenLayer.shape)
            #print(tfidf_output.shape)

          #Conactenating pooled output from RoBERTa with the hidden layer from the pre-trained SNN using TF-IDF features. 
          #pooled_output = torch.cat((roberta_output[1], tfidf_output) , dim=1)-------- Experimenting with Output of pre-trained SNN 
            pooled_output = torch.cat((roberta_output[1], tfidf_hidddenLayer) , dim=1)
            output        = self.drop(pooled_output)
            output        = self.out(output)

        return self.softmax(output)


In [16]:
#Instantiating a BiasClassifier object as our model and loading the model onto the GPU.
model = BiasClassifier(len(CLASS_NAMES))
model = model.to(device)
#print(model)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# print(model)

In [17]:
'''Using the same optimiser as used in BERT paper
with a different learning rate'''
optimizer = AdamW(model.parameters(), 
                  lr = 2e-6, 
                  # lr = 1e-5,
                  correct_bias= False)

totalSteps = len(trainDataLoader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps = totalSteps
)

'''Using class-weights to accomodate heavily imbalanced data. 
These weights were learnt by running several experiments using 
other weights and the weights that produced the best results have
finally been used here'''

weights      = [0.8, 1.2]
# weights = class_weights
class_weights = torch.FloatTensor(weights)

# class_weights = class_weight.compute_class_weight(
#                                         class_weight = "balanced",
#                                         classes = np.unique(y_train),
#                                         y = y_train                                                    
#                                     )
print(class_weights)


lossFunction = nn.CrossEntropyLoss(weight = class_weights).to(device)

tensor([0.8000, 1.2000])




In [18]:
#This function is used for training the model with 'roberta-TrainedTfIdf'. 
def train_epoch(
  model,
  dataLoader,
  lossFunction,
  optimizer,
  device,
  scheduler,
  n_examples
):
    model = model.train()
    losses = []
    correctPredictions = 0

    for d in dataLoader:
    
        input_ids              = d["input_ids"].to(device)                           #Loading input ids to GPU
        attention_mask         = d["attention_mask"].to(device)                      #Loading attention mask to GPU
        labels            = d["labels"].to(device)                          #Loading label value to GPU
        sentences            = d["sentences"]
        # Features               = d["Features"]                                  
        tfidf_transform        = x_train_feats.transform(sentences)
        tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()   
        pca_tensor             = p.transform(tfidf_transform_tensor)

        pca_tensor = torch.from_numpy(pca_tensor).float()
        pca_tensor = pca_tensor.to(device)
        tfidf_transform_tensor = tfidf_transform_tensor.to(device)

        #Getting the output from our model (Object of StanceClassification class) for train data
        outputs = model(
          input_ids             = input_ids,
          attention_mask        = attention_mask,
          inputs_tfidf_feats    = tfidf_transform_tensor,
          pca_transformed_feats = pca_tensor,
          modelType             = 'roberta-TrainedTfIdf'
        )

        #Determining the model predictions
        _, predictionIndices = torch.max(outputs, dim=1)
        loss = lossFunction(outputs, labels)

        #Calculating the correct predictions for accuracy
        correctPredictions += torch.sum(predictionIndices == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses), correctPredictions.double() / n_examples

In [19]:
#This function is used for evaluating the model on the development and test set
def eval_model(
    model, 
    dataLoader, 
    lossFunction,
    device,
    n_examples
    ):
    model = model.eval()
    losses = []
    correctPredictions = 0

    with torch.no_grad():
        for d in dataLoader:
            input_ids              = d["input_ids"].to(device)                          #Loading input ids to GPU
            attention_mask         = d["attention_mask"].to(device)                     #Loading attention mask to GPU
            labels            = d["labels"].to(device)                         #Loading label values to GPU
            sentences            = d["sentences"]
            tfidf_transform        = x_train_feats.transform(sentences)
            tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()    

            pca_tensor             = p.transform(tfidf_transform_tensor)

            pca_tensor = torch.from_numpy(pca_tensor).float()
            pca_tensor = pca_tensor.to(device)
            tfidf_transform_tensor = tfidf_transform_tensor.to(device)

            #Getting the softmax output from model for dev data
            outputs = model(
            input_ids             = input_ids,
            attention_mask        = attention_mask,
            inputs_tfidf_feats    = tfidf_transform_tensor,
            pca_transformed_feats = pca_tensor,
            modelType             = 'roberta-pcaTfidf'
            )

            #Determining the model predictions
            _, predictionIndices = torch.max(outputs, dim=1)
            loss = lossFunction(outputs, labels)

            #Calculating the correct predictions for accuracy
            correctPredictions += torch.sum(predictionIndices == labels)
            losses.append(loss.item())

    return np.mean(losses), correctPredictions.double() / n_examples


In [20]:
#fine tuning ROBERTa and validating it 

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}')
    trainLoss, trainAccuracy = train_epoch(
        model,
        trainDataLoader,
        lossFunction,
        optimizer,
        device,
        scheduler,
        len(train_df)
      )
    print(f'Training loss {trainLoss} Training accuracy {trainAccuracy}')
    devLoss, devAccuracy = eval_model(
        model,
        developmentDataLoader,
        lossFunction,
        device,
        len(dev_df)
      )
    print(f'Development loss {devLoss} Development accuracy {devAccuracy}')
    # from google.colab import drive
    # drive.mount('/content/gdrive')
    
    # model_save_name = f'Roberta_tfidf_{epoch}.pt'
    # path = F"/content/gdrive/My Drive/{model_save_name}" 
    # torch.save(model.state_dict(), path)
    print()
    print()

Epoch 1




Training loss 0.6942259353304666 Training accuracy 0.5065632458233891




KeyboardInterrupt: ignored

In [21]:
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader):

    model = model.eval()
    review_texta = []
#     review_textb = []               #     !! Change - commented
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:

            sentences                 = d["sentences"]
#             textbs                 = d["secondSeq"]
            input_ids              = d["input_ids"].to(device)
            attention_mask         = d["attention_mask"].to(device)
            labels                 = d["labels"].to(device)
            # Features            = d["Features"]
            tfidf_transform        = tfidf.transform(sentences)
            tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()

            pca_tensor             =  p.transform(tfidf_transform_tensor)

            pca_tensor = torch.from_numpy(pca_tensor).float()
            pca_tensor = pca_tensor.to(device)
            tfidf_transform_tensor = tfidf_transform_tensor.to(device)

            #Getting the softmax output from model
            outputs = model(
                input_ids             = input_ids,
                attention_mask        = attention_mask,
                inputs_tfidf_feats    = tfidf_transform_tensor,
                pca_transformed_feats = pca_tensor,
                modelType             = 'roberta-TrainedTfIdf'
                )
            _, preds = torch.max(outputs, dim=1)     #Determining the model predictions

            review_texta.extend(sentences)
#             review_textb.extend(textbs)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(labels)
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
  
    return review_texta, predictions, prediction_probs, real_values
#    return review_texta, review_textb, predictions, prediction_probs, real_values

In [23]:
#Getting model predictions on dev dataset
# firstSeq_dev, secondSeq_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
#   model,
#   developmentDataLoader
# )

firstSeq_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  model,
  developmentDataLoader
)



In [24]:
 #Printing classification report for dev dataset (Evaluating the model on Dev set)
print(classification_report(yTest_dev, yHat_dev, target_names= CLASS_NAMES))
print(confusion_matrix(yTest_dev, yHat_dev))

              precision    recall  f1-score   support

           0       0.83      0.02      0.03       635
           1       0.41      1.00      0.58       431

    accuracy                           0.41      1066
   macro avg       0.62      0.51      0.30      1066
weighted avg       0.66      0.41      0.25      1066

[[ 10 625]
 [  2 429]]


In [None]:
# model_save_name = 'RoBERTaLarge_TFIDFV2P_cmv_step2.pt'
# path = F"/content/gdrive/My Drive/Colab Notebooks/{model_save_name}" 
# torch.save(model.state_dict(), path)

In [61]:
#Getting model predictions on test dataset
firstSeq_test, yHat_test, predProbs_test, yTest_test = get_predictions(
  model,
  testDataLoader
)



In [62]:
#Printing classification report for test dataset (Evaluating the model on test set)
print(classification_report(yTest_test, yHat_test, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

           0       0.59      0.98      0.74      1252
           1       0.35      0.02      0.03       852

    accuracy                           0.59      2104
   macro avg       0.47      0.50      0.39      2104
weighted avg       0.50      0.59      0.45      2104



In [63]:
print(confusion_matrix(yTest_test, yHat_test))

[[1226   26]
 [ 838   14]]
