In [None]:
import torch

# Pick the compute device once: fall back to the CPU when PyTorch cannot see a CUDA GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Route tensors and the model to the GPU and report which card is used.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Release cached GPU memory that PyTorch is holding but not using.
# (No autograd work happens here, so no no_grad() context is needed around the call.)
torch.cuda.empty_cache()

In [None]:
!pip install sentencepiece
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive/ (google.colab is only importable inside a Colab runtime).
# NOTE(review): the dataset cells read from a local Windows path while a later cell loads a
# model from /content/drive/... — confirm which environment this notebook targets.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import numpy as np
import pandas as pd

In [None]:
pip install openpyxl

In [None]:
# Directory holding the pre-split, augmented review spreadsheets.
# Change this single value to run the notebook on another machine.
DATA_DIR = "C:/Users/MSI/Desktop/BFRD/Final/nlpaug"


def _read_split(split):
    """Load one split ('train', 'val' or 'test') of the augmented dataset as a DataFrame."""
    return pd.read_excel(f"{DATA_DIR}/nlpaug_0augmented_{split}.xlsx", engine='openpyxl')


traind = _read_split("train")
vald = _read_split("val")
testd = _read_split("test")

In [None]:
# Preview the first rows of the training split.
traind.head()

In [None]:
# Pull the review texts and their labels out of every split as NumPy arrays.
train_sentences, train_labels = traind["Review"].values, traind["Label"].values
val_sentences, val_labels = vald["Review"].values, vald["Label"].values
test_sentences, test_labels = testd["Review"].values, testd["Label"].values

# 3. Tokenization & Input Formatting

In this section, we'll transform our dataset into the format that BERT can be trained on.

In [None]:
from transformers import BertTokenizer, AutoTokenizer

# Load the tokenizer published with the 'neuropark/sahajBERT' checkpoint.
# AutoTokenizer resolves the concrete tokenizer class from the checkpoint itself.
print('Loading BERT tokenizer...')
tokenizer = AutoTokenizer.from_pretrained('neuropark/sahajBERT')

In [None]:
# Show one training example at each stage: raw text, sub-word tokens, vocabulary ids.
first_sentence = train_sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

print(' Original: ', first_sentence)
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

In [None]:
# Length, in tokens, of the longest training example once the special tokens are added
# (0 when there are no training sentences).
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in train_sentences),
    default=0,
)

print('Max sentence length: ', max_len)

In [None]:
def encode_sentences(sentences, max_length=512):
    """Tokenize a collection of texts into fixed-length model inputs.

    Args:
        sentences: iterable of raw text strings.
        max_length: every sequence is truncated and padded to exactly this many
            tokens, special tokens included.

    Returns:
        (input_ids, attention_mask): two tensors, one row per sentence and
        `max_length` columns each.
    """
    encoded = tokenizer(
        list(sentences),
        add_special_tokens=True,       # Add the model's special tokens.
        max_length=max_length,
        padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
        truncation=True,               # Make the truncation to max_length explicit.
        return_attention_mask=True,    # 1 for real tokens, 0 for padding.
        return_tensors='pt',           # Return PyTorch tensors.
    )
    return encoded['input_ids'], encoded['attention_mask']


# Tokenize each split with the same settings and turn its labels into a tensor.
train_input_ids, train_attention_masks = encode_sentences(train_sentences)
train_labels = torch.tensor(train_labels)

val_input_ids, val_attention_masks = encode_sentences(val_sentences)
val_labels = torch.tensor(val_labels)

test_input_ids, test_attention_masks = encode_sentences(test_sentences)
test_labels = torch.tensor(test_labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', test_sentences[0])
print('Token IDs:', test_input_ids[0])

## 3.4. Building the Training, Validation & Test Datasets


In [None]:
pip install -U scikit-learn scipy matplotlib

In [None]:
from torch.utils.data import TensorDataset, random_split
from sklearn.model_selection import train_test_split
       
# Bundle (input_ids, attention_mask, label) per split so a DataLoader can batch them together.
# The tensors are passed straight in; a round trip through NumPy would hand back the same data.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Peek at the first five training examples.
train_dataset[0:5]

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 2

# Training batches are drawn in random order; validation and test batches keep the
# dataset order so predictions stay aligned with the source rows.
# NOTE: drop_last=True discards a trailing incomplete batch, so with an odd number of
# examples the final one is skipped in every split.
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size,
            drop_last = True,
        )
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size,
            drop_last = True,
        )
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = batch_size,
            drop_last = True,
        )

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig, AutoModel,AutoModelForPreTraining

# Imported here so this cell brings the classification auto-class into scope on its own.
from transformers import AutoModelForSequenceClassification

# Load the pretrained sahajBERT encoder with a sequence-classification head on top.
# The bare AutoModel has no such head: its outputs carry neither `.logits` nor `.loss`,
# both of which the training and evaluation loops read.
model = AutoModelForSequenceClassification.from_pretrained(
    "neuropark/sahajBERT",
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device selected at the top of the notebook (GPU if present,
# otherwise CPU) instead of unconditionally requiring CUDA.
model.to(device)

In [None]:
# Materialise the (name, tensor) pairs of every model parameter.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Heading -> slice of the parameter list printed under it.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, group in sections:
    print(heading)
    for name, tensor in group:
        # Left-aligned name, right-aligned shape tuple.
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

In [None]:
# Use PyTorch's own AdamW: the AdamW class exported by transformers is deprecated.
# weight_decay=0.0 keeps the default of the transformers implementation
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 5e-5,
    eps = 1e-8,
    weight_decay = 0.0,
)


In [None]:
from transformers import get_linear_schedule_with_warmup
# Number of passes over the training data.
epochs = 4

# One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps = total_steps,
    num_warmup_steps = 0, # Default value in run_glue.py
)

In [None]:
import numpy as np
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the true label.

    Args:
        preds: array of shape (n_examples, n_classes) holding per-class scores.
        labels: the n_examples true class ids, as an array or sequence of any
            shape (it is flattened before comparison).

    Returns:
        The accuracy as a float between 0 and 1.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    # np.asarray lets plain Python lists be passed as labels too.
    labels_flat = np.asarray(labels).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
import time
import datetime
def format_time(elapsed):
    """Render a duration given in seconds as an h:mm:ss string, rounded to the nearest second."""
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)


In [None]:
# Per-epoch metrics; the training loop appends one dict per epoch.
# Re-running this cell clears the history.
training_stats = []

In [None]:
pip install normalizer

In [None]:
import random
import numpy as np
seed_val = 42

# Seed every random number generator in use so that repeated runs are comparable.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

total_t0 = time.time()

# NOTE(review): the loop needs `model` to be a sequence-classification model, i.e. one
# whose forward pass accepts `labels` and returns `.loss` and `.logits`.
for epoch_i in range(0, epochs):
    # ---------------- Training pass ----------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress line every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input_ids, attention_mask, labels); move it to the model's device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model compute the loss itself; without them
        # `result.loss` is not populated and the backward pass cannot run.
        result = model(b_input_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ---------------- Validation pass ----------------
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(b_input_ids,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)
        loss = result.loss
        logits = result.logits
        total_eval_loss += loss.item()
        # Accuracy is computed with NumPy on the CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Both averages are taken over the number of validation batches.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
total_test_accuracy = 0
logits_l = []   # logits of every test example, in dataset order
y_true_l = []   # matching true labels

# Inference only: switch to evaluation mode and time the pass from here.
model.eval()
t0 = time.time()
for step, batch in enumerate(test_dataloader):
    # Progress line every 40 batches, counted against the test loader itself.
    if step % 40 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

    logits = result.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    logits_l.extend(logits)
    y_true_l.extend(label_ids)
    total_test_accuracy += flat_accuracy(logits, label_ids)

# Mean of the per-batch accuracies.
avg_test_accuracy = total_test_accuracy / len(test_dataloader)
print("  Accuracy: {0:.2f}".format(avg_test_accuracy))

In [None]:
import pandas as pd
# Per-epoch training history as a table, indexed by epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

df_stats

In [None]:
# Stack the labels and logits collected over the test set, then take the highest-scoring class per row.
y_true = np.asarray(y_true_l)
classes_xy = np.argmax(np.asarray(logits_l), axis=1).flatten()

In [None]:
from collections import Counter
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
    roc_auc_score,
)

# ROC-AUC is a ranking metric, so it gets a continuous score for the positive class
# rather than the already-thresholded predictions. The logit margin (class 1 minus
# class 0) orders the examples exactly as the softmax probability of class 1 does.
test_logits = np.asarray(logits_l)
positive_scores = test_logits[:, 1] - test_logits[:, 0]
x = roc_auc_score(y_true, positive_scores)

mcc = matthews_corrcoef(y_true, classes_xy)

# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs in the test set,
# so the four cells can always be indexed.
conf = confusion_matrix(y_true, classes_xy, labels=[0, 1])
cr = classification_report(y_true, classes_xy, digits=4, output_dict=True)

In [None]:
# Single-row summary of the test-set metrics: ROC-AUC, MCC, the four confusion-matrix
# cells (row = true class, column = predicted class) and the full classification report.
training_stats_report = [
    {
        'roc-auc': x,
        'mcc': mcc,
        'conf_0_0': conf[0][0],
        'conf_0_1': conf[0][1],
        'conf_1_0': conf[1][0],
        'conf_1_1': conf[1][1],
        'cr': cr,
    }
]

In [None]:
# Run identifier and model tag; both are embedded in the output file and directory names.
# NOTE(review): `id` shadows the Python builtin of the same name for the rest of the session.
id=4
modelname="sagar"

In [None]:
# Directory the result files are written to; change this single value to relocate them.
RESULTS_DIR = "C:/Users/MSI/Desktop/BFRD/Final/nlpaug"
# Suffix shared by every file of this run: run id followed by the model tag.
run_tag = str(id) + str(modelname)

df = pd.DataFrame(training_stats_report)
df2 = pd.DataFrame(training_stats)
# `logits_l` holds the logits of the whole test set; `logits` alone is only the last batch.
df3 = pd.DataFrame(np.asarray(logits_l))
df4 = pd.DataFrame(classes_xy, columns=['Pred'])
df4['True'] = y_true
df5 = pd.DataFrame(cr).transpose()

# saving the dataframes
for prefix, frame in (
    ("performance", df),
    ("history", df2),
    ("probab", df3),
    ("predict", df4),
    ("cr", df5),
):
    frame.to_csv(f"{RESULTS_DIR}/{prefix}{run_tag}.csv")

In [None]:
import os
# Target folder, named after the model tag and the run id.
output_dir = './'+modelname+str(id)+'model_save/'

# Create the folder on first use.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# A wrapped model (distributed/parallel training) exposes the real model as `.module`.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
# Replace `model` and `tokenizer` with a checkpoint stored on the mounted Google Drive.
# NOTE(review): this is not the directory the model was saved to earlier in the notebook,
# and it uses the Bert* classes rather than the Auto* classes used for 'neuropark/sahajBERT' —
# confirm this is the checkpoint that should be explained below.
model = BertForSequenceClassification.from_pretrained('/content/drive/My Drive/Data712/BFModeltesting/')
tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Data712/BFModeltesting/')

In [None]:
!pip install lime 

In [None]:
import numpy as np
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

# Display names for the two classes.
# NOTE(review): assumes label 0 means 'Fake' and label 1 means 'Real' — confirm against the dataset.
class_names = ['Fake','Real']


def predictor(texts):
    """Return class probabilities for a batch of texts, in the form LIME expects.

    Args:
        texts: sequence of raw strings (LIME passes its perturbed samples here).

    Returns:
        NumPy array of shape (len(texts), n_classes); every row sums to 1.
    """
    encoded = tokenizer(
        list(texts),
        return_tensors="pt",
        padding=True,
        truncation=True,   # keep over-long inputs within the model's sequence limit
        max_length=512,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**encoded)
    # Normalise over the class axis explicitly; the implicit-dim form of softmax is deprecated.
    return F.softmax(outputs.logits, dim=1).detach().numpy()

In [None]:
# Explain a single prediction with LIME, splitting the text into features on spaces.
explainer = LimeTextExplainer(class_names=class_names, split_expression=' ')

# Example review to explain.
str_to_predict = 'আচ্ছা বাম চোখ লাফালে কি বিপদ আসে সত্যিই আমার তো লাফাচ্ছিল'
# `predictor` feeds the model tensors straight from the tokenizer without moving them to a
# device, so the model is moved to the CPU first.
model.cpu()
# Ask for up to 40 feature weights, estimated from 2000 perturbed samples of the text.
exp = explainer.explain_instance(str_to_predict, predictor, num_features=40, num_samples=2000)
exp.show_in_notebook(text=str_to_predict)