In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!nvidia-smi

In [None]:
!pip install scikit-multilearn

In [None]:
!pip install transformers

In [None]:
import spacy
import string
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.metrics import classification_report
import requests
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
import sklearn.metrics as skm
from sklearn import metrics
import torch
import torch.nn as nn
import numpy as np
import shutil
import sys   
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
nlp=spacy.load('en_core_web_sm')

In [None]:
nlp.pipe_names

In [None]:
def read_convert_data(api_url):
  """Download a JSON dataset preview and return its records as a DataFrame.

  The reply must contain a ``"rows"`` list. Inside every entry of that list,
  each dict-valued field is treated as one record; non-dict fields (for
  example integers or lists) are skipped.

  Args:
    api_url: URL that is fetched with an HTTP GET request.

  Returns:
    pandas.DataFrame with the columns ``dialog_idx``, ``response``,
    ``original_response``, ``history``, ``knowledge``, ``BEGIN`` and ``VRM``
    (always in that order), one row per record. A field that is absent from
    a record is stored as ``None`` so that rows never shift against each
    other.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  columns = ["dialog_idx", "response", "original_response", "history",
             "knowledge", "BEGIN", "VRM"]
  # Fail fast on a hung connection or an error page instead of parsing garbage.
  reply = requests.get(api_url, timeout=30)
  reply.raise_for_status()
  payload = reply.json()

  records = []
  for entry in payload["rows"]:
    for value in entry.values():
      if isinstance(value, dict):
        # Look fields up by name: the column label can then never disagree
        # with the value, whatever the key order of the JSON object is.
        records.append({name: value.get(name) for name in columns})
  return pd.DataFrame(records, columns=columns)

In [None]:
def clean_data(data):
    """Normalise every entry of ``data`` in place and return ``data``.

    Each entry is converted to text, lower-cased, tokenised with the
    module-level spaCy pipeline ``nlp``, stripped of stop words and
    punctuation, lemmatised and re-joined with single spaces.

    Args:
        data: pandas Series of texts. Entries that are not strings (for
            example the float NaN pandas uses for missing values) are
            converted with ``str()`` first, so NaN becomes ``"nan"``.

    Returns:
        The same Series object, with every entry overwritten by its cleaned
        text.
    """
    for position in range(len(data)):
        text = data.iloc[position]
        # isinstance also covers ints, None and NumPy scalars, which would
        # otherwise fail on .lower().
        if not isinstance(text, str):
            text = str(text)
        doc = nlp(text.lower())
        lemmas = [
            token.lemma_
            for token in doc
            if not token.is_stop and not token.is_punct
        ]
        data.iloc[position] = " ".join(lemmas)
    return data
   

In [None]:
def label_data(new_df, columns):
  """Append one indicator column per label found in ``new_df[columns]``.

  The selected column is fed to a MultiLabelBinarizer; the resulting matrix
  is wrapped in a frame that shares the index of ``new_df`` and is named
  after the binarizer's classes, then concatenated to the right of
  ``new_df``.

  Args:
    new_df: input DataFrame.
    columns: name of the column holding the label collections.

  Returns:
    A new DataFrame: ``new_df`` followed by the indicator columns.
  """
  binarizer = MultiLabelBinarizer()
  indicator_matrix = binarizer.fit_transform(new_df[columns].to_numpy())
  indicator_frame = pd.DataFrame(
      data=indicator_matrix,
      index=new_df.index,
      columns=binarizer.classes_,
  )
  return pd.concat([new_df, indicator_frame], axis=1)

In [None]:
def remove_duplicates(history):
  """Flatten a collection of sequences and drop repeated items.

  Args:
    history: iterable whose elements are themselves iterable.

  Returns:
    A list with every distinct inner item, in order of first appearance.
  """
  unique_items = []
  for turns in history:
    for turn in turns:
      if turn in unique_items:
        continue
      unique_items.append(turn)
  return unique_items

In [None]:
def generate_seeker(dataset):
  """Return a copy of ``dataset`` with a ``seeker`` column added.

  For every row the seeker text is the last entry of that row's ``history``.
  Deriving it per row keeps the column aligned with the row it belongs to;
  pooling all utterances and concatenating them by position shifts every
  later row as soon as one utterance is repeated, and leaves the tail NaN.

  NOTE(review): assumes the final ``history`` entry is the seeker utterance
  that ``response`` replies to — confirm against the dataset description.

  Args:
    dataset: DataFrame with a ``history`` column of utterance sequences.

  Returns:
    A new DataFrame equal to ``dataset`` plus a trailing ``seeker`` column.
    Rows whose history is empty or not a sequence get NaN.
  """
  def _last_turn(turns):
    # One-line purpose: newest utterance of a history, NaN when unavailable.
    if isinstance(turns, str) or not hasattr(turns, "__len__") or len(turns) == 0:
      return float("nan")
    return turns[-1]

  seeker = [_last_turn(turns) for turns in dataset["history"]]
  return dataset.assign(seeker=seeker)

In [None]:
# Build the training frame: download the preview rows, add the seeker column,
# then one-hot encode the BEGIN and VRM label columns.
# NOTE(review): the names say "train" but the URL requests split=validation — confirm this is intended.
trainUrl = read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=validation")
train_seeker = generate_seeker(trainUrl)
final_train_1 = label_data(train_seeker, "BEGIN")
final_train = label_data(final_train_1, "VRM")
final_train[:3]

In [None]:
# Build the test frame from the preview rows of the test split, using the
# same steps as for the training frame (seeker column, BEGIN and VRM indicators).
testUrl=read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=test")
test_seeker = generate_seeker(testUrl)
final_test_1 = label_data(test_seeker, "BEGIN")
final_test = label_data(final_test_1, "VRM")
final_test[:3]

In [None]:
# Clean the three text columns of the training frame. clean_data overwrites
# the entries of the Series it receives, so the return values are not kept.
clean_data(final_train["knowledge"])
clean_data(final_train["response"])
clean_data(final_train["seeker"])

In [None]:
# Clean the three text columns of the test frame. clean_data overwrites
# the entries of the Series it receives, so the return values are not kept.
clean_data(final_test["knowledge"])
clean_data(final_test["response"])
clean_data(final_test["seeker"])

In [None]:
# Features are the three text columns; targets are the four BEGIN indicator columns.
X_train = final_train[["knowledge", "seeker", "response"]]
X_test = final_test[["knowledge", "seeker", "response"]]
y_train = final_train[["Entailment","Generic","Hallucination","Uncooperative"]]
y_test = final_test[["Entailment","Generic","Hallucination","Uncooperative"]]

In [None]:
# Baseline pipeline: one TF-IDF vectoriser per text column, combined by a
# ColumnTransformer, followed by a BinaryRelevance wrapper around MultinomialNB.
model = BinaryRelevance(MultinomialNB())
k_vect = TfidfVectorizer()
s_vect = TfidfVectorizer()
r_vect = TfidfVectorizer()
c_transform = ColumnTransformer([('tfidf_k', k_vect, 'knowledge'),('tfidf_s', s_vect, 'seeker'),('tfidf_r', r_vect, 'response')], remainder='passthrough')
pipe = Pipeline([('tfidf', c_transform),('classify', model)])
# Fit on the training frame and predict the label matrix for the test frame.
pipe.fit(X_train,y_train)
res = pipe.predict(X_test)

In [None]:
# Build the per-label report once, with readable label names, and print that
# one; a second, unnamed report would only show column positions 0-3.
label_names = ["Entailment","Generic","Hallucination","Uncooperative"]
report = classification_report(y_test, res, target_names=label_names)
# One 2x2 confusion matrix per label, kept for later inspection.
cnf_matrix = skm.multilabel_confusion_matrix(y_test, res)
print(report)

In [None]:
import torch
# Fixed seed for PyTorch's random number generator.
SEED = 1234

torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# BERT IMPLEMENTATION
---


In [None]:
# Rebuild final_train from the downloaded rows; the text columns of the
# previous final_train were overwritten by the cleaning cells above.
# NOTE(review): final_test is not rebuilt here, so its text columns stay
# cleaned while final_train is raw again — confirm this asymmetry is intended.
trainUrl = read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=validation")
train_seeker = generate_seeker(trainUrl)
final_train_1 = label_data(train_seeker, "BEGIN")
final_train = label_data(final_train_1, "VRM")
final_train[:3]

In [None]:
# Names of the indicator columns the classifier is trained to predict.
target_list = [
    'Entailment',
    'Generic',
    'Hallucination',
    'Uncooperative',
]

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# CONTEXT is "<knowledge>. <seeker>. <response>".
# A missing part is NaN, and string + NaN evaluates to NaN, which would wipe
# the whole CONTEXT of that row; missing parts are therefore treated as empty text.
for frame in (final_train, final_test):
    text_parts = frame[["knowledge", "seeker", "response"]].fillna("").astype(str)
    frame["CONTEXT"] = text_parts["knowledge"] + ". " + text_parts["seeker"] + ". " + text_parts["response"]


In [None]:
# Take real copies: plain assignment only creates a second name for the same
# frame, so later in-place edits would also change the "copy".
final_train_copy = final_train.copy()
final_test_copy = final_test.copy()

In [None]:
# Drop the raw text and VRM columns that the classifier does not use.
# Rebinding to a new frame (instead of inplace=True) keeps the cell
# re-runnable, and errors="ignore" tolerates columns that are already gone
# or labels that do not occur in the downloaded rows.
final_train = final_train.drop(
    columns=["dialog_idx","response","original_response","history","knowledge","BEGIN","VRM","seeker", "Ack.",
             "Advisement", "Disclosure", "Edification", "Question"],
    errors="ignore",
)

In [None]:
final_train.columns

In [None]:
# Keep the model input text followed by the four target indicators, in this order.
final_train = final_train.loc[
    :,
    ['CONTEXT', 'Entailment', 'Generic', 'Hallucination', 'Uncooperative'],
]

In [None]:
# Hyper-parameters of the BERT classifier run.
MAX_LEN = 256            # passed to CustomDataset as max_len (tokeniser max_length)
TRAIN_BATCH_SIZE = 32    # batch size of train_data_loader
VALID_BATCH_SIZE = 32    # batch size of val_data_loader
EPOCHS = 4               # number of epochs passed to train_model
LEARNING_RATE = 1e-05    # learning rate of the Adam optimizer

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Torch dataset that tokenises the ``CONTEXT`` column of a DataFrame.

    Args:
        df: DataFrame with a ``CONTEXT`` text column and the target columns.
        tokenizer: object exposing ``encode_plus`` as used in ``__getitem__``.
        max_len: value forwarded to the tokeniser as ``max_length``.
        target_columns: names of the label columns; when omitted, the
            module-level ``target_list`` is used.
    """

    def __init__(self, df, tokenizer, max_len, target_columns=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['CONTEXT']
        if target_columns is None:
            target_columns = target_list
        self.targets = self.df[list(target_columns)].values
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the tokenised text and float targets of row ``index``.

        ``index`` is positional. The text is read with ``.iloc`` so it stays
        in step with ``self.targets`` (a plain array) even when the frame
        does not carry a 0..n-1 index.
        """
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace into a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # flatten() removes the leading batch dimension added by return_tensors='pt'.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
     


In [None]:
# Wrap both frames in CustomDataset with the shared tokenizer and MAX_LEN.
train_ds = CustomDataset(final_train, tokenizer, MAX_LEN)
test_ds = CustomDataset(final_test, tokenizer, MAX_LEN)

In [None]:
# Training batches are reshuffled every epoch.
train_data_loader = torch.utils.data.DataLoader(train_ds, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# The "validation" loader iterates over test_ds in a fixed order.
val_data_loader = torch.utils.data.DataLoader(test_ds, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
train_data_loader

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location="cpu"):
    """Restore a model and optimizer from a checkpoint file.

    Args:
        checkpoint_fpath: path of the checkpoint file to read.
        model: model that receives the stored ``state_dict``.
        optimizer: optimizer that receives the stored optimizer state.
        map_location: forwarded to ``torch.load``; the default ``"cpu"``
            lets a checkpoint written on a GPU be read on a machine
            without one.

    Returns:
        Tuple ``(model, optimizer, epoch, valid_loss_min)`` where ``epoch``
        is the stored epoch value and ``valid_loss_min`` is a Python float.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # The stored loss may be a plain float or a 0-d tensor; float() accepts
    # both, whereas .item() fails on a plain float.
    valid_loss_min = float(checkpoint['valid_loss_min'])
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint and optionally duplicate it as the best model.

    Args:
        state: object to serialise with ``torch.save``.
        is_best: when true, the written file is also copied to
            ``best_model_path``.
        checkpoint_path: destination of the checkpoint file.
        best_model_path: destination of the copy made for the best model.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best model is a byte-for-byte copy of the file written above.
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear layer.

    Args:
        num_labels: number of output logits (default 4).
        dropout: dropout probability applied to the pooled output (default 0.3).
        pretrained_name: checkpoint name passed to ``BertModel.from_pretrained``
            (default ``'bert-base-uncased'``).
    """

    def __init__(self, num_labels=4, dropout=0.3, pretrained_name='bert-base-uncased'):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Read the width from the loaded encoder so other checkpoints fit too.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the raw logits computed from the encoder's pooled output."""
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

# Instantiate the classifier and move its parameters to the selected device.
model = BERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    """Return the binary cross-entropy between raw logits and targets.

    Args:
        outputs: logits produced by the model.
        targets: float tensor of the same shape holding the labels.

    Returns:
        Scalar tensor computed by ``torch.nn.BCEWithLogitsLoss`` with its
        default settings.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    loss = criterion(outputs, targets)
    return loss

# Adam over all parameters of the classifier (encoder and linear head alike).
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# Module-level buffers that train_model extends with the targets and the
# sigmoid outputs of every validation batch.
val_targets=[]
val_outputs=[]

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    """Train ``model`` for ``n_epochs`` and validate after every epoch.

    After each epoch a checkpoint is written through ``save_ckp``; it is also
    stored as the best model whenever the validation loss is not larger than
    the smallest one seen so far. The module-level lists ``val_targets`` and
    ``val_outputs`` are emptied at the start of every validation pass, so
    after the call they hold the targets and sigmoid outputs of the last
    epoch only.

    Args:
        n_epochs: number of passes over ``training_loader``.
        training_loader: iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets``).
        validation_loader: iterable of batches with the same keys.
        model: network called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer stepping the parameters of ``model``.
        checkpoint_path: file that receives the checkpoint of every epoch.
        best_model_path: file that receives the best checkpoint.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
    valid_loss_min = np.inf
    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Incremental mean: after the last batch this already IS the
            # average loss of the epoch, so it must not be divided again.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()
        # Start from empty buffers so predictions of earlier epochs do not pile up.
        val_targets.clear()
        val_outputs.clear()

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids = data['input_ids'].to(device, dtype = torch.long)
                mask = data['attention_mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        is_best = valid_loss <= valid_loss_min
        if is_best:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            valid_loss_min = valid_loss
        # A single call writes the checkpoint and, when is_best, also copies it.
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)

        print('############# Epoch {}  Done   #############\n'.format(epoch))

    return model

In [None]:
# Files used by train_model: the checkpoint of the latest epoch and the best model so far.
ckpt_path = "/content/curr_ckpt"
best_model_path = "/content/best_model.pt"

In [None]:
# Fine-tune the classifier; train_model returns the same model object it was given.
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer,ckpt_path, best_model_path)

In [None]:
model.eval()

In [None]:
# Download the validation-split preview once more and display rows 10-15 with their labels.
# NOTE(review): this reassigns final_train_1, which the baseline and BERT sections also use.
text_Url = read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=validation")
train_seeker_1 = generate_seeker(text_Url)
final_train_1 = label_data(train_seeker_1, "BEGIN")
final_TEST = label_data(final_train_1, "VRM")
final_TEST[10:16]

In [None]:
def validation():
    """Run the global ``model`` over ``val_data_loader`` without gradients.

    Returns:
        Tuple ``(outputs, targets)`` of nested Python lists: the sigmoid of
        the model output and the target rows, one inner list per sample, in
        loader order.
    """
    model.eval()
    collected_outputs = []
    collected_targets = []
    with torch.no_grad():
        for batch in val_data_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)
            collected_targets += targets.cpu().detach().numpy().tolist()
            collected_outputs += probabilities.cpu().detach().numpy().tolist()
    return collected_outputs, collected_targets

In [None]:
# Score the classifier on val_data_loader: a label counts as predicted when
# its sigmoid output is at least 0.5.
outputs, targets = validation()
outputs = np.array(outputs) >= 0.5
# accuracy_score on a label matrix only counts rows whose labels all match.
accuracy = skm.accuracy_score(targets, outputs)
f1_score_micro = skm.f1_score(targets, outputs, average='micro')
f1_score_macro = skm.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

# **Text Generation**

In [None]:
# !pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import AutoTokenizer

tokenizer_qa = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Fresh copy of the labelled frame for the question-answering section.
# NOTE(review): named "train" but downloaded with split=validation — confirm this is intended.
trainUrl1 = read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=validation")
train_seeker1 = generate_seeker(trainUrl1)
final_train_11 = label_data(train_seeker1, "BEGIN")
final_train1 = label_data(final_train_11, "VRM")
final_train1[:3]

In [None]:
# Fresh copy of the labelled test-split frame for the question-answering section.
testUrl1 =read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=test")
test_seeker1 = generate_seeker(testUrl1)
final_test_11 = label_data(test_seeker1, "BEGIN")
final_test1 = label_data(final_test_11, "VRM")
final_test1[:3]

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Convert both frames to datasets.Dataset objects (despite the *_df names)
# and group them in a DatasetDict under the keys 'train' and 'test'.
train_df = Dataset.from_pandas(final_train1)
test_df = Dataset.from_pandas(final_test1)

full_data = DatasetDict()
full_data['train'] = train_df
full_data['test'] = test_df

In [None]:
def preprocess_function(examples):
    """Tokenise (seeker, knowledge) pairs and attach answer-span labels.

    The seeker text is the first sequence and the knowledge text the second;
    only the knowledge is truncated. The answer span is taken to start at
    character 0 of the knowledge and to cover ``len(response)`` characters.

    NOTE(review): the span is never located inside the knowledge text, it is
    always anchored at character 0 — confirm that this labelling is intended.

    Args:
        examples: batch mapping with the list-valued keys ``seeker``,
            ``knowledge`` and ``response``.

    Returns:
        The tokeniser output without ``offset_mapping`` and with
        ``start_positions`` / ``end_positions`` added. Both are token
        indices, or 0 when the span does not lie fully inside the
        (possibly truncated) knowledge tokens or there are none.
    """
    # A missing seeker arrives as None/NaN, which has no .strip().
    seekers = [q.strip() if isinstance(q, str) else "" for q in examples["seeker"]]
    # tokenizer_qa matches the DistilBERT checkpoint that is fine-tuned below.
    inputs = tokenizer_qa(
        seekers,
        examples["knowledge"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    responses = examples["response"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        response = responses[i] or ""
        start_char = 0
        end_char = len(response)
        sequence_ids = inputs.sequence_ids(i)

        # Token indices that belong to the second sequence (the knowledge).
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_tokens:
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_tokens[0]
        context_end = context_tokens[-1]

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Apply preprocess_function batch-wise and drop all original columns
# (the column names are taken from the 'train' split).
tokenized_squad = full_data.map(preprocess_function, batched=True, remove_columns=full_data["train"].column_names)

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# NOTE(review): this rebinds `model`, which until here referred to the BERT
# classifier; validation() reads the global `model` and must not be called after this cell.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
# Training configuration: evaluate after every epoch and push the result to the Hub.
training_args = TrainingArguments(
    output_dir="my_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

# Hand over tokenizer_qa, the tokenizer of the DistilBERT checkpoint being
# fine-tuned, so that the tokenizer saved with the model matches it
# (`tokenizer` is the BERT tokenizer of the classifier section).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer_qa,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model("my_qa_model")

In [None]:
from transformers import create_optimizer

batch_size = 16
num_epochs = 2
# Number of optimizer steps: full batches per epoch times the number of epochs.
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
# NOTE(review): this rebinds `optimizer`, which until here was the Adam optimizer of the BERT classifier.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
from transformers import TFAutoModelForQuestionAnswering

# NOTE(review): rebinds `model` again, replacing the model just trained with Trainer by a fresh TensorFlow model.
model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
# Training batches are shuffled.
# NOTE(review): batch_size is 4 here, while total_train_steps above was computed with batch_size = 16 — confirm.
tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
)

# Validation batches keep their order.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["test"],
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)

In [None]:
import tensorflow as tf

# Compile with the optimizer returned by create_optimizer; no loss argument is passed.
model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Store tokenizer_qa, the tokenizer of the DistilBERT checkpoint, next to the
# model (`tokenizer` is the BERT tokenizer of the classifier section).
callback = PushToHubCallback(
    output_dir="my_awesome_qa_model",
    tokenizer=tokenizer_qa,
)

In [None]:
# Pass the PushToHubCallback: without it nothing is written to
# "my_awesome_qa_model", the directory the inference cells load from.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=[callback])

In [None]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [None]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
question_answerer(question=question, context=context)

In [None]:
from transformers import AutoModelForQuestionAnswering

# Tokenise the question/context pair first: `inputs` was never defined at
# notebook level, so on a fresh kernel the model call failed with a NameError.
inputs = tokenizer_qa(question, context, return_tensors="pt")

model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [None]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)