In [None]:
!pip install scikit-multilearn datasets transformers --quiet

In [None]:
import spacy
import string
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.metrics import classification_report
import requests
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
import sklearn.metrics as skm
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Prefer the CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the spaCy pipeline packaged as 'en_core_web_sm'. clean_data() calls this
# module-level `nlp` object to tokenise text and to read each token's stop-word
# flag, punctuation flag and lemma.
nlp=spacy.load('en_core_web_sm')

In [None]:
# Show the names of the components in the loaded spaCy pipeline.
nlp.pipe_names

In [None]:
def read_convert_data(api_url):
  """Download a JSON dataset preview from ``api_url`` and return it as a DataFrame.

  The payload must contain a ``"rows"`` list. Every entry of that list is a
  mapping; each dict-valued item inside an entry is taken as one record whose
  keys become the column names. Non-dict items of an entry (integers, lists,
  ...) are skipped.

  Args:
    api_url: URL that answers with the JSON payload described above.

  Returns:
    pandas.DataFrame with one row per record. Columns appear in the order
    their keys are first seen; a key missing from a record yields NaN there.

  Raises:
    requests.HTTPError: the server answered with an error status.
    requests.Timeout: no answer arrived within 30 seconds.
    KeyError: the payload has no ``"rows"`` entry.
  """
  reply = requests.get(api_url, timeout=30)
  # Fail loudly on 4xx/5xx instead of trying to parse an error body.
  reply.raise_for_status()
  payload = reply.json()

  records = [
      value
      for entry in payload["rows"]
      for value in entry.values()
      if isinstance(value, dict)
  ]
  # Building the frame from the records keys every value by its own field
  # name, so a column can never be labelled with another field's header,
  # whatever order the keys arrive in.
  return pd.DataFrame(records)

In [None]:
def clean_data(data):
  """Normalise every text entry of ``data`` in place and return ``data``.

  Each entry is lower-cased, tokenised with the module-level spaCy pipeline
  ``nlp``, stripped of stop words and punctuation, lemmatised, and re-joined
  with single spaces.

  Args:
    data: pandas Series of texts. Entries that are not strings (for example a
      float NaN) are converted with ``str()`` first, so a NaN is processed as
      the text "nan".

  Returns:
    The same Series object that was passed in, with every entry overwritten
    by its cleaned text.
  """
  for position in range(len(data)):
    value = data.iloc[position]
    # Anything that is not already text has no .lower(); stringify it first.
    if not isinstance(value, str):
      value = str(value)
    doc = nlp(value.lower())
    # Keep the lemma of every token that is neither a stop word nor punctuation.
    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    data.iloc[position] = " ".join(lemmas)
  return data
   

In [None]:
def label_data(new_df, columns, classes=None):
  """Append one 0/1 indicator column per label found in ``new_df[columns]``.

  Args:
    new_df: DataFrame holding the label column.
    columns: name of the column to binarise; each cell is passed to
      ``MultiLabelBinarizer`` as the collection of labels for that row.
    classes: optional explicit list of labels (and their column order). When
      omitted, the labels are inferred from the data, so two frames processed
      separately may end up with different indicator columns; pass the same
      list for both to keep them identical.

  Returns:
    A new DataFrame: ``new_df`` followed by the indicator columns, which are
    named after the labels.
  """
  mlb = MultiLabelBinarizer(classes=classes)
  indicators = mlb.fit_transform(new_df[columns].to_numpy())
  # Reuse the source index so the indicator rows line up in the concat below.
  df_ohe = pd.DataFrame(indicators, index=new_df.index, columns=mlb.classes_)
  return pd.concat([new_df, df_ohe], axis=1)

In [None]:
def remove_duplicates(history):
  """Flatten ``history`` by one level and drop repeated items.

  Args:
    history: iterable of iterables (for example one list of utterances per row).

  Returns:
    A list with every distinct item of the nested iterables, in the order in
    which each item is first seen.
  """
  flattened = [item for row in history for item in row]
  try:
    # dicts preserve insertion order, giving a linear-time ordered dedupe.
    return list(dict.fromkeys(flattened))
  except TypeError:
    # Unhashable items (e.g. lists) cannot be dict keys; compare by equality.
    unique = []
    for item in flattened:
      if item not in unique:
        unique.append(item)
    return unique

In [None]:
def generate_seeker(dataset):
  """Return ``dataset`` with an extra ``seeker`` column.

  The seeker utterances are every distinct item found in the ``history``
  lists (in first-seen order) that does not also occur in the ``response``
  column.

  NOTE(review): the resulting list is attached by index alignment only - its
  i-th element lands on the row labelled i. Nothing here ties an utterance to
  the history of the row it lands on, and when the list is shorter than the
  frame the remaining rows get NaN. Confirm this pairing is what is intended.

  Args:
    dataset: DataFrame with a ``response`` column and a ``history`` column
      whose cells are iterables of utterances.

  Returns:
    A new DataFrame: ``dataset`` plus the ``seeker`` column.
  """
  # A set gives constant-time membership instead of scanning every response
  # for every utterance.
  responses = set(dataset["response"])
  utterances = remove_duplicates(dataset["history"].to_numpy())
  seeker = [utterance for utterance in utterances if utterance not in responses]
  seeker_df = pd.DataFrame({'seeker': seeker})
  return pd.concat([dataset, seeker_df], axis=1)

In [None]:
# Build the training frame. Despite its name, `trainUrl` holds the downloaded
# DataFrame, not a URL.
# NOTE(review): the URL requests split=validation from the "first-rows"
# endpoint - confirm that using the validation split (and presumably only a
# preview of it) as training data is intended.
trainUrl = read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=validation")
# Add the "seeker" column derived from the history and response columns.
train_seeker = generate_seeker(trainUrl)
# Append one indicator column per BEGIN label, then one per VRM label.
final_train_1 = label_data(train_seeker, "BEGIN")
final_train = label_data(final_train_1, "VRM")

In [None]:
# Build the test frame from split=test with the same steps as the training
# frame. Despite its name, `testUrl` holds the downloaded DataFrame, not a URL.
testUrl=read_convert_data("https://datasets-server.huggingface.co/first-rows?dataset=McGill-NLP%2FFaithDial&config=plain_text&split=test")
# Add the "seeker" column derived from the history and response columns.
test_seeker = generate_seeker(testUrl)
# Append one indicator column per BEGIN label, then one per VRM label.
# The labels are inferred from this frame alone, so the indicator columns
# match the training frame only if both contain the same labels.
final_test_1 = label_data(test_seeker, "BEGIN")
final_test = label_data(final_test_1, "VRM")

In [None]:
# Clean the three text columns of the training frame. The result is assigned
# back explicitly: relying on clean_data() mutating the Series returned by
# final_train[...] does not update the frame when pandas Copy-on-Write is on.
for text_column in ("knowledge", "response", "seeker"):
  final_train[text_column] = clean_data(final_train[text_column])

In [None]:
# Clean the three text columns of the test frame. The result is assigned
# back explicitly: relying on clean_data() mutating the Series returned by
# final_test[...] does not update the frame when pandas Copy-on-Write is on.
for text_column in ("knowledge", "response", "seeker"):
  final_test[text_column] = clean_data(final_test[text_column])

In [None]:
# Preview the first rows of the cleaned training frame rather than rendering
# the whole frame.
final_train.head()

# BEGIN Classifier

In [None]:
# Features: the three text columns. Targets: the BEGIN indicator columns.
# The same column selection is applied to the training and the test frame.
X_train, X_test = (
    frame[["knowledge", "seeker", "response"]]
    for frame in (final_train, final_test)
)
y_train, y_test = (
    frame[["Entailment","Generic","Hallucination","Uncooperative"]]
    for frame in (final_train, final_test)
)

In [None]:
# Naive Bayes pipeline: each text column gets its own TF-IDF vectorizer, and
# BinaryRelevance wraps MultinomialNB for the multi-label target.
k_vect, s_vect, r_vect = TfidfVectorizer(), TfidfVectorizer(), TfidfVectorizer()
c_transform = ColumnTransformer(
    transformers=[
        ('tfidf_k', k_vect, 'knowledge'),
        ('tfidf_s', s_vect, 'seeker'),
        ('tfidf_r', r_vect, 'response'),
    ],
    remainder='passthrough',
)
modelNb = BinaryRelevance(MultinomialNB())
pipe = Pipeline(steps=[('tfidf', c_transform), ('classify', modelNb)])
# Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained.
res = pipe.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the Naive Bayes predictions on the BEGIN labels.
report = classification_report(y_test , res, target_names=["Entailment","Generic","Hallucination","Uncooperative"])
cnf_matrix = skm.multilabel_confusion_matrix(y_test, res)
# Print the report that carries the label names instead of an unlabelled copy.
print(report)

In [None]:
# Linear SVM pipeline: each text column gets its own TF-IDF vectorizer, and
# BinaryRelevance wraps LinearSVC for the multi-label target.
k_vect, s_vect, r_vect = TfidfVectorizer(), TfidfVectorizer(), TfidfVectorizer()
c_transform = ColumnTransformer(
    transformers=[
        ('tfidf_k', k_vect, 'knowledge'),
        ('tfidf_s', s_vect, 'seeker'),
        ('tfidf_r', r_vect, 'response'),
    ],
    remainder='passthrough',
)
modelSVM = BinaryRelevance(LinearSVC(random_state=42))
pipe_1 = Pipeline(steps=[('tfidf', c_transform), ('classify', modelSVM)])
# Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained.
res_1 = pipe_1.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the SVM predictions (res_1) on the BEGIN labels. The report must be
# built from res_1; `res` holds the Naive Bayes predictions.
report = classification_report(y_test , res_1, target_names=["Entailment","Generic","Hallucination","Uncooperative"])
cnf_matrix = skm.multilabel_confusion_matrix(y_test, res_1)
# Print the report that carries the label names instead of an unlabelled copy.
print(report)

# VRM Classifier

In [None]:
# Features: the three text columns. Targets: the VRM indicator columns.
# The same column selection is applied to the training and the test frame.
X_train, X_test = (
    frame[["knowledge", "seeker", "response"]]
    for frame in (final_train, final_test)
)
y_train, y_test = (
    frame[["Ack.","Advisement","Disclosure","Edification","Question"]]
    for frame in (final_train, final_test)
)

In [None]:
# Naive Bayes pipeline for the VRM labels: each text column gets its own
# TF-IDF vectorizer, and BinaryRelevance wraps MultinomialNB.
k_vect, s_vect, r_vect = TfidfVectorizer(), TfidfVectorizer(), TfidfVectorizer()
c_transform = ColumnTransformer(
    transformers=[
        ('tfidf_k', k_vect, 'knowledge'),
        ('tfidf_s', s_vect, 'seeker'),
        ('tfidf_r', r_vect, 'response'),
    ],
    remainder='passthrough',
)
modelNb = BinaryRelevance(MultinomialNB())
pipe = Pipeline(steps=[('tfidf', c_transform), ('classify', modelNb)])
# Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained.
res = pipe.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the Naive Bayes predictions on the VRM labels.
report = classification_report(y_test , res, target_names=["Ack.","Advisement","Disclosure","Edification","Question"])
cnf_matrix = skm.multilabel_confusion_matrix(y_test, res)
# Print the report that carries the label names instead of an unlabelled copy.
print(report)

In [None]:
# Linear SVM pipeline for the VRM labels: each text column gets its own
# TF-IDF vectorizer, and BinaryRelevance wraps LinearSVC.
k_vect, s_vect, r_vect = TfidfVectorizer(), TfidfVectorizer(), TfidfVectorizer()
c_transform = ColumnTransformer(
    transformers=[
        ('tfidf_k', k_vect, 'knowledge'),
        ('tfidf_s', s_vect, 'seeker'),
        ('tfidf_r', r_vect, 'response'),
    ],
    remainder='passthrough',
)
modelSVM = BinaryRelevance(LinearSVC(random_state=42))
pipe_1 = Pipeline(steps=[('tfidf', c_transform), ('classify', modelSVM)])
# Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained.
res_1 = pipe_1.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the SVM predictions (res_1) on the VRM labels. The report must be
# built from res_1; `res` holds the Naive Bayes predictions.
report = classification_report(y_test , res_1, target_names=["Ack.","Advisement","Disclosure","Edification","Question"])
cnf_matrix = skm.multilabel_confusion_matrix(y_test, res_1)
# Print the report that carries the label names instead of an unlabelled copy.
print(report)