In [None]:
import re
from bs4 import BeautifulSoup

# Lookup table used by preprocess() to expand English contractions.
# Built once at import time instead of on every call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Matches any single non-word character. Written as a raw string: in a plain
# literal '\W' is an invalid escape sequence that newer Python versions warn about.
_NON_WORD_RE = re.compile(r'\W')


def preprocess(q):
    """Normalise a piece of free text into lowercase, punctuation-free words.

    Steps, in order:
      1. Convert the input to ``str``, lowercase it and strip outer whitespace.
      2. Spell out a few symbols (``%``, ``$``, ``₹``, ``€``, ``@``).
      3. Drop the literal ``[math]`` marker.
      4. Abbreviate thousands / millions / billions as ``k`` / ``m`` / ``b``.
      5. Expand English contractions (whole-word lookup, then suffix fallbacks).
      6. Strip HTML tags with BeautifulSoup.
      7. Replace every non-word character with a space and strip the result.

    Args:
        q: Any object; it is passed through ``str()`` first, so non-string
            values are processed as their string representation.

    Returns:
        str: The cleaned text.
    """
    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done
    # better to account for more cases). The comma-grouped forms only match when
    # followed by a space; the regexes then handle un-grouped digit runs.
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words: exact whole-token matches first ...
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())
    # ... then generic suffix fallbacks for tokens the table did not catch
    # (e.g. a contraction with punctuation attached).
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags. The parser is named explicitly: without it bs4 picks
    # whichever parser happens to be installed (so results can vary between
    # environments) and emits a warning on every call.
    q = BeautifulSoup(q, "html.parser").get_text()

    # Remove punctuations
    q = _NON_WORD_RE.sub(' ', q).strip()

    return q
    

In [None]:
# Install the `transformers` package (provides `pipeline`, imported below) and
# `sentencepiece`; `-q` keeps pip's output quiet.
! pip install transformers -q
! pip install sentencepiece -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline used below to score each (text, reason) pair.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Alternative checkpoints for the same task, left disabled. To switch, replace
# the `model=` value above with one of these identifiers.
# The "!" marks are carried over from the original notes; their meaning is not
# recorded here (presumably a preference ranking; TODO confirm).
#
#              cross-encoder/nli-MiniLM2-L6-H768
#              joeddav/xlm-roberta-large-xnli
#   !!! !!!!!!! cross-encoder/nli-deberta-v3-large
#              cross-encoder/nli-deberta-v3-base
#   !!!        cross-encoder/nli-deberta-v3-small
#   !!!!       valhalla/distilbart-mnli-12-1
#              typeform/roberta-large-mnli
#              valhalla/distilbart-mnli-12-9

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
import pandas as pd

def _load_split(path):
    """Read one Excel split and normalise its columns.

    The `reason` and `text` columns are passed through `preprocess`, and each
    `label` value is converted with `int`.
    """
    frame = pd.read_excel(path)
    for column in ("reason", "text"):
        frame[column] = frame[column].apply(preprocess)
    frame["label"] = frame["label"].apply(lambda value: int(value))
    return frame


# Training split.
df = _load_split("/content/train.xlsx")

# Evaluation split.
df2 = _load_split("/content/evaluation.xlsx")

In [None]:
# Show the preprocessed training frame as the cell output for a quick visual check.
df

In [None]:
import time
import matplotlib.pyplot as plt

In [None]:
# Score (text, reason) pairs from the training frame with the zero-shot classifier.
#
# The original cap of 2061 rows is kept, but as a named constant that is clamped
# to the real frame length: with a shorter frame the bare `range(2061)` ran past
# the end and `.iloc` raised an out-of-bounds IndexError. This also matches the
# evaluation loop, which iterates over the actual length of its frame.
MAX_TRAIN_ROWS = 2061
n_train_rows = min(MAX_TRAIN_ROWS, len(df))

res = []
st = time.time()
for i in range(n_train_rows) :
  print(i)  # progress indicator: index of the row being scored
  sentence = df['text'].iloc[i]
  personal_attributes = df['reason'].iloc[i]
  # The row's `reason` is passed as the candidate label(s); keep the first
  # entry of the returned 'scores' list.
  res.append(classifier(sentence, personal_attributes)['scores'][0])
# Elapsed wall-clock seconds, displayed as the cell output.
time.time() - st

In [None]:
# Twenty consecutive (lower, upper) score buckets, each 0.05 wide, spanning 0.00 to 1.00.
interval = [(start / 100, (start + 5) / 100) for start in range(0, 100, 5)]

In [None]:
def group_values(values, intervals) :
    """Count how many values fall inside each interval.

    A value ``v`` belongs to the first interval ``(low, high)`` in `intervals`
    for which ``low < v <= high`` (lower bound exclusive, upper bound
    inclusive). Values that match no interval are ignored.

    Args:
        values: Iterable of comparable numbers.
        intervals: Iterable of indexable, hashable ``(low, high)`` pairs.

    Returns:
        dict: Maps each interval that received at least one value to its
        count; keys appear in order of first match.
    """
    counts = {}
    for value in values:
        # First interval containing the value, or None when nothing matches.
        bucket = next(
            (bounds for bounds in intervals if bounds[0] < value <= bounds[1]),
            None,
        )
        if bucket is None:
            continue
        counts[bucket] = counts.get(bucket, 0) + 1
    return counts

In [None]:
# Histogram of the training scores: bucket -> number of scores inside it.
ma = group_values(values=res, intervals=interval)

In [None]:
# Re-key the histogram by each bucket's upper bound expressed as a whole
# percentage (e.g. an upper bound of 0.15 becomes 15).
# round() is used instead of int(): a product such as 0.29 * 100 evaluates to
# 28.999999999999996, which int() truncates to 28, mislabelling the bucket and
# potentially overwriting the neighbouring bucket's count.
upd = {round(bounds[1] * 100): count for bounds, count in ma.items()}

In [None]:
# Inspect the percentage-keyed histogram (bucket upper bound in % -> sample count).
upd

In [None]:
# Mean of the scores collected in `res` (raises ZeroDivisionError if `res` is empty).
sum(res)/len(res)

In [None]:
# Bar chart: one bar per score bucket (x = bucket upper bound in percent,
# height = number of samples whose score fell in that bucket).
bucket_labels = list(upd)
bucket_counts = [upd[label] for label in bucket_labels]

fig = plt.figure(figsize=(5, 5))
plt.bar(bucket_labels, bucket_counts, width=4, color='red')

plt.xlabel('Similarity percentage')
plt.ylabel('No of samples')

In [None]:
# Score every evaluation row with the zero-shot classifier: the row's `reason`
# is passed as the candidate label(s) for its `text`, and the first entry of the
# returned 'scores' list is collected.
result = []
started_at = time.time()
for row_number, (text, label_text) in enumerate(zip(df2['text'], df2['reason'])):
  print(row_number)  # progress indicator
  outcome = classifier(text, label_text)
  result.append(outcome['scores'][0])
# Elapsed wall-clock seconds, displayed as the cell output.
time.time() - started_at

In [None]:
# Decision cut-off: evaluation scores >= this value are mapped to 1, the rest to 0.
threshold = 0.8

In [None]:
# Binarise the evaluation scores in place: 1 when score >= threshold, else 0.
# Note: this overwrites the raw scores held in `result`, so re-running this cell
# with a different threshold has no effect until the scores are recomputed.
result[:] = [1 if score >= threshold else 0 for score in result]

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, classification_report, roc_auc_score, accuracy_score, balanced_accuracy_score

In [None]:
# Compare the thresholded predictions with the ground-truth evaluation labels.
yy = df2['label']   # true labels
yp = result         # predicted labels (0/1 once thresholding has run)

# (caption, metric function) pairs, printed in this order.
scalar_metrics = (
    ('accuracy  : ', accuracy_score),
    ('recall    : ', recall_score),
    ('precision : ', precision_score),
    ('f1 score  : ', f1_score),
    ('roc accur : ', roc_auc_score),
)
for caption, metric in scalar_metrics:
    print(caption, metric(yy, yp))

print('confusion : \n', confusion_matrix(yy, yp))
print('\n')