In [1]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, Doc2Vec
from nltk.tokenize import word_tokenize
import nltk

from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

fallacy_model = AutoModelForSequenceClassification.from_pretrained("q3fer/distilbert-base-fallacy-classification")
tokenizer = AutoTokenizer.from_pretrained("q3fer/distilbert-base-fallacy-classification")


def get_fallacy_score(text):
    inputs = tokenizer(text, return_tensors='pt')

    with torch.no_grad():
      logits = fallacy_model(**inputs)
      scores = logits[0][0]
      scores = torch.nn.Softmax(dim=0)(scores)

      _, ranking = torch.topk(scores, k=scores.shape[0])
      ranking = ranking.tolist()
    
    
    if float(max(scores)) < 0.5:
        return 0
    else: 
        score = (float(max(scores)) - 0.5) / 0.5
    return score

In [3]:
def sentence_vector(sentence, model):
    words = word_tokenize(sentence.lower())
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return None
    
def calculate_cosine_similarity(vec1, vec2):
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]

def perform_sentiment_analysis(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    return sentiment_scores

def normalize(list_):
    return [(e+1)/2 for e in list_]
    

In [4]:
df = pd.read_csv('QA.csv')
df.dropna(inplace=True)
corpus = list(df['instruction']) + list(df['response'])
tokenized = [simple_preprocess(sentence) for sentence in corpus]

In [5]:
model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)

In [6]:
vectorized = [sentence_vector(sentence, model) for sentence in corpus]

In [7]:
questions = vectorized[:int(len(vectorized)/2)]
answers = vectorized[int(len(vectorized)/2):]
corpus_a = corpus[int(len(corpus)/2):]

In [8]:
for i in range(len(answers)-1, 0, -1):
    if answers[i] is None:
        del answers[i]
        del questions[i]
        del corpus_a[i]
        df = df.drop(i)
df.reset_index(inplace=True, drop=True)

In [9]:
sim = []
sent = []
fall = []

for q, a, a_og in zip(questions, answers, corpus_a):
    s = calculate_cosine_similarity(q, a)
    sim.append(s)
    
    st = perform_sentiment_analysis(a_og)
    sent.append(st['compound'])
    
    try:
        f = get_fallacy_score(a_og)
    except:
        f = 999
    fall.append(1-f)
    


Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


In [10]:
df['Relation'] = normalize(sim)
df['Manner'] = normalize(sent)
df['Quality'] = fall

In [13]:
df[df['Quality']==-998]
#responses are too long and it crashed!

Unnamed: 0,instruction,response,Relation,Manner,Quality
67,What is the FA Cup?,"The Football Association Challenge Cup, more c...",0.997826,0.99355,-998.0
1232,What is the successor liability and can you gi...,"In law, successor liability is liability (debt...",0.997938,0.35975,-998.0
1531,What are the key steps for building a an in gr...,Pre-construction site meeting - Before startin...,0.996525,0.9809,-998.0
1533,What is genome analysis?,The genome sequence of an organism includes th...,0.996873,0.7895,-998.0
1968,Name some famous poets and their poems.,Edgar Allen Poe\nBirthplace: Boston\nFamous po...,0.999073,0.99955,-998.0
2127,What is the plot of the movie John Wick?,"John Wick is grieving the death of his wife, H...",0.991997,0.0012,-998.0
2136,Who is the greatest artist of all time?,Michael Jackson\nMichael Jackson (Michael Jose...,0.99759,0.99685,-998.0
2537,What is the history of the Masters Golf Tourna...,The Masters was started by amateur champion Bo...,0.994003,0.9987,-998.0
2647,What is a mitochondrion?,A mitochondrion is an organelle found in the c...,0.994363,0.9287,-998.0
3023,What are the advantages and disadvantages of l...,Java and Python are two of the most popular pr...,0.992072,0.9997,-998.0


In [26]:
print ("Responses without fallacies:", len(df[df['Quality']==1]))
print ("Responses with fallacies:", len(df[(df['Quality']!=1) & (df['Quality']!=-998)]))

Responses without fallacies: 1820
Responses with fallacies: 1868


In [27]:
df[(df['Quality']!=1) & (df['Quality']!=-998)]

Unnamed: 0,instruction,response,Relation,Manner,Quality
1,"Alice's parents have three daughters: Amy, Jes...",The name of the third daughter is Alice,0.996900,0.50000,0.608241
3,What is a polygon?,A polygon is a form in Geometry. It is a sing...,0.995580,0.53860,0.524350
4,Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na...",0.999511,0.67000,0.988467
7,What happens when the sun goes down?,"When the sun sets, the evening starts.",0.997888,0.50000,0.400300
8,What is a verb?,A verb is an action word that describes an act...,0.989093,0.54285,0.379987
...,...,...,...,...,...
3693,What seven houses ruled the Seven Kingdoms bef...,"Stark, Tully, Arryn, Lannister, Tyrell, Barath...",0.994542,0.50000,0.298511
3697,Who is the author of Harry Potter?,J.K. Rowling is the British author of the seve...,0.999797,0.50000,0.715441
3699,"What does troll the respawn, Jeremy mean?",Trolling the respawn refers to when in video g...,0.996649,0.16475,0.734557
3702,Who is the creator of Python?,Guido van Rossum is the father of Python. And ...,0.999793,0.50000,0.423567


In [28]:
df[df['Relation'] < 0.5]
#Single-words responses are more or less seen as neutral

Unnamed: 0,instruction,response,Relation,Manner,Quality
13,Who saved Andromeda from the sea monster,Perseus,0.39646,0.5,1.0
103,What dog breed is a mix between a Cocker Spani...,Cockapoo,0.49049,0.5,1.0
224,What is Bart Simpson's best friend named?,Millhouse,0.456857,0.5,0.952592
359,What kind of dog breed do you get when you mat...,sheepadoodle,0.465489,0.5,1.0
360,Which country is alphabetically last,Zimbabwe,0.47522,0.5,0.994472
565,What are the India’s Foreign Exchange Reserves...,$545.65 bn,0.490747,0.5,0.496157
613,Kaka means parrot in which language,Maori,0.429675,0.5,1.0
881,Which African country was founded by Americans,Liberia,0.438918,0.5,1.0
1258,Marburg Virus Disease belongs to the same fami...,Ebola,0.405654,0.5,0.794926
1270,Which author wrote the ‘Winnie-the-Pooh’ books?,A. A. Milne,0.429594,0.5,1.0
