In [1]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, Doc2Vec
from nltk.tokenize import word_tokenize
import nltk

from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

fallacy_model = AutoModelForSequenceClassification.from_pretrained("q3fer/distilbert-base-fallacy-classification")
tokenizer = AutoTokenizer.from_pretrained("q3fer/distilbert-base-fallacy-classification")


def get_fallacy_score(text):
    inputs = tokenizer(text, return_tensors='pt')

    with torch.no_grad():
      logits = fallacy_model(**inputs)
      scores = logits[0][0]
      scores = torch.nn.Softmax(dim=0)(scores)

      _, ranking = torch.topk(scores, k=scores.shape[0])
      ranking = ranking.tolist()
    
    
    if float(max(scores)) < 0.5:
        return 0
    else: 
        score = (float(max(scores)) - 0.5) / 0.5
    return score

In [11]:
def sentence_vector(sentence, model):
    words = word_tokenize(sentence.lower())
    vectors = [model.wv[word] for word in words if word in model.wv] #vectors of words

    if vectors: #sentence vector ==> average of words vectors of that sentence
        return sum(vectors) / len(vectors)
    else:
        return None
    
def calculate_cosine_similarity(vec1, vec2):
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]

def perform_sentiment_analysis(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    return sentiment_scores

def normalize(list_):
    return [(e+1)/2 for e in list_]
    

In [12]:
df = pd.read_csv('QA.csv')
df.dropna(inplace=True)
corpus = list(df['instruction']) + list(df['response'])
tokenized = [simple_preprocess(sentence) for sentence in corpus]

In [13]:
model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)

In [9]:
vectorized = [sentence_vector(sentence, model) for sentence in corpus]

In [17]:
len(vectorized[0])

8

In [18]:
corpus[0]

'Why can camels survive for long without water?'

In [7]:
questions = vectorized[:int(len(vectorized)/2)]
answers = vectorized[int(len(vectorized)/2):]
corpus_a = corpus[int(len(corpus)/2):]

In [8]:
for i in range(len(answers)-1, 0, -1):
    if answers[i] is None:
        del answers[i]
        del questions[i]
        del corpus_a[i]
        df = df.drop(i)
df.reset_index(inplace=True, drop=True)

In [9]:
sim = []
sent = []
fall = []

for q, a, a_og in zip(questions, answers, corpus_a):
    s = calculate_cosine_similarity(q, a)
    sim.append(s)
    
    st = perform_sentiment_analysis(a_og)
    sent.append(st['compound'])
    
    try:
        f = get_fallacy_score(a_og)
    except:
        f = 999
    fall.append(1-f)
    


Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


In [29]:
df['Relation'] = normalize(sim)
df['Manner'] = normalize(sent)
df['Quality'] = fall
df

Unnamed: 0,instruction,response,Relation,Manner,Quality
0,Why can camels survive for long without water?,Camels use the fat in their humps to keep them...,0.991368,0.63660,1.000000
1,"Alice's parents have three daughters: Amy, Jes...",The name of the third daughter is Alice,0.996900,0.50000,0.608241
2,Who gave the UN the land in NY to build their HQ,John D Rockerfeller,0.998289,0.50000,1.000000
3,What is a polygon?,A polygon is a form in Geometry. It is a sing...,0.995580,0.53860,0.524350
4,Which episodes of season four of Game of Thron...,"She directed ""Oathkeeper"" and ""First of His Na...",0.999511,0.67000,0.988467
...,...,...,...,...,...
3699,"What does troll the respawn, Jeremy mean?",Trolling the respawn refers to when in video g...,0.996649,0.16475,0.734557
3700,Who won the World Chess Championship in 2021?,Magnus Carlsen defeated Ian Nepomniachtchi 7.5...,0.995049,0.60115,1.000000
3701,Why is it a good idea to walk every day?,Walking is a good exercise for burning calorie...,0.999820,0.94070,1.000000
3702,Who is the creator of Python?,Guido van Rossum is the father of Python. And ...,0.999793,0.50000,0.423567


In [44]:
df[df['Relation'] < 0.5]
#Single-words responses are more or less seen as neutral

Unnamed: 0,instruction,response,Relation,Manner,Quality
13,Who saved Andromeda from the sea monster,Perseus,0.39646,0.5,1.0
103,What dog breed is a mix between a Cocker Spani...,Cockapoo,0.49049,0.5,1.0
224,What is Bart Simpson's best friend named?,Millhouse,0.456857,0.5,0.952592
359,What kind of dog breed do you get when you mat...,sheepadoodle,0.465489,0.5,1.0
360,Which country is alphabetically last,Zimbabwe,0.47522,0.5,0.994472
565,What are the India’s Foreign Exchange Reserves...,$545.65 bn,0.490747,0.5,0.496157
613,Kaka means parrot in which language,Maori,0.429675,0.5,1.0
881,Which African country was founded by Americans,Liberia,0.438918,0.5,1.0
1258,Marburg Virus Disease belongs to the same fami...,Ebola,0.405654,0.5,0.794926
1270,Which author wrote the ‘Winnie-the-Pooh’ books?,A. A. Milne,0.429594,0.5,1.0


In [13]:
df[df['Quality']==-998]
#responses are too long and it crashed!

Unnamed: 0,instruction,response,Relation,Manner,Quality
67,What is the FA Cup?,"The Football Association Challenge Cup, more c...",0.997826,0.99355,-998.0
1232,What is the successor liability and can you gi...,"In law, successor liability is liability (debt...",0.997938,0.35975,-998.0
1531,What are the key steps for building a an in gr...,Pre-construction site meeting - Before startin...,0.996525,0.9809,-998.0
1533,What is genome analysis?,The genome sequence of an organism includes th...,0.996873,0.7895,-998.0
1968,Name some famous poets and their poems.,Edgar Allen Poe\nBirthplace: Boston\nFamous po...,0.999073,0.99955,-998.0
2127,What is the plot of the movie John Wick?,"John Wick is grieving the death of his wife, H...",0.991997,0.0012,-998.0
2136,Who is the greatest artist of all time?,Michael Jackson\nMichael Jackson (Michael Jose...,0.99759,0.99685,-998.0
2537,What is the history of the Masters Golf Tourna...,The Masters was started by amateur champion Bo...,0.994003,0.9987,-998.0
2647,What is a mitochondrion?,A mitochondrion is an organelle found in the c...,0.994363,0.9287,-998.0
3023,What are the advantages and disadvantages of l...,Java and Python are two of the most popular pr...,0.992072,0.9997,-998.0


In [26]:
print ("Responses without fallacies:", len(df[df['Quality']==1]))
print ("Responses with fallacies:", len(df[(df['Quality']!=1) & (df['Quality']!=-998)]))

Responses without fallacies: 1820
Responses with fallacies: 1868


In [43]:
df[(df['Quality']!=1) & (df['Quality']!=-998)].sort_values(by=['Quality'])[:10]

Unnamed: 0,instruction,response,Relation,Manner,Quality
2447,What coloring of cat is almost always female?,Calico cats are almost always female.,0.986115,0.5,0.02371
2809,Why are dogs better than cats for a person's p...,Dogs keep their owners more active by needing ...,0.999736,0.8658,0.033793
1318,What's the easiest way to make friends when tr...,Traveling alone to anywhere in the world can b...,0.997967,0.99825,0.03416
159,"What family do tigers, lions and panthers all ...","Tigers, lions and panthers are all a part of t...",0.990352,0.5,0.038841
1220,International Booker Prize 2021 was given to,At night all blood is black,0.998662,0.5,0.040264
1700,Pultizer Prize 2022 is given which field,"Journalism, Books, Drama and Music",0.998981,0.5,0.041935
3092,What coloring of cat is almost always male?,Orange cats are generally male.,0.98628,0.5,0.042171
1768,Do Harry Potter and Hermione get married?,No. Hermione Granger ends up marrying Ron Weas...,0.995478,0.5,0.043267
701,Why is the World Cup the best sporting competi...,The World Cup is a global event that happens o...,0.997173,0.8404,0.04634
2378,How do you ride a bike?,You can learn to ride a bike by starting with ...,0.985265,0.91125,0.04859


In [None]:
# sent2vec
# rivedere modelli
# 

# Experiments with NER

In [78]:
a, b, c, d, e, f = str("a"), str("b"), str("c"), str("d"), str("e"), str("f")
g, h, i = 1, 1, 1

In [90]:
words = [a, b, c, d, e, f]
stops = [g, h, i]
NER = [a, f]

#weight = len(words) / len(NER) #NO! Se non ci sono NER è impossibile
boost1 = 1 - (len(NER) / len(words))
boost2 = (len(NER) / len(words))

In [91]:
#boost normale
real_w = 0
stop_w = len(stops)

for word in words:
    if word in NER:
        real_w += 1 + boost1
    else:
        real_w += 1

real_w / (real_w + stop_w)

0.7096774193548387

In [92]:
#boost ???
real_w = 0
stop_w = len(stops)

for word in words:
    if word in NER:
        real_w += 1 + boost2
    else:
        real_w += 1

real_w / (real_w + stop_w)

0.689655172413793