In [1]:
import pandas as pd
import numpy as np

In [6]:
# Load the ranked article-retrieval results and discard the index column
# that was written out when the CSV was saved.
df = pd.read_csv('questions_ranking.csv').drop(columns=['Unnamed: 0'])
print('original dataframe')
display(df.head())

original dataframe


Unnamed: 0,question_id,question,rank,article_id,article,score
0,0,"Who is the author of the memoir ""Nicotine""?",1,17552,"The best cigarette you will ever smoke, Gregor...",0.085027
1,0,"Who is the author of the memoir ""Nicotine""?",2,17689,American novelists have long complained about ...,0.001403
2,0,"Who is the author of the memoir ""Nicotine""?",3,17552,while smoking and dreamt of cigarettes of rela...,0.000924
3,1,What skills are inmates learning in the innova...,1,17382,"LECCE, Italy One of his first students was a y...",0.985759
4,1,What skills are inmates learning in the innova...,2,17382,"Do remember, even if you had Trump over for di...",0.962276


# Load the Model

In [4]:
from transformers import pipeline
# Checkpoint of the extractive question-answering model.
# Replace this with your own checkpoint if you have fine-tuned one.
model_checkpoint = "huggingface-course/bert-finetuned-squad"

# Build the question-answering pipeline once; it is reused for every row below.
question_answerer = pipeline(task="question-answering", model=model_checkpoint)

In [7]:
# Run the QA model on every (question, article) pair in the dataframe.
# Each result is a dict from which the 'score', 'answer', 'start' and 'end'
# entries are read below.
snippet = [
    question_answerer(question=question_text, context=article_text)
    for question_text, article_text in zip(df['question'], df['article'])
]

# Split the result dicts into one list per field.
score = [result['score'] for result in snippet]
answer = [result['answer'] for result in snippet]
start_idx = [result['start'] for result in snippet]
end_idx = [result['end'] for result in snippet]

In [8]:
# Attach the QA outputs to the dataframe, one column per field.
# NOTE: 'score' already exists in df (loaded from the CSV), so this
# assignment replaces those values with the QA model's scores.
qa_columns = {
    'snippet': snippet,
    'score': score,
    'answer': answer,
    'start_idx': start_idx,
    'end_idx': end_idx,
}
for column_name, column_values in qa_columns.items():
    df[column_name] = column_values

In [9]:
def get_sorounding_words(article, start_pos, end_pos, num_words=5):
    """Return the answer span padded with surrounding words from the article.

    Args:
        article: Full text the answer span was taken from.
        start_pos: Character offset in ``article`` where the span starts.
        end_pos: Character offset in ``article`` where the span ends
            (exclusive, i.e. used as a slice bound).
        num_words: Number of whitespace-delimited words of context to keep
            before and after the span.

    Returns:
        The selected words joined by single spaces.
    """
    words = article.split()

    prefix = article[:start_pos]
    # Number of whitespace-delimited tokens in front of the span.
    s_pos = len(prefix.split())

    # When the span starts in the middle of a token (e.g. right after an
    # opening quote), the trailing fragment of the prefix belongs to the
    # span's own first word and must not be counted as a preceding word.
    # Otherwise the left context is one word short, and with num_words=0
    # the answer itself is dropped.
    starts_mid_word = (
        bool(prefix)
        and not prefix[-1].isspace()
        and start_pos < len(article)
        and not article[start_pos].isspace()
    )
    if starts_mid_word:
        s_pos -= 1

    # Index just past the last word touched by the span.
    e_pos = len(article[:end_pos].split())

    return ' '.join(words[max(0, s_pos - num_words):e_pos + num_words])

In [10]:
# Expand every extracted answer with 5 words of context on each side.
longer_answer = [
    get_sorounding_words(article_text, span_start, span_end, 5)
    for article_text, span_start, span_end in zip(df['article'], df['start_idx'], df['end_idx'])
]

df['longer_answer'] = longer_answer

In [11]:
# Preview the first rows, now including the QA columns added above.
df.head()

Unnamed: 0,question_id,question,rank,article_id,article,score,snippet,answer,start_idx,end_idx,longer_answer
0,0,"Who is the author of the memoir ""Nicotine""?",1,17552,"The best cigarette you will ever smoke, Gregor...",0.990425,"{'score': 0.9904252290725708, 'start': 40, 'en...",Gregor Hens,40,51,"cigarette you will ever smoke, Gregor Hens wri..."
1,0,"Who is the author of the memoir ""Nicotine""?",2,17689,American novelists have long complained about ...,0.467833,"{'score': 0.4678330421447754, 'start': 325, 'e...",Charles Van Doren,325,342,"The figure Roth cites is Charles Van Doren, of..."
2,0,"Who is the author of the memoir ""Nicotine""?",3,17552,while smoking and dreamt of cigarettes of rela...,0.051056,"{'score': 0.05105575546622276, 'start': 2123, ...",Karl Ove Knausgaard,2123,2142,mind of the Norwegian writer Karl Ove Knausgaa...
3,1,What skills are inmates learning in the innova...,1,17382,"LECCE, Italy One of his first students was a y...",0.312302,"{'score': 0.3123021423816681, 'start': 780, 'e...",professional skills,780,799,program to teach prisoners new professional sk...
4,1,What skills are inmates learning in the innova...,2,17382,"Do remember, even if you had Trump over for di...",0.722732,"{'score': 0.7227321863174438, 'start': 1016, '...",waiters and cooks,1016,1033,near Milan in which the waiters and cooks are ...


The table above shows the answer snippet extracted from each retrieved article chunk.

# Evaluation

In [12]:
import re

def clean_text(text):
    """Normalize text for comparison.

    Lowercases the input, replaces every character that is neither a word
    character nor whitespace with a space, collapses runs of whitespace into
    a single space, and strips leading/trailing whitespace.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()


In [13]:
# Normalize the predicted answers so they can be compared with the labeled ones.
df['answer_cleaned'] = df['answer'].apply(clean_text)

In [14]:
# Load the labeled question/answer set and expose the original row position
# as 'question_id'.
df_labeled = (
    pd.read_excel('test_qa.xlsx')
    .reset_index()
    .rename(columns={'index': 'question_id'})
)

# Drop rows whose answer is the placeholder '-'.
# .copy() makes the filtered frame independent, so the columns added to it in
# later cells do not trigger SettingWithCopyWarning.
df_labeled = df_labeled.loc[df_labeled.answer != '-'].copy()
df_labeled.head()

Unnamed: 0,question_id,id,topic,article,question,answer,paragraph
0,0,17552,lifestyle,"The best cigarette you will ever smoke, Gregor...","Who is the author of the memoir ""Nicotine""?",Gregor Hens.,1.0
1,1,17382,business,"LECCE, Italy One of his first students was a y...",What skills are inmates learning in the innova...,"Inmates are learning how to taste, choose, and...",3.0
2,2,17547,entertainment,A man who admitted changing the Hollywood sign...,Who surrendered to the authorities for changin...,Zachary Cole Fernandez surrendered to the auth...,1.0
3,3,17778,entertainment,A magician walks into a laboratory. It's not t...,What is the name of the journalist and archivi...,Mike Best is the journalist and archivist who ...,5.0
4,4,17841,politics,WASHINGTON A spate of violence erupted on Frid...,Which leader of the fringe movement embracing ...,"Richard B. Spencer, a leader of the fringe mov...",1.0


In [15]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorize answers for the similarity-based evaluation.
# NOTE(review): the checkpoint name says "dot-prod" while the evaluation below
# scores with cosine similarity — confirm this pairing is intended.
vectorizer = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [16]:
# Normalize the reference answers the same way as the predictions.
df_labeled['answer_cleaned'] = df_labeled['answer'].apply(clean_text)

# Embed each cleaned reference answer on its own (a one-element batch per row).
df_labeled['answer_vector'] = df_labeled['answer_cleaned'].apply(
    lambda cleaned: vectorizer.encode([cleaned])
)
df_labeled.head()

Unnamed: 0,question_id,id,topic,article,question,answer,paragraph,answer_cleaned,answer_vector
0,0,17552,lifestyle,"The best cigarette you will ever smoke, Gregor...","Who is the author of the memoir ""Nicotine""?",Gregor Hens.,1.0,gregor hens,"[[-0.5458462, 0.28288853, 0.20445457, -0.33087..."
1,1,17382,business,"LECCE, Italy One of his first students was a y...",What skills are inmates learning in the innova...,"Inmates are learning how to taste, choose, and...",3.0,inmates are learning how to taste choose and s...,"[[-0.06997983, 0.50586116, -0.070998415, 0.007..."
2,2,17547,entertainment,A man who admitted changing the Hollywood sign...,Who surrendered to the authorities for changin...,Zachary Cole Fernandez surrendered to the auth...,1.0,zachary cole fernandez surrendered to the auth...,"[[0.064143814, -0.1929121, -0.14826077, 0.1730..."
3,3,17778,entertainment,A magician walks into a laboratory. It's not t...,What is the name of the journalist and archivi...,Mike Best is the journalist and archivist who ...,5.0,mike best is the journalist and archivist who ...,"[[0.026869029, -0.22804636, -0.52153367, -0.36..."
4,4,17841,politics,WASHINGTON A spate of violence erupted on Frid...,Which leader of the fringe movement embracing ...,"Richard B. Spencer, a leader of the fringe mov...",1.0,richard b spencer a leader of the fringe movem...,"[[0.06477255, -0.020493872, -0.07857357, -0.53..."


In [17]:
# Keep only the rank-1 row of every question; copy so new columns can be
# added without touching df.
is_top_ranked = df['rank'] == 1
df_r1 = df.loc[is_top_ranked].copy()

# Embed each cleaned predicted answer on its own (a one-element batch per row).
df_r1['answer_vector'] = df_r1['answer_cleaned'].apply(
    lambda cleaned: vectorizer.encode([cleaned])
)
df_r1.head()

Unnamed: 0,question_id,question,rank,article_id,article,score,snippet,answer,start_idx,end_idx,longer_answer,answer_cleaned,answer_vector
0,0,"Who is the author of the memoir ""Nicotine""?",1,17552,"The best cigarette you will ever smoke, Gregor...",0.990425,"{'score': 0.9904252290725708, 'start': 40, 'en...",Gregor Hens,40,51,"cigarette you will ever smoke, Gregor Hens wri...",gregor hens,"[[-0.5458462, 0.28288853, 0.20445457, -0.33087..."
3,1,What skills are inmates learning in the innova...,1,17382,"LECCE, Italy One of his first students was a y...",0.312302,"{'score': 0.3123021423816681, 'start': 780, 'e...",professional skills,780,799,program to teach prisoners new professional sk...,professional skills,"[[-0.0424771, 0.30293548, -0.2807706, -0.27530..."
6,2,Who surrendered to the authorities for changin...,1,17547,A man who admitted changing the Hollywood sign...,0.999227,"{'score': 0.9992271661758423, 'start': 103, 'e...",Zachary Cole Fernandez,103,125,himself in to the authorities. Zachary Cole Fe...,zachary cole fernandez,"[[0.13296245, -0.3511932, -0.28270483, 0.43179..."
9,3,What is the name of the journalist and archivi...,1,17778,"the agency (the magician, who someone claimed ...",0.978693,"{'score': 0.9786927103996277, 'start': 1450, '...",Heather Fritz Horniak,1450,1471,"Crest documents more quickly, said Heather Fri...",heather fritz horniak,"[[-0.4223661, 0.105865315, 0.0115351975, -0.18..."
12,4,Which leader of the fringe movement embracing ...,1,17841,WASHINGTON A spate of violence erupted on Frid...,0.991884,"{'score': 0.9918835163116455, 'start': 833, 'e...",Richard B. Spencer,833,851,"not only on the police. Richard B. Spencer, a ...",richard b spencer,"[[0.024850488, -0.36256808, 0.07600315, -0.217..."


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz

pd.options.display.max_colwidth = 20

# Pair every rank-1 prediction with its labeled answer by question text.
merged_df = pd.merge(df_r1[['question', 'answer_cleaned', 'answer_vector']], df_labeled[['question', 'answer_cleaned', 'answer_vector']], on='question', suffixes=('_pred', '_target'))

cos_sim = []
fuzwuz_partial = []
max_score = []
for _, row in merged_df.iterrows():
    # Cosine similarity of the two answer embeddings, scaled by 100 so it can
    # be compared with the fuzzywuzzy score.
    row_cos_sim = cosine_similarity(row['answer_vector_target'], row['answer_vector_pred'])[0, 0] * 100
    # Partial string match between the cleaned target and predicted answers.
    row_partial = fuzz.partial_ratio(row['answer_cleaned_target'], row['answer_cleaned_pred'])

    cos_sim.append(row_cos_sim)
    fuzwuz_partial.append(row_partial)
    # Use the per-row values directly instead of indexing the lists by the
    # DataFrame index label, which is only correct for a default RangeIndex.
    max_score.append(max(row_cos_sim, row_partial))

merged_df['cosine_similarity'] = cos_sim
merged_df['fuzzywuzzy_partial'] = fuzwuz_partial
merged_df['max_score'] = max_score

# Display the updated DataFrame
merged_df.head()




Unnamed: 0,question,answer_cleaned_pred,answer_vector_pred,answer_cleaned_target,answer_vector_target,cosine_similarity,fuzzywuzzy_partial,max_score
0,Who is the autho...,gregor hens,"[[-0.5458462, 0....",gregor hens,"[[-0.5458462, 0....",100.0,100,100.0
1,What skills are ...,professional skills,"[[-0.0424771, 0....",inmates are lear...,"[[-0.06997983, 0...",22.998114,32,32.0
2,Who surrendered ...,zachary cole fer...,"[[0.13296245, -0...",zachary cole fer...,"[[0.064143814, -...",64.075583,100,100.0
3,What is the name...,heather fritz ho...,"[[-0.4223661, 0....",mike best is the...,"[[0.026869029, -...",35.780039,43,43.0
4,Which leader of ...,richard b spencer,"[[0.024850488, -...",richard b spence...,"[[0.06477255, -0...",72.393411,100,100.0


In [19]:
# Widen the column display so full questions and answers are readable.
pd.set_option('display.max_colwidth', 200)

# Show the evaluation columns with target and prediction side by side.
report_columns = [
    'question',
    'answer_cleaned_target',
    'answer_cleaned_pred',
    'cosine_similarity',
    'fuzzywuzzy_partial',
    'max_score',
]
merged_df[report_columns]

Unnamed: 0,question,answer_cleaned_target,answer_cleaned_pred,cosine_similarity,fuzzywuzzy_partial,max_score
0,"Who is the author of the memoir ""Nicotine""?",gregor hens,gregor hens,100.0,100,100.0
1,What skills are inmates learning in the innovative program at Lecce Penitentiary?,inmates are learning how to taste choose and serve local wines,professional skills,22.998114,32,32.0
2,"Who surrendered to the authorities for changing the Hollywood sign to ""Hollyweed""?",zachary cole fernandez surrendered to the authorities for changing the hollywood sign,zachary cole fernandez,64.075583,100,100.0
3,What is the name of the journalist and archivist who advocated for the online publication of the C.I.A. files?,mike best is the journalist and archivist who advocated for the online publication of the c i a files,heather fritz horniak,35.780039,43,43.0
4,Which leader of the fringe movement embracing white nationalism was punched in the face during the protests in Washington?,richard b spencer a leader of the fringe movement that embraces white nationalism and a range of racist and positions was punched in the face by a protester as mr spencer gave an interview on the ...,richard b spencer,72.393411,100,100.0
5,"Who narrated the documentary ""I Am Not Your Negro""?",the narration read by samuel l jackson is entirely drawn from baldwin s work,samuel l jackson,62.003821,100,100.0
6,Who is the billionaire restaurant owner nominated to head the Labor Department?,mr puzder the billionaire restaurant owner is nominated to head the labor department,andrew f puzder,61.940116,53,61.940116
7,Who is the federal judge that ordered President Trump's golf resort to pay $5.7 million for refusing to refund deposits to members?,the federal judge who ordered president trump s golf resort to pay 5 7 million for refusing to refund deposits to members is judge kenneth marra of federal district court in west palm beach,kenneth marra,42.437929,100,100.0
8,What business relationship between Donald Trump and Elon Musk surprised many people?,the strategic relationship between tesla leadership and the new administration surprised many people,trump would be relying on musk as an adviser,48.788184,39,48.788184
9,Who vowed to take executive action on a nearly daily basis to unravel his predecessor's legacy and begin enacting his own agenda?,president trump,president trump,99.999982,100,100.0


In [20]:
# Summary statistics for the numeric score columns of the evaluation table.
merged_df.describe()

Unnamed: 0,cosine_similarity,fuzzywuzzy_partial,max_score
count,11.0,11.0,11.0
mean,64.583379,78.818182,80.520755
std,26.714753,29.778577,27.875354
min,22.998114,32.0,32.0
25%,45.613056,48.0,55.36415
50%,62.003821,100.0,100.0
75%,86.196697,100.0,100.0
max,100.0,100.0,100.0
