In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Packages and Data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
%matplotlib inline
import random 
random.seed(1)

# Installing latest version of gensim for latest BM-25 module
!pip install gensim
!pip install --upgrade gensim  #need to update package to 3.8.3 (or else it is outdated)
!pip install "gensim==3.8.1"

from gensim.summarization import bm25

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/c3/dd/5e00b6e788a9c522b48f9df10472b2017102ffa65b10bc657471e0713542/gensim-4.0.0-cp37-cp37m-manylinux1_x86_64.whl (23.9MB)
[K     |████████████████████████████████| 23.9MB 1.3MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0
Collecting gensim==3.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/44/93/c6011037f24e3106d13f3be55297bf84ece2bf15b278cc4776339dc52db5/gensim-3.8.1-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 9.9MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 4.0.0
    Uninstalling gensim-4.0.0:
      Successfully uninstalled gensim-4.0.0
Successfully installed gensim-3.8.1


In [3]:
df = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/W266 Final Project/data/data_for_tfidf.csv')
df = df.fillna("")

print(df.shape)
df.head(5)

(5980, 10)


Unnamed: 0,question_id,question_title_body,question_title_body_text,question_title_raw,question_body,question_body_text,answer_id,answer_body,answer_text,is_accepted_answer
0,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,55662134,There can be different approaches to the probl...,There can be different approaches to the probl...,0
1,55664544,so I have a list of dicts that looks like this...,so I have a list of dicts that looks like this...,Python - Inserting and Updating python dict si...,so I have a list of dicts that looks like this...,so I have a list of dicts that looks like this...,55664726,"Only using dict, list, and set:\n[ \n { \n ...","Only using dict, list, and set:\n",0
2,55644043,I am trying to understand how to access a zip ...,I am trying to understand how to access a zip ...,Accessing the value of a zip object using its ...,I am trying to understand how to access a zip ...,I am trying to understand how to access a zip ...,55665839,As the others say: zip returns a zip object wh...,As the others say: zip returns a zip object wh...,0
3,55668648,I need to find the starting index of the speci...,I need to find the starting index of the speci...,Find the specific sequence of words in list,I need to find the starting index of the speci...,I need to find the starting index of the speci...,55669044,You can do something like:\ndef find_sequence(...,You can do something like:\n\n\nOutput:\n\n\n1...,0
4,55707618,I have to read data from a csv file and I want...,I have to read data from a csv file and I want...,Working with csv files that have delimiters,I have to read data from a csv file and I want...,I have to read data from a csv file and I want...,55708016,You can use the delimiter parameter when readi...,You can use the delimiter parameter when readi...,1


In [4]:
print(df.shape)
df.columns

(5980, 10)


Index(['question_id', 'question_title_body', 'question_title_body_text',
       'question_title_raw', 'question_body', 'question_body_text',
       'answer_id', 'answer_body', 'answer_text', 'is_accepted_answer'],
      dtype='object')

## Formatting Data for Modeling (TEXT only - question_title & answer)
For each unique question, we want X number of answers to calculate BM-25 scores for.   
Highest BM-25 score means that the answer is most similar to the question.  

In [5]:
""" 
Data Structure

A dictionary where:
key = tuple (question_id, question_title_body_text)
value = tuple (list of answer_body, list of is_accepted_answer 1 or 0)

"""
start=time.time()

unique_question_ids = np.array(df['question_id'].unique())   #array of unique question_ids
data = dict()                                                #initialize dictionary for data structure
questions=len(unique_question_ids)                           #initialize how many questions to run script for

for q_id in unique_question_ids[0:questions]:

  answer_list = []  #list of answer_texts
  accepted_answer = [] #list of whether answer was accepted or not (1's and 0's)

  #Add answers for each question into answer_list and accepted_answer list
  #Use if condition to only append if it has not already been appended (avoids duplicate answer_text)
  for each in df[df.question_id == q_id].index:
    if df.answer_text[each] not in answer_list:
      answer_list.append(df.answer_text[each])
      accepted_answer.append(df.is_accepted_answer[each])

  #Fill in additional random answers per question (if there are less than X answers)
  X=10
  #if size of answer_list is less than X, then add random answer
  while len(answer_list) < X:
    add_answer = random.sample(range(0,df.shape[0]), X-len(answer_list))  #randomly pick answer from dataset
    for add in add_answer:
      answer_list.append(df.answer_text[add])
  #also remember to add 0 b/c these answers are not the accepted answer for that question_id
  while len(accepted_answer) < X:
    accepted_answer.append(0)

  #Add to dictionary
  question_content = df[df.question_id == q_id].question_title_body_text.tolist()[0]
  data[(q_id, question_content) ] = (answer_list,accepted_answer)

#Print 5 results to check
head=5
count1,count2=0,0
for each in data.keys():
  print(each)
  count1+=1
  if count1==head: break
print("")
for each in data.keys():
  print(data[each])
  count2+=1
  if count2==head: break
print("length of data:", len(data))

end=time.time()
print("Time this took: %0.2f seconds" % (end-start))

(55661929, 'How to display 5 numbers per line from a list?\n\nmy current code is displaying only one line that contains 5 numbers, and the expected should be 5 lines that contains 5 numbers per line')
(55664544, 'so I have a list of dicts that looks like this: \n\nAnd I want an output that looks like this: \n\nThat is, I want every  to be the key and have it append the  and  if it belongs to the same field. \nNote: I want to do this WITHOUT using a for loop (apart from the loop to iterate through the list). I want to use python dict functions like  and  etc.\nAny optimized solutions would be really helpful.')
(55644043, "I am trying to understand how to access a zip object and I'm trying to figure out how to access the value in the zipped object using the index by using the .index() just as we have it before in Python 2.x but it seemed that it does not work in Python3 \nHere is the code\n\nWhen I run the code, I am getting this error: AttributeError: 'zip' object has no attribute 'inde

In [6]:
data[(55661929, 'How to display 5 numbers per line from a list?\n\nmy current code is displaying only one line that contains 5 numbers, and the expected should be 5 lines that contains 5 numbers per line')
]

(['There can be different approaches to the problem:\n\nthis code will give the desired output..',
  '\nUse this function where you need to pass the list and entities to be printed in a single line.\nThe output is:\n',
  'You can make the function a generator by making it yield the sliced lists instead:\n\nThis outputs:\n',
  '',
  "\nShould i always use 'else:' even it is not necessary?\n\nI myself believe that they are not necessary. Returning at the beginning of the function in case of edge cases is something that allows you to skip sometimes lots of indentations caused by s:\n\nlooks better than\n\nright?\nBut it's not only about the looks:\n\nElses like that make you easily confuse the indentation levels. \nThe line width becomes smaller because the indentation takes those few character, you might need to format your code more to fit PEP8.\nSometimes you write the main part of the code first, only to discover the edge cases later. Version control systems such as git would mark all

In [7]:
df[df.question_id==55661929]

Unnamed: 0,question_id,question_title_body,question_title_body_text,question_title_raw,question_body,question_body_text,answer_id,answer_body,answer_text,is_accepted_answer
0,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,55662134,There can be different approaches to the probl...,There can be different approaches to the probl...,0
34,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,55662053,"x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",\nUse this function where you need to pass the...,0
105,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,55661981,You can make the function a generator by makin...,You can make the function a generator by makin...,1
361,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,64717925,"lst = [55, 26, 59, 35, 28, 22, 33, 43, 49, 45,...",,0


# BM-25 Baseline (TEXT only)


In [8]:
#source: https://stackoverflow.com/questions/40966014/how-to-use-gensim-bm25-ranking-in-python
#doc: https://radimrehurek.com/gensim_3.8.3/summarization/bm25.html
# bm25_obj.corpus_size      #number of documents
# bm25_obj.avgdl            #average length of documents    
# bm25_obj.doc_freqs        #dictionary with term frequencies for each document 
# bm25_obj.idf              #inversed documents frequence for whole corpus
# bm25_obj.doc_len          #list of document lengths  
# bm25_obj.get_score()      #(document,index) to get score
# bm25_obj.get_scores()     #get scoress of all documents, e.g. query
# bm25_obj.get_scores_bow() #get BM25 scores given document
# bm25.get_bm25_weights()   #get weights of documents in corpus

from gensim import corpora
from gensim.summarization import bm25

start=time.time()

head=5  #value for how many detailed results to print
head_counter=0  #counter for how many detailed results to print

correct_counter=0
incorrect_counter=0
total_questions=0
mrr_list = []

for key in data.keys():
  
  docs = data[key][0]                        #docs is a list of all documents (answers) for BM-25
  texts = [str(doc).split() for doc in docs] #need to split document into tokens
  query = key[1]                             #query is the question that will be compared against all answers  

  dictionary = corpora.Dictionary(texts)
  corpus = [dictionary.doc2bow(text) for text in texts] #converts texts into tuples of (token number, frequency)
  query_doc = dictionary.doc2bow(query.split())         #converts query into tuples of (token number, frequency)

  bm25_obj = bm25.BM25(corpus)
  scores = bm25_obj.get_scores(query_doc)
  best_docs = sorted(range(len(scores)), key=lambda i: -scores[i])

  #Determine location of accepted answer:
  correct_position = 0
  for num in data[key][1]:
    if num == 0:
      correct_position += 1
    else:
      break

  #Keep Track of correct and incorrect predictions
  if correct_position == best_docs[0]:
    correct_counter +=1
  else:
    incorrect_counter += 1
  total_questions+=1

  #Keep track of Rank of answer for calculating MRR
  rank=1
  for r in best_docs:
    if r != correct_position:
      rank += 1
    else:
      break  
  mrr_list.append(1/rank)

  if head_counter <= head: 
    print("Question_ID:", key[0])
    print("scores:",scores)
    print("best_answer ranked (left = best):", best_docs)
    print("correct document number:", correct_position)
    print("######################################################")
  head_counter+=1


#Printing Accuracy
print("Number correct:", correct_counter)
print("Number incorrect", incorrect_counter)
print("Total:", total_questions)
print("Precision: %0.6f %%" %(correct_counter/total_questions*100))

#Calculating/Printing MRR
mrr = sum(mrr_list)/len(mrr_list)
print(mrr_list)
print("MRR:", mrr)


end=time.time()
print("Time this took: %0.6f seconds" % (end-start))

Question_ID: 55661929
scores: [4.020713617042495, 5.597707998944184, 0.5234200730372428, 0, 1.2200394873480076, 3.6640646398034566, 0, 2.847778017203085, 1.7129979160608786, 2.5322256280522586]
best_answer ranked (left = best): [1, 0, 5, 7, 9, 8, 4, 2, 3, 6]
correct document number: 2
######################################################
Question_ID: 55664544
scores: [1.105872626006716, 5.863925403220679, 0, 5.69037335946634, 2.572168520863112, 3.1483854888224894, 0, 1.4031754622126809, 0.7034023315751566, 2.632390988833372]
best_answer ranked (left = best): [1, 3, 5, 9, 4, 7, 0, 8, 2, 6]
correct document number: 1
######################################################
Question_ID: 55644043
scores: [3.227749018098442, 3.8080985908220084, 2.759623308169225, 1.1454127949837754, 0.45419687806452474, 4.492685300038373, 2.2696820854488875, 2.759623308169225, 0, 3.1884951931420966]
best_answer ranked (left = best): [5, 1, 0, 9, 2, 7, 6, 3, 4, 8]
correct document number: 2
##################

# Formatting Data for Modeling (TEXT & CODE - question_title & answer))
For each unique question, we want X number of answers to calculate BM-25 scores for.   
Highest BM-25 score means that the answer is most similar to the question.  

In [9]:
df.columns

Index(['question_id', 'question_title_body', 'question_title_body_text',
       'question_title_raw', 'question_body', 'question_body_text',
       'answer_id', 'answer_body', 'answer_text', 'is_accepted_answer'],
      dtype='object')

In [10]:
""" 
Data Structure

A dictionary where:
key = tuple (question_id, question_title_body_text)
value = tuple (list of answer_body, list of is_accepted_answer 1 or 0)

"""
start=time.time()

unique_question_ids = np.array(df['question_id'].unique())   #array of unique question_ids
data = dict()                                                #initialize dictionary for data structure
questions=len(unique_question_ids)                           #initialize how many questions to run script for

for q_id in unique_question_ids[0:questions]:

  answer_list = []  #list of answer_texts
  accepted_answer = [] #list of whether answer was accepted or not (1's and 0's)

  #Add answers for each question into answer_list and accepted_answer list
  #Use if condition to only append if it has not already been appended (avoids duplicate answer_text)
  for each in df[df.question_id == q_id].index:
    if df.answer_body[each] not in answer_list:
      answer_list.append(df.answer_body[each])
      accepted_answer.append(df.is_accepted_answer[each])

  #Fill in additional random answers per question (if there are less than X answers)
  X=10
  #if size of answer_list is less than X, then add random answer
  while len(answer_list) < X:
    add_answer = random.sample(range(0,df.shape[0]), X-len(answer_list))  #randomly pick answer from dataset
    for add in add_answer:
      answer_list.append(df.answer_body[add])
  #also remember to add 0 b/c these answers are not the accepted answer for that question_id
  while len(accepted_answer) < X:
    accepted_answer.append(0)

  #Add to dictionary
  question_content = df[df.question_id == q_id].question_title_body.tolist()[0]
  data[(q_id, question_content) ] = (answer_list,accepted_answer)

#Print 5 results to check
bhead=5
count1,count2=0,0
for each in data.keys():
  print(each)
  count1+=1
  if count1==head: break
print("")
for each in data.keys():
  print(data[each])
  count2+=1
  if count2==head: break
print("length of data:", len(data))

end=time.time()
print("Time this took: %0.2f seconds" % (end-start))

(55661929, 'How to display 5 numbers per line from a list?\nlx = [1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25] \n\ndef display(lx):\n\n    for i in range(0,len(lx), 5):\n        x = lx[i:i +5]\n    return x\n\nprint(display(lx))\n\nmy current code is displaying only one line that contains 5 numbers, and the expected should be 5 lines that contains 5 numbers per line')
(55664544, "so I have a list of dicts that looks like this: \n[{\n  'field': {\n    'data': 'F1'\n  },\n  'value': F1Value1,\n  'date': datetime.datetime(2019, 3, 1, 0, 0)\n}, {\n  'field': {\n    'data': 'F2'\n  },\n  'value': F2Value1,\n  'date': datetime.datetime(2019, 2, 5, 0, 0)\n}, {\n  'field': {\n    'data': 'F2'\n  },\n  'value': F2Value2,\n  'date': datetime.datetime(2019, 2, 7, 0, 0)\n}]\n\nAnd I want an output that looks like this: \n[\n  {\n    'F1': [\n      {\n        'value': F1Value1,\n        'date': datetime.datetime(2019, 3, 1, 0, 0)\n      }\n    ]\n  },\n  {\n    'F2': [\n      {\

In [11]:
data[(55661929, 'How to display 5 numbers per line from a list?\nlx = [1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25] \n\ndef display(lx):\n\n    for i in range(0,len(lx), 5):\n        x = lx[i:i +5]\n    return x\n\nprint(display(lx))\n\nmy current code is displaying only one line that contains 5 numbers, and the expected should be 5 lines that contains 5 numbers per line')
]

(["There can be different approaches to the problem:\nx=[]  #your given list\nfor t in range(len(x)):\n    if(t%5==0):\n        print('\\n')\n    print(str(x[t])+'\\t',end='')  #end will print numbers on same line otherwise\n\nthis code will give the desired output..",
  "x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]\n\ndef printer(lis, num_per_line):\n    z = 1\n    for each_num in lis:\n        if z%(num_per_line) == 0:\n            print(each_num, end='\\n')\n        else:\n            print(each_num, end='\\t')\n        z+=1    \n\nprinter(x, 5)\n\nUse this function where you need to pass the list and entities to be printed in a single line.\nThe output is:\n1 2 3 4 5\n6 7 8 9 10\n11 12 13 14 15\n16 17 18 19 20\n",
  "You can make the function a generator by making it yield the sliced lists instead:\ndef display(lx):\n    for i in range(0, len(lx), 5):\n        yield lx[i:i + 5]\n\nprint(*display(lx), sep='\\n')\n\nThis outputs:\n[1, 2, 4, 5, 6]\n[7, 8

In [13]:
df[df.question_id==55661929]

Unnamed: 0,question_id,question_title_body,question_title_body_text,question_title_raw,question_body,question_body_text,answer_id,answer_body,answer_text,is_accepted_answer
0,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,55662134,There can be different approaches to the probl...,There can be different approaches to the probl...,0
34,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,55662053,"x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",\nUse this function where you need to pass the...,0
105,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,55661981,You can make the function a generator by makin...,You can make the function a generator by makin...,1
361,55661929,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?,How to display 5 numbers per line from a list?...,How to display 5 numbers per line from a list?...,64717925,"lst = [55, 26, 59, 35, 28, 22, 33, 43, 49, 45,...",,0


# BM-25 Baseline (TEXT & CODE)


In [14]:
for k in data.keys():
  asdf = k[1]
  break
type(asdf)

str

In [15]:
#source: https://stackoverflow.com/questions/40966014/how-to-use-gensim-bm25-ranking-in-python
#doc: https://radimrehurek.com/gensim_3.8.3/summarization/bm25.html
# bm25_obj.corpus_size      #number of documents
# bm25_obj.avgdl            #average length of documents    
# bm25_obj.doc_freqs        #dictionary with term frequencies for each document 
# bm25_obj.idf              #inversed documents frequence for whole corpus
# bm25_obj.doc_len          #list of document lengths  
# bm25_obj.get_score()      #(document,index) to get score
# bm25_obj.get_scores()     #get scoress of all documents, e.g. query
# bm25_obj.get_scores_bow() #get BM25 scores given document
# bm25.get_bm25_weights()   #get weights of documents in corpus

from gensim import corpora
from gensim.summarization import bm25

start=time.time()

head=5  #value for how many detailed results to print
head_counter=0  #counter for how many detailed results to print

correct_counter=0
incorrect_counter=0
total_questions=0
mrr_list = []

for key in data.keys():
  
  docs = data[key][0]                        #docs is a list of all documents (answers) for BM-25
  texts = [str(doc).split() for doc in docs] #need to split document into tokens
  query = key[1]                             #query is the question that will be compared against all answers  

  dictionary = corpora.Dictionary(texts)
  corpus = [dictionary.doc2bow(text) for text in texts] #converts texts into tuples of (token number, frequency)
  query_doc = dictionary.doc2bow(query.split())         #converts query into tuples of (token number, frequency)

  bm25_obj = bm25.BM25(corpus)
  scores = bm25_obj.get_scores(query_doc)
  best_docs = sorted(range(len(scores)), key=lambda i: -scores[i])

  #Determine location of accepted answer:
  correct_position = 0
  for num in data[key][1]:
    if num == 0:
      correct_position += 1
    else:
      break

  #Keep Track of correct and incorrect predictions
  if correct_position == best_docs[0]:
    correct_counter +=1
  else:
    incorrect_counter += 1
  total_questions+=1

  #Keep track of Rank of answer for calculating MRR
  rank=1
  for r in best_docs:
    if r != correct_position:
      rank += 1
    else:
      break  
  mrr_list.append(1/rank)

  if head_counter <= head: 
    print("Question_ID:", key[0])
    print("scores:",scores)
    print("best_answer ranked (left = best):", best_docs)
    print("correct document number:", correct_position)
    print("######################################################")
  head_counter+=1


#Printing Accuracy
print("Number correct:", correct_counter)
print("Number incorrect", incorrect_counter)
print("Total:", total_questions)
print("Precision: %0.6f %%" %(correct_counter/total_questions*100))

#Calculating/Printing MRR
mrr = sum(mrr_list)/len(mrr_list)
print(mrr_list)
print("MRR:", mrr) 


end=time.time()
print("Time this took: %0.6f seconds" % (end-start))

Question_ID: 55661929
scores: [4.77857899419487, 4.010495700378776, 7.414680099912423, 3.4628496083543934, 2.945616454432625, 0.7784810616677527, 2.319468779275967, 1.701021577331855, 5.57187289149468, 3.0435412360751872]
best_answer ranked (left = best): [2, 8, 0, 1, 3, 9, 4, 6, 7, 5]
correct document number: 2
######################################################
Question_ID: 55664544
scores: [1.9930763620207603, 15.077162576111231, 2.6282855230935454, 1.2086007925950786, 0.9464945966106039, 1.2059521039823322, 3.7675670453431276, 3.6389549595112385, 2.989715730854394, 5.185128757843934]
best_answer ranked (left = best): [1, 9, 6, 7, 8, 2, 0, 3, 5, 4]
correct document number: 1
######################################################
Question_ID: 55644043
scores: [18.08780517825604, 9.502846875696594, 4.590491180063563, 4.119441541026317, 6.599178239491165, 5.2899486464649685, 5.099475937790451, 12.056533585471826, 1.202959177817375, 6.953260176239509]
best_answer ranked (left = best)

In [None]:
# """ 
# Data Structure

# A dictionary where:
# key = tuple (question_id, question_body)
# value = tuple (list of answer_body, list of is_accepted_answer 1 or 0)

# """
# start=time.time()

# unique_question_ids = np.array(df['question_id'].unique())   #array of unique question_ids
# data = dict()                                                #initialize dictionary for data structure
# questions=10                                                #initialize how many questions to run script for

# for question_id in unique_question_ids[0:questions]:

#   answer_list = set()  #set of answer_texts (using set will reject duplicate answers)
#   accepted_answer = [] #list of whether answer was accepted or not

#   #Add answers for each question into answer_list set() and accepted_answer list()
#   for each in df[df.question_id == question_id].index:
#     answer_list.add(df.answer_text[each])
    
#     #Using following if statement to compare length would confirm answer_list and accepted_answer will be same size
#     #(avoids adding to accepted_answers in case a duplicate answer was not added to answer_list)
#     if len(answer_list) == len(accepted_answer)+1:
#       if df.is_accepted_answer[each] == True:
#         accepted_answer.append(1)
#       else:
#         accepted_answer.append(0)

#   #Fill in additional random answers per question (if there are less than X answers)
#   X=10
#   #if size of answer_list is less than X, then add random answer
#   while len(answer_list) < X:
#     add_answer = random.sample(range(0,df.shape[0]), X-len(answer_list))  #randomly pick answer from dataset
#     for add in add_answer:
#       answer_list.add(df.answer_text[add])
#   #also remember to add 0 b/c these answers are not the accepted answer for that question_id
#   while len(accepted_answer) < X:
#     accepted_answer.append(0)

#   #Add to dictionary
#   data[(question_id, df[df.question_id == question_id].question_content_text.tolist()[0])] = (list(answer_list),accepted_answer)

# #Print 5 results to check
# head=5
# count1,count2=0,0
# for each in data.keys():
#   print(each)
#   count1+=1
#   if count1==head: break
# print("")
# for each in data.keys():
#   print(data[each])
#   count2+=1
#   if count2==head: break
# print("length of data:", len(data))

# end=time.time()
# print("Time this took: %0.2f seconds" % (end-start))

In [None]:
# #source: https://stackoverflow.com/questions/40966014/how-to-use-gensim-bm25-ranking-in-python
# #doc: https://radimrehurek.com/gensim_3.8.3/summarization/bm25.html
# # bm25_obj.corpus_size      #number of documents
# # bm25_obj.avgdl            #average length of documents    
# # bm25_obj.doc_freqs        #dictionary with term frequencies for each document 
# # bm25_obj.idf              #inversed documents frequence for whole corpus
# # bm25_obj.doc_len          #list of document lengths  
# # bm25_obj.get_score()      #(document,index) to get score
# # bm25_obj.get_scores()     #get scoress of all documents, e.g. query
# # bm25_obj.get_scores_bow() #get BM25 scores given document
# # bm25.get_bm25_weights()   #get weights of documents in corpus

# from gensim import corpora
# from gensim.summarization import bm25

# start=time.time()

# head=5  #value for how many detailed results to print
# head_counter=0  #counter for how many detailed results to print

# correct_counter=0
# incorrect_counter=0
# total_questions=0
# mrr_list = []

# for key in data.keys():
  
#   docs = data[key][0]                        #docs is a list of all documents (answers) for BM-25
#   texts = [str(doc).split() for doc in docs] #need to split document into tokens
#   query = key[1]                             #query is the question that will be compared against all answers  

#   dictionary = corpora.Dictionary(texts)
#   corpus = [dictionary.doc2bow(text) for text in texts] #converts texts into tuples of (token number, frequency)
#   query_doc = dictionary.doc2bow(query.split())         #converts query into tuples of (token number, frequency)

#   bm25_obj = bm25.BM25(corpus)
#   scores = bm25_obj.get_scores(query_doc)
#   best_docs = sorted(range(len(scores)), key=lambda i: -scores[i])

#   #Determine location of accepted answer:
#   correct_position = 0
#   for num in data[key][1]:
#     if num == 0:
#       correct_position += 1
#     else:
#       break

#   #Keep Track of correct and incorrect predictions
#   if correct_position == best_docs[0]:
#     correct_counter +=1
#   else:
#     incorrect_counter += 1
#   total_questions+=1

#   #Keep track of Rank of answer for calculating MRR
#   rank=1
#   for r in best_docs:
#     if r != correct_position:
#       rank += 1
#     else:
#       break  
#   mrr_list.append(1/rank)

#   if head_counter <= head: 
#     print("Question_ID:", key[0])
#     print("scores:",scores)
#     print("best_answer ranked (left = best):", best_docs)
#     print("correct document number:", correct_position)
#     print("######################################################")
#   head_counter+=1


# #Printing Accuracy
# print("Number correct:", correct_counter)
# print("Number incorrect", incorrect_counter)
# print("Total:", total_questions)
# print("Accuracy: %0.2f %%" %(correct_counter/total_questions*100))

# #Calculating/Printing MRR
# mrr = sum(mrr_list)/len(mrr_list)
# print(mrr_list)
# print("MRR:", mrr)


# end=time.time()
# print("Time this took: %0.2f seconds" % (end-start))

In [None]:
corpus