# **Q3. Phrase Queries**

In [None]:
# importing the libraries
from nltk.corpus import stopwords
import collections
import itertools
import pickle
import pprint
import nltk
import os
import re

# Download the NLTK 'punkt' and 'stopwords' packages, then keep the English
# stop-word list as a set for quick membership tests during query processing.
for resource_name in ('punkt', 'stopwords'):
  nltk.download(resource_name)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Extract the archive of pre-processed data from Question 1 (shell command run
# through IPython's `!` escape; unzip extracts into the current working directory).
# NOTE(review): presumably this archive provides the dataset folder read by the
# cells below — confirm its top-level folder name.
!unzip '/content/processed_data_A1.zip'

# **(i) Bigram inverted index**

In [None]:
# Folder whose files are read, one document per file, to build both indexes.
DATA_DIRECTORY = "/content/CSE508_Winter2023_Dataset"

In [None]:
# function to read a file.
def file_read(file_path, encoding="utf-8"):
  """Return the entire text content of the file at ``file_path``.

  Args:
    file_path: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale settings.

  Returns:
    The file content as a single string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the content cannot be decoded with ``encoding``.
  """
  with open(file_path, "r", encoding=encoding) as file:
    return file.read()

In [None]:
# Sanity check: read one known document and display its content.
# The path is derived from DATA_DIRECTORY so the dataset location is defined once.
file_read(os.path.join(DATA_DIRECTORY, 'cranfield0001'))

'experimental investigation aerodynamics wing slipstream experimental study wing propeller slipstream made order determine spanwise distribution lift increase due slipstream different angles attack wing different free stream slipstream velocity ratios results intended part evaluation basis different theoretical treatments problem comparative span loading curves together supporting evidence showed substantial part lift increment produced slipstream due destalling boundarylayercontrol effect integrated remaining lift increment subtracting destalling lift found agree well potential flow theory empirical evaluation destalling effects made specific configuration experiment'

In [None]:
# making a dictionary that stores docID and its content.
# docIDs are 1-based and follow the sorted order of the file names.
indexed_file_content = dict()

# Keep regular files only: a sub-directory inside DATA_DIRECTORY (for example
# a checkpoint folder) would otherwise make file_read() fail.
files = [
  name for name in sorted(os.listdir(DATA_DIRECTORY))
  if os.path.isfile(os.path.join(DATA_DIRECTORY, name))
]

# enumerating over all the files in the given directory and store
# the key as docID (index) and the value as the content of the file
for index, file in enumerate(files, start=1):
  file_path = os.path.join(DATA_DIRECTORY, file)
  indexed_file_content[index] = file_read(file_path)

In [None]:
# splitting the content of the file, so to make tokens
# The isinstance guard makes this cell safe to re-run: documents that were
# already converted to token lists are left untouched instead of raising
# AttributeError on list.split().
for doc_id, content in indexed_file_content.items():
  if isinstance(content, str):
    indexed_file_content[doc_id] = content.split()

In [None]:
bigram_inverted_index = dict()

# create bigram inverted index: every pair of adjacent tokens maps to the
# set of docIDs in which that pair occurs.
for doc_id, tokens in indexed_file_content.items():
  # zip(tokens, tokens[1:]) yields each adjacent pair; nothing for < 2 tokens
  for bigram_tuple in zip(tokens, tokens[1:]):
    # setdefault creates the posting set the first time a bigram is seen,
    # so no try/except is needed and unrelated errors are never hidden
    bigram_inverted_index.setdefault(bigram_tuple, set()).add(doc_id)

# now sort the posting lists for each biword (set -> sorted list of docIDs)
for bigram_tuple in bigram_inverted_index:
  bigram_inverted_index[bigram_tuple] = sorted(bigram_inverted_index[bigram_tuple])

In [None]:
# Preview only the first few entries: the index holds tens of thousands of
# bigrams, and printing all of them floods the cell output.
PREVIEW_ENTRIES = 20
for key, value in itertools.islice(bigram_inverted_index.items(), PREVIEW_ENTRIES):
  print(key, ' : ', value)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('disturbances', 'flow')  :  [1313]
('nozzle', 'entry')  :  [1313]
('entry', 'caused')  :  [1313]
('caused', 'waves')  :  [1313]
('reflected', 'contact')  :  [1313]
('contact', 'surface')  :  [1313]
('surface', 'operating')  :  [1313]
('conditions', 'depart')  :  [1313]
('depart', 'first')  :  [1313]
('first', 'disturbance')  :  [1313]
('disturbance', 'reflected')  :  [1313]
('surface', 'weak')  :  [1313]
('weak', 'enough')  :  [1313]
('enough', 'tolerated')  :  [1313]
('tolerated', 'within')  :  [1313]
('small', 'range')  :  [1313]
('range', 'primaryshock')  :  [1313]
('primaryshock', 'mach')  :  [1313]
('number', 'eg')  :  [1313]
('eg', '5')  :  [1313]
('5', '7')  :  [1313]
('6', '3')  :  [1313]
('3', 'pressure')  :  [1313]
('pressure', 'entry')  :  [1313]
('entry', 'nozzle')  :  [1313]
('nozzle', 'remain')  :  [1313]
('constant', '10')  :  [1313]
('cent', 'within')  :  [1313]
('range', 'running')  :  [1313]
('times', '

In [None]:
# Report how many distinct bigrams the inverted index contains.
print(f'No of entries in Bigram Inverted Index: {len(bigram_inverted_index)}')

No of entries in Bigram Inverted Index: 85114


In [None]:
# Serialise the bigram inverted index to disk; the query cell at the end of
# the notebook loads it back from this path.
with open('/content/bigram_inverted_index.pkl', mode='wb') as bigram_pickle:
  pickle.dump(bigram_inverted_index, file=bigram_pickle)

# **Pre-Processing Input Query**

In [None]:
# utility method to process the input query, and returns a list of tokens
def preprocess_input(input_str):
  """Normalise a raw query string into a list of tokens.

  The text is lower-cased, every character that is neither a word character
  nor whitespace is removed, the result is split on whitespace, and tokens
  found in the module-level ``stop_words`` set are dropped.
  """
  cleaned_text = re.sub(r'[^\w\s]', '', input_str.lower())
  return [token for token in cleaned_text.split() if token not in stop_words]

In [None]:
# utility program to find correct documents for positional indexes
def find_correct_docs(query, index_file):
  """Return the docIDs in which the query tokens occur as one contiguous phrase.

  A document matches when some occurrence of the first token at position ``p``
  is followed by the second token at ``p + 1``, the third at ``p + 2``, and so
  on for every remaining token. Every token is checked; a later token landing
  in the right place cannot make up for an earlier one that does not.

  Args:
    query: List of query tokens in phrase order.
    index_file: Positional index of the form
      ``{token: {doc_id: [position, ...]}}``.

  Returns:
    A sorted list of unique matching docIDs. The list is empty when ``query``
    is empty or when any token is absent from the index. A one-token query
    matches every document that contains the token.
  """
  if not query:
    return []

  # A token that is missing from the index cannot be part of any match.
  postings_per_token = []
  for word in query:
    postings = index_file.get(word)
    if not postings:
      return []
    postings_per_token.append(postings)

  # Only documents containing every token can contain the phrase.
  candidate_docs = set.intersection(*(set(postings) for postings in postings_per_token))

  final_doc = []
  for doc_id in sorted(candidate_docs):
    first_positions = postings_per_token[0][doc_id]
    # Position sets of the 2nd, 3rd, ... token, for O(1) membership tests.
    following = [set(postings[doc_id]) for postings in postings_per_token[1:]]

    # The k-th following token must sit exactly k places after the start.
    phrase_found = any(
      all(start + offset in positions
          for offset, positions in enumerate(following, start=1))
      for start in first_positions
    )
    if phrase_found:
      final_doc.append(doc_id)

  return final_doc

In [None]:
# method to return document ids
def search_query(query, index, type):
  """Return the docIDs matching a phrase query for the given index type.

  Args:
    query: List of pre-processed query tokens in phrase order.
    index: For ``'biword'``, a mapping ``{(token, next_token): [doc_id, ...]}``;
      for ``'positional'``, the positional index passed on to
      ``find_correct_docs``.
    type: ``'biword'`` or ``'positional'``. The name shadows the builtin but
      is kept so existing callers keep working.

  Returns:
    For ``'positional'``, whatever ``find_correct_docs`` returns. For
    ``'biword'``, the set of docIDs containing every adjacent token pair of
    the query; an empty set when the query has fewer than two tokens or when
    any pair is absent from the index. Any other ``type`` yields an empty set.
  """
  if type == 'positional':
    return find_correct_docs(query, index)

  if type != 'biword':
    return set()

  postings = []
  for bigram in zip(query, query[1:]):
    # Direct dictionary lookup instead of scanning every key of the index.
    doc_ids = index.get(bigram)
    if not doc_ids:
      # A missing bigram means no document can satisfy the whole phrase;
      # skipping it would return documents that match only part of the query.
      return set()
    postings.append(set(doc_ids))

  if not postings:
    return set()
  return set.intersection(*postings)

# **(ii) Positional Indexing**

In [None]:
# method to get the positions of a word in the file, returns a list of indexes
def get_positions(word, content):
  """Return the 1-based positions at which ``word`` occurs in ``content``."""
  return [
    position
    for position, token in enumerate(content, start=1)
    if token == word
  ]

In [None]:
# creating the positional indexing
# Structure: {token: {doc_id: [1-based positions of the token in that document]}}
positional_index = dict()

# Single pass over each document: every token occurrence appends its position
# to the posting list of (token, doc_id). Iterating over the dictionary itself
# covers exactly the documents that were loaded, whatever their number.
for doc_id, tokens in indexed_file_content.items():
  for position, word in enumerate(tokens, start=1):
    positional_index.setdefault(word, {}).setdefault(doc_id, []).append(position)

# Show the first five entries as a preview.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(dict(list(positional_index.items())[0: 5]))

{   'aerodynamics': {   1: [3],
                        11: [9],
                        216: [58],
                        225: [11],
                        237: [31],
                        244: [45],
                        284: [46],
                        289: [3],
                        296: [12],
                        297: [46],
                        360: [69],
                        453: [17],
                        634: [61, 67],
                        685: [124],
                        689: [3],
                        753: [44, 73],
                        792: [13],
                        902: [13],
                        1206: [59],
                        1271: [4],
                        1331: [36],
                        1347: [21],
                        1380: [160]},
    'experimental': {   1: [1, 6],
                        11: [49],
                        12: [28],
                        17: [31],
                        19: [28],
                

In [None]:
# Report how many distinct tokens the positional index contains.
print(f'No of entries in Positional Index: {len(positional_index)}')

No of entries in Positional Index: 8967


In [None]:
# Serialise the positional index to disk; the query cell at the end of the
# notebook loads it back from this path.
with open('/content/positional_index.pkl', mode='wb') as positional_pickle:
  pickle.dump(positional_index, file=positional_pickle)

# **Adding the main function**

In [None]:
if __name__ == '__main__':
  # Interactive driver: reads N phrase queries and reports, for each one, the
  # documents retrieved through the bigram index and the positional index.
  BIGRAM_MODEL_PATH = '/content/bigram_inverted_index.pkl'
  POSITIONAL_MODEL_PATH = '/content/positional_index.pkl'

  # NOTE: pickle.load can execute arbitrary code. Only load the index files
  # written by this notebook, never a pickle from an untrusted source.
  with open(BIGRAM_MODEL_PATH, 'rb') as file: bigram_index = pickle.load(file)
  with open(POSITIONAL_MODEL_PATH, 'rb') as file: positional_index = pickle.load(file)

  n = int(input("Enter Number of Queries you want to execute: "))
  queries = [input("Enter Query {}: ".format(i+1)) for i in range(n)]

  for index, query in enumerate(queries):
    processed_words = preprocess_input(query)
    result_biword = search_query(processed_words, bigram_index, "biword")
    result_positional = search_query(processed_words, positional_index, "positional")

    # Same report for both index types; only the label and the result differ.
    for label, result in (("bigram inverted index", result_biword),
                          ("positional index", result_positional)):
      # Corpus files are named 'cranfield' followed by the docID zero-padded
      # to four digits (e.g. cranfield0001), so pad instead of prefixing "00".
      file_names = [f"cranfield{doc_id:04d}" for doc_id in sorted(set(result))]
      print(f"\nNumber of documents retrieved for query {index+1} using {label}: {len(file_names)}")
      print(f"Names of documents retrieved for query {index+1} using {label}: {file_names}")

Enter Number of Queries you want to execute: 3
Enter Query 1: jet Propulsion
Enter Query 2: slipstream Experimental Investigation
Enter Query 3: Transient heAt conducTion

Number of documents retrieved for query 1 using bigram inverted index: 6
Names of documents retrieved for query 1 using bigram inverted index: ['cranfield007 cranfield0040 cranfield00182 cranfield001151 cranfield001211 cranfield001212']

Number of documents retrieved for query 1 using positional index: 6
Names of documents retrieved for query 1 using positional index: ['cranfield007 cranfield0040 cranfield00182 cranfield001151 cranfield001211 cranfield001212']

Number of documents retrieved for query 2 using bigram inverted index: 1
Names of documents retrieved for query 2 using bigram inverted index: ['cranfield001']

Number of documents retrieved for query 2 using positional index: 0
Names of documents retrieved for query 2 using positional index: ['']

Number of documents retrieved for query 3 using bigram inverte

# **(iii) Compare and comment on your results using (i) and (ii)**

For query 2: **"slipstream Experimental Investigation"**\
The bigram index returns 1 file, but the positional index returns 0 files. This is because a bigram index can give ***FALSE POSITIVE*** results, which is not an issue with positional indexing. In the document **"cranfield001"**, the bigrams (slipstream, experimental) and (experimental, investigation) both exist, but they do not occur next to each other in the order the phrase requires, so the bigram index still returns this file. This is a limitation of bigram indexing, and it is why positional indexing is used for phrase queries.
