In [10]:
"""
Reading the first 200 entries of the patent database and storing
their descriptions in a list
"""
import pickle

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that you created yourself or otherwise trust.
# The context manager makes sure the file handle is closed after loading.
with open("/content/drive/MyDrive/bigPatentData/data_pik", 'rb') as data_file:
  data = pickle.load(data_file)

# Keep only the first 200 descriptions as a plain Python list.
# (presumably `data` is a pandas DataFrame or similar, given .tolist() -- confirm)
des = data["description"][0:200].tolist()

In [None]:
"""
Applying preprocessing to the data
"""

#%cd "drive/My Drive/Colab Notebooks"
!pip install import-ipynb
import import_ipynb

from IR_Assignment1_preprocessing import preprocess

# Run every description through the imported preprocessing routine,
# keeping the results in the same order as `des`.
preprocessed = [preprocess(description) for description in des]

# Quick visual check of the first processed description.
print(preprocessed[0])

In [None]:
"""
Basic inverted index construction
format: term: doc1, doc2,..
where doc1 is the position of the doc in the dataframe
"""
import collections

# term -> list of document positions (indices into `preprocessed`),
# each document listed at most once and in ascending order.
index = {}

numfiles = len(preprocessed)
print(numfiles)
for doc_id, tokens in enumerate(preprocessed):
  for term in tokens:
    postings = index.setdefault(term, [])
    # Documents are visited in increasing order, so checking the last
    # entry is enough to avoid adding the same document twice.
    if not postings or postings[-1] != doc_id:
      postings.append(doc_id)

# Re-create the mapping with its terms in sorted order.
index = collections.OrderedDict(sorted(index.items()))

In [None]:
"""
Dumping the index in a file
"""
%cd "drive/My Drive"
with open('index.txt', 'wb') as fp:
    pickle.dump(index, fp)

In [None]:
"""
Positional index construction
"""
import collections

# term -> list of (doc_id, [positions]) tuples, one tuple per document that
# contains the term, appended in ascending doc_id order.
index2 = {}

numfiles = len(preprocessed)
print(numfiles)
for i in range(numfiles):
  # Token offsets of every term inside document i.
  occurences = {}
  for t, term in enumerate(preprocessed[i]):
    occurences.setdefault(term, []).append(t)

  # Add one posting per term of this document. The first posting of a term
  # must create a *list* holding the (doc_id, positions) tuple so that
  # later documents can be appended to it.
  for term, positions in occurences.items():
    index2.setdefault(term, []).append((i, positions))

# Re-create the mapping with its terms in sorted order.
index2 = collections.OrderedDict(sorted(index2.items()))

In [None]:
"""
Dumping the index in a file
"""
# Write to the same absolute location the index is later loaded from, so the
# cell does not depend on whatever working directory an earlier %cd left behind.
# The file is a binary pickle despite its .txt name.
with open('/content/drive/MyDrive/index2.txt', 'wb') as fp:
    pickle.dump(index2, fp)

In [None]:
# Load the pickled inverted index (term -> list of doc ids).
# NOTE(security): only unpickle files you created yourself or trust.
with open("/content/drive/MyDrive/index.txt", 'rb') as index_file:
  ind1 = pickle.load(index_file)
# Spot check: postings of one term (raises KeyError if it is not in the index).
print(ind1["contain"])

In [None]:
# Load the pickled positional index (term -> list of (doc_id, positions)).
# NOTE(security): only unpickle files you created yourself or trust.
with open("/content/drive/MyDrive/index2.txt", 'rb') as index_file:
  ind2 = pickle.load(index_file)
# Spot check: postings of one term (raises KeyError if it is not in the index).
print(ind2["contain"])

In [21]:
import numpy as np

def get_docs(dict_, term):
  """
  Look up the postings stored for `term` in the index `dict_`.

  Returns the stored postings wrapped in a numpy array, or an empty
  numpy array when the term is not a key of the index.
  """
  postings = dict_[term] if term in dict_ else []
  return np.array(postings)

In [None]:
"""
AND query
"""
import time
%cd "drive/My Drive/Colab Notebooks"
from IR_Assignment1_preprocessing import preprocess

def and_query(query):
  """
  Boolean AND retrieval over the inverted index `ind1`.

  The raw query string is passed through `preprocess`; the postings of the
  resulting terms are then intersected left to right with a two-pointer
  merge. The merge relies on each postings list being in ascending doc-id
  order.

  A query that reduces to a single term returns that term's whole postings
  list; a query with no terms left after preprocessing returns no documents.

  Prints the matching doc ids, how many there are, and the elapsed time.

  Returns:
    (number_of_matching_docs, elapsed_seconds)
  """
  start = time.time()
  query = preprocess(query)

  n = len(query)
  if n == 0:
    # Nothing to look up -- avoid indexing into an empty query.
    ans = []
  else:
    docs1 = get_docs(ind1, query[0])
    # With only one term there is nothing to intersect: every document in
    # its postings list is a match.
    ans = list(docs1)
    for k in range(1, n):
      docs2 = get_docs(ind1, query[k])
      i1 = 0
      i2 = 0
      ans = []
      # Advance whichever pointer refers to the smaller doc id; equal ids
      # are a match for all terms processed so far.
      while i1 < len(docs1) and i2 < len(docs2):
        if docs1[i1] == docs2[i2]:
          ans.append(docs1[i1])
          i1 += 1
          i2 += 1
        elif docs1[i1] < docs2[i2]:
          i1 += 1
        else:
          i2 += 1
      # The running intersection becomes the left operand for the next term.
      docs1 = ans
  end = time.time()
  print("Documents retrieved:",ans)
  print("Number of docs:", len(ans))
  print("Time taken:", end-start)
  return len(ans),end-start


In [23]:
"""
Cosine similarity Ranking
"""
import math
import operator

def cosine_similarity(query, n_docs=10000, top_k=25):
  """
  Rank documents against `query` with a tf-idf cosine score computed from
  the positional index `ind2` (term -> list of (doc_id, positions)).

  Query term weight:    tf_in_query * log10(n_docs / document_frequency)
  Document term weight: 1 + ln(tf_in_document)

  The document norm is accumulated only over the query terms that occur in
  the document (the index offers no per-document norm), so the score is an
  approximation of the true cosine.

  Args:
    query:  raw query string; it is passed through `preprocess`.
    n_docs: collection size used in the idf formula.
    top_k:  how many of the best-scoring documents to print.

  Prints the top_k (doc_id, score) pairs, how many were printed, and the
  elapsed time.

  Returns:
    elapsed wall-clock seconds.
  """
  start = time.time()
  query = preprocess(query)

  # Raw frequency of every distinct query term.
  query_freq = {}
  for term in query:
    query_freq[term] = query_freq.get(term, 0) + 1

  scores = {}          # doc_id -> accumulated dot product
  length_vector = {}   # doc_id -> sum of squared document weights
  length_query = 0.0   # sum of squared query weights (sqrt taken below)

  for query_term, term_tf in query_freq.items():
    term_postings = ind2[query_term] if query_term in ind2 else []
    if not term_postings:
      # Term unknown to the index: its weight is 0, nothing to accumulate.
      continue

    idf = math.log10(float(n_docs)/float(len(term_postings)))
    weight_query = term_tf * idf
    length_query += math.pow(weight_query, 2)

    for (doc_id, positions) in term_postings:
      tf = len(positions)
      log_tf = 1+math.log(tf) if tf > 0 else 0.0
      length_vector[doc_id] = length_vector.get(doc_id,0.0) + (log_tf*log_tf)
      # Use the same log-scaled weight in the dot product as in the norm;
      # mixing raw tf here with a log-tf norm would not yield a cosine.
      scores[doc_id] = scores.get(doc_id, 0.0) + log_tf * weight_query

  length_query = math.sqrt(length_query)

  # Normalise *every* scored document, not just those matching the last
  # query term, and guard against a zero norm.
  for doc_id in scores:
    denominator = math.sqrt(length_vector[doc_id])*length_query
    scores[doc_id] = scores[doc_id]/denominator if denominator > 0 else 0.0

  ordered_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
  end = time.time()
  top_scores = ordered_scores[:top_k]
  print("Documents retrieved:",top_scores)
  print("Number of docs:", len(top_scores))
  print("Time taken:", end-start)
  return end-start


In [None]:
# Benchmark queries, each run through both retrieval functions below.
queries = [
  "fan cold cooling rotate metal steel cover heating",
  "dentist typically supplies the technician with a full face photograph",
  "may be straight - chained or branched . an optionally substituted alkyl",
  "interchangeable retail display in accordance with the invention",
  "numerous specific details are set forth in order to provide a thorough",
  "effects of garlic extracts containing allicin for prostate tumor treatment",
  "cooking device 10 will now be described with respect to the figures",
  "figures are not drawn to scale and they are provided merely ",
  "accordance with the present invention will be described",
  "drawings wherein like numerals refer to like matter throughout",
]

# Per-query measurements, aligned by position with `queries`:
# AND hit counts, AND timings and cosine-ranking timings.
ndocs, andtimes, costimes = [], [], []
for query_text in queries:
  print(query_text)
  print("And:")
  hit_count, and_elapsed = and_query(query_text)
  ndocs.append(hit_count)
  andtimes.append(and_elapsed)
  print("Cosine Similarity")
  costimes.append(cosine_similarity(query_text))

In [None]:
# Show the collected measurements, one list per line:
# AND hit counts, AND timings, cosine-ranking timings.
for measurements in (ndocs, andtimes, costimes):
  print(measurements)

In [None]:
# Re-shape the loaded positional index for direct lookup:
# term -> {doc_id: positions}, built from each posting's first and second item.
index2 = {
  term: {posting[0]: posting[1] for posting in ind2[term]}
  for term in ind2
}

In [None]:
"""
Positional bigram query
"""
import time
%cd "drive/My Drive/Colab Notebooks"
!pip install import-ipynb
import import_ipynb

from IR_Assignment1_preprocessing import preprocess

def and_query(query):
  """
  Positional bigram retrieval.

  NOTE: this definition reuses the name `and_query` and therefore replaces
  the plain boolean AND version defined earlier in the notebook.

  The raw query string is passed through `preprocess`. Walking the terms
  left to right, a document is kept only if, for each consecutive pair
  (term1, term2), term2 occurs at the position right after some occurrence
  of term1. Candidate documents come from the two-pointer merge of the
  postings in `ind1` (which must be in ascending doc-id order); positions
  come from `index2` (term -> {doc_id: positions}).

  Each pair is checked independently, so this does not verify that the
  whole query appears as one contiguous phrase.

  A query that reduces to a single term returns that term's whole postings
  list; a query with no terms left after preprocessing returns no documents.

  Prints the matching doc ids, how many there are, and the elapsed time.

  Returns:
    (number_of_matching_docs, elapsed_seconds)
  """
  start = time.time()
  query = preprocess(query)

  n = len(query)
  if n == 0:
    # Nothing to look up -- avoid indexing into an empty query.
    ans = []
  else:
    term1 = query[0]
    docs1 = get_docs(ind1, term1)
    # With only one term there is no bigram to check.
    ans = list(docs1)
    for k in range(1, n):
      term2 = query[k]
      docs2 = get_docs(ind1, term2)
      i1 = 0
      i2 = 0
      ans = []
      while i1 < len(docs1) and i2 < len(docs2):
        if docs1[i1] == docs2[i2]:
          doc_id = docs1[i1]
          pos2 = set(index2[term2][doc_id])
          # Keep the document if term2 directly follows term1 somewhere.
          if any(p + 1 in pos2 for p in index2[term1][doc_id]):
            ans.append(doc_id)
          # Advance both pointers exactly once per shared document,
          # whether or not the adjacency test succeeded.
          i1 += 1
          i2 += 1
        elif docs1[i1] < docs2[i2]:
          i1 += 1
        else:
          i2 += 1
      # Slide the bigram window: the right term becomes the left one and
      # the surviving documents are the candidates for the next pair.
      term1 = term2
      docs1 = ans
  end = time.time()
  print("Documents retrieved:",ans)
  print("Number of docs:", len(ans))
  print("Time taken:", end-start)
  return len(ans),end-start

# Benchmark queries, each run through the and_query defined in this cell.
queries = [
  "fan cold cooling rotate metal steel cover heating",
  "dentist typically supplies the technician with a full face photograph",
  "may be straight - chained or branched . an optionally substituted alkyl",
  "interchangeable retail display in accordance with the invention",
  "numerous specific details are set forth in order to provide a thorough",
  "effects of garlic extracts containing allicin for prostate tumor treatment",
  "cooking device 10 will now be described with respect to the figures",
  "figures are not drawn to scale and they are provided merely ",
  "accordance with the present invention will be described",
  "drawings wherein like numerals refer to like matter throughout",
]

# Per-query hit counts and timings, aligned by position with `queries`.
ndocs, times = [], []
for query_text in queries:
  print(query_text)
  print("And:")
  hit_count, elapsed = and_query(query_text)
  ndocs.append(hit_count)
  times.append(elapsed)