In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
from dataclasses import dataclass
from typing import List,DefaultDict,Dict
from collections import defaultdict
import pathlib
import nltk
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
@dataclass 
class TermDetails:
  doc_id:str
  positions:List[str]
  term_frequency:int=0


In [4]:
#### CREATING DATA STRUCTURE #####
positional_index:DefaultDict[str,Dict[str,TermDetails]]=defaultdict(dict)

In [5]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.66-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 7.6 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 65.9 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.0 contractions-0.1.66 pyahocorasick-1.4.4 textsearch-0.0.21


In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [9]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [10]:
##### PRE PROCESSING OF THE TEXT #####

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def remove_underscores(words):
  new_words = []
  for word in words:
      new_word = re.sub(r'_{2,}', '', word)
      if new_word != '':
          new_words.append(new_word)
  return new_words


def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:

        if word.isdigit():
          try:
            new_word = p.number_to_words(word)
            new_words.append(new_word)
          except:
            pass
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    words = remove_underscores(words)
    return words
  
def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(words)
    return stems, lemmas

In [11]:
##### CREATING DATA STRUCTURE #####
current_directory=pathlib.Path(os.path.abspath(""))
path=current_directory / pathlib.Path("/content/drive/MyDrive/IR_A1_Dataset")
for filename in path.glob("**/*"):
  if not filename.is_file():
    continue
  with filename.open('rb') as f:
    doc_id=filename.name
    raw_text = f.read()
    raw_text = denoise_text(raw_text)
    raw_text = replace_contractions(raw_text)
    words = nltk.word_tokenize(raw_text)
    words = normalize(words)
    stems, lemmas = stem_and_lemmatize(words)
    cleaned_text = " ".join(lemmas)
    for lemma in lemmas:
      positions = []
      for match in re.finditer(lemma,cleaned_text):
        positions.append(match.start())
      if lemma not in positional_index:

        positional_index[lemma]={doc_id:TermDetails(doc_id=doc_id, positions=positions, term_frequency=len(positions))}
      else:
        if doc_id in positional_index[lemma]:
          positional_index[lemma][doc_id].positions.extend(positions)
          positional_index[lemma][doc_id].term_frequency += len(positions)
        else:
          positional_index[lemma][doc_id] = TermDetails(doc_id=doc_id, positions=positions, term_frequency=len(positions))
    # print(filename)
    


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [None]:
# import json

# with open('/content/drive/MyDrive/cleaned.json','w') as f:
#   json.dump(positional_index,f,default=lambda o:o.__dict__)


In [17]:
## FUNCTION TO FIND THE DOCUMENTS FROM THE DATA STRUCTURE ####
def phrase_query(phrase):
  starting_dict_keys=set(positional_index[phrase[0]].keys())
  # print(starting_dict_keys)
  # common_keys=copy.deepcopy()
  for i in range(1,len(phrase)):
    starting_dict_keys=set(positional_index[phrase[i]].keys()).intersection(set(starting_dict_keys))
    # print(starting_dict_keys)
    removed_keys=set()
    for key in starting_dict_keys:
      
      for k in positional_index[phrase[i]][key].positions:
        # print(k)
        for j in positional_index[phrase[i-1]][key].positions:
          # print(j,k)
          if(k==j+len(phrase[i-1])+1):
            # print(j)
            k+=1
            break
        else:
          continue
        break
      else:
        removed_keys.add(key)
    for el in removed_keys:
      starting_dict_keys.remove(el)
  print("number of documents retrieved: ",len(starting_dict_keys))
  print("list of documents retrieved:")    
  print(starting_dict_keys)
            
        



In [18]:
# Function to search the string ####
def searchDocs(string):
  raw_text = denoise_text(string)

  raw_text = replace_contractions(raw_text)


  words = nltk.word_tokenize(raw_text)

  words = normalize(words)
  # print(words)
  stems, lemmas = stem_and_lemmatize(words)
  # print(lemmas)
  phrase_query(lemmas)

In [21]:
##EXAMPLE###
string = input("Input the string you want to search: ")
 
searchDocs(string)

Input the string you want to search: came out
number of documents retrieved:  630
list of documents retrieved:
{'coffee.txt', 'miamadvi.hum', 'drive.txt', 'prac4.jok', 'a_tv_t-p.com', 'legal.hum', 'deadlysins.txt', 'mr.rogers', 'jason.fun', 'zen.txt', 'planeget.hum', 'bnb_quot.txt', 'soleleer.hum', 'inquirer.txt', 'byfb.txt', 'trekwes.hum', 'polly.txt', 'various.txt', 'nukeplay.hum', 'normquot.txt', 'mead.rcp', 'hackingcracking.txt', 'variety3.asc', 'hecomes.jok', 'sungenu.hum', 'variety1.asc', 'petshop', 'resolutn.txt', 'tfpoems.hum', 'mash.hum', 'jc-elvis.inf', 'number_k.ill', 'nigel.10', 'rinaldos.txt', 'gd_flybd.txt', 'fajitas.rcp', 'gack!.txt', 'japantv.txt', 'quantum.jok', 'stuf10.txt', 'dieter.txt', 'insult', 'nigel.4', 'aeonint.txt', 'hacktest.txt', 'bbc_vide.cat', 'terms.hum', 'golnar.txt', 'wkrp.epi', 'lobquad.hum', 'butwrong.hum', 'films_gl.txt', 'mindvox', 'passenge.sim', 'readme.bat', 'cybrtrsh.txt', 'htswfren.txt', 'melodram.hum', 'princess.brd', 'humpty.dumpty', 'quotes.