<a href="https://colab.research.google.com/github/sreedeepack/Q-Exchange/blob/main/IR%20workbench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

In [9]:
%pip install jsonlines

Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0


In [15]:
import pandas as pd
import nltk
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from math import log10
import jsonlines

## For Solr

In [10]:
import jsonlines
from datetime import datetime

def solr_clean(file):
    """
    Lazily yield Solr-ready documents parsed from a JSON-lines file.

    Every yielded dict carries the keys url, src, title, desc, tags,
    answers, votes and date (in that order). When a record lacks any of
    desc / tags / answers, the fallback is an empty desc, a single
    "reddit" tag and the record's comments as answers. A date of 'NA' is
    replaced by the current timestamp in ISO format.
    """
    with jsonlines.open(file) as reader:
        for record in reader:
            doc = {
                'url': record['url'],
                'src': record['src'],
                'title': clean_str(record['title']),
            }

            try:
                doc['desc'] = clean_str(record['desc'])
                doc['tags'] = record['tags']
                doc['answers'] = record['answers']
            except KeyError:
                # Any missing key switches the whole group to the fallback
                # values, overwriting whatever the try block already stored.
                doc.update(desc="", tags=["reddit"], answers=record['comments'])

            doc['votes'] = record['votes']

            raw_date = record['date']
            doc['date'] = datetime.today().isoformat() if raw_date == 'NA' else raw_date

            yield doc


In [9]:
import json

# Convert both raw dumps into a single Solr-ready JSON-lines file.
# NOTE: mode "a" appends, so re-running this cell writes the records again;
# delete solr.jsonl first when a fresh export is wanted.
SOURCE_FILES = ("data/stack.jl", "data/reddit.jl")

# The context manager closes the file even if cleaning a record raises.
with open("solr.jsonl", "a") as a_file:
    for source_file in SOURCE_FILES:
        for json_obj in solr_clean(source_file):
            json.dump(json_obj, a_file)
            a_file.write("\n")


# Indexing

In [133]:
from functools import reduce

import pandas as pd
import nltk
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from math import log10
import jsonlines


class Preprocessor(object):
    """
    Cleans, removes stopwords and tokenizes lines.

    Built on NLTK's English stopword list, word tokenizer and Porter
    stemmer; the required NLTK resources are downloaded on construction.
    """

    # Words shorter than this are dropped by words_cleanup.
    MIN_WORD_LENGTH = 3

    def __init__(self):
        # Stopwords
        nltk.download('stopwords', quiet=True, raise_on_error=True)
        # Sentence Tokenizer
        nltk.download('punkt', quiet=True, raise_on_error=True)

        stop_words = nltk.corpus.stopwords.words('english')

        # Both collections are only used for membership tests, so keep them
        # as sets: words_cleanup probes them once per word of every document.
        self._stop_words = set(stop_words)
        # The stopword list run through the tokenizer, so any pieces the
        # tokenizer splits a stopword into are filtered as well.
        self._tokenized_stop_words = set(nltk.word_tokenize(' '.join(stop_words)))

        # Porter stemmer
        self.stemmer = nltk.stem.PorterStemmer()

    def stem_word(self, word):
        """Return the stem of a single word."""
        return self.stemmer.stem(word)

    def tokenize_string(self, line):
        """
        Tokenize a line, stem every token and return the list of stems
        that are purely alphanumeric.
        """
        stems = (self.stem_word(token) for token in nltk.word_tokenize(line))
        return [stem for stem in stems if stem.isalnum()]

    def word_split(self, text):
        """
        Split a text into words. Returns a list of (location, word) tuples,
        where location is the index in text of the word's first character.
        A word is a maximal run of alphanumeric characters.
        """
        word_list = []
        current = []
        start = None

        for i, c in enumerate(text):
            if c.isalnum():
                if not current:
                    # First character of a new word: remember where it began.
                    start = i
                current.append(c)
            elif current:
                word_list.append((start, u''.join(current)))
                current = []

        # Flush a word that runs up to the end of the text.
        if current:
            word_list.append((start, u''.join(current)))

        return word_list

    def words_cleanup(self, words):
        """
        Drop stopwords and words shorter than MIN_WORD_LENGTH, then stem
        what is left.

        Takes and returns a list of (location, word) tuples; locations are
        passed through unchanged.
        """
        cleaned_words = []
        for index, word in words:
            if (
                len(word) < self.MIN_WORD_LENGTH
                or word in self._stop_words
                or word in self._tokenized_stop_words
            ):
                continue
            cleaned_words.append((index, self.stem_word(word)))
        return cleaned_words

    def word_index(self, text):
        """
        Just a helper method to process a text.
        It calls word_split and words_cleanup and returns the resulting
        list of (location, stemmed word) tuples.
        """
        return self.words_cleanup(self.word_split(text))

class Indexer(object):
    """
    Builds and queries positional inverted indexes.

    The preprocessor passed in must provide word_index(text), returning a
    list of (location, word) tuples.
    """

    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def inverted_index(self, text):
        """
        Create an Inverted-Index of the specified text document.
            {word:[locations]}
        """
        inverted = {}

        for index, word in self.preprocessor.word_index(text):
            inverted.setdefault(word, []).append(index)

        return inverted

    @staticmethod
    def inverted_index_add(inverted, doc_id, doc_index):
        """
        Add Inverted-Index doc_index of the document doc_id to the
        Multi-Document Inverted-Index (inverted),
        using doc_id as document identifier.
            {word:{doc_id:[locations]}}

        The index is updated in place and also returned. An existing entry
        for the same word and doc_id is replaced.
        """
        for word, locations in doc_index.items():
            inverted.setdefault(word, {})[doc_id] = locations
        return inverted

    def search(self, inverted, query):
        """
        Returns the set of document ids that contain every indexed word of
        the query.

        Query words that do not occur in the index are ignored rather than
        forcing an empty result. When no query word is indexed, an empty
        set is returned, so the return type is always a set.
        """
        words = [word for _, word in self.preprocessor.word_index(query) if word in inverted]
        if not words:
            return set()
        return set.intersection(*(set(inverted[word]) for word in words))


In [134]:
import jsonlines

def clean_str(text):
    """
    Normalise a piece of text for indexing.

    Steps, in order: drop non-ASCII characters, remove HTML entities
    (e.g. &amp; or &#39;), remove the punctuation characters
    ] | [ @ , $ % * & \\ ( ) " :, turn hyphens into spaces, remove dots,
    strip leading whitespace and lower-case the result.
    """
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Match real entities only; a bare "&.*?;" would delete everything
    # between any ampersand and the next semicolon on the line.
    text = re.sub(r"&#?\w+;", "", text)
    text = re.sub(r'[\]|\[@,$%*&\\()":]', "", text)
    text = re.sub(r"-", " ", text)
    text = re.sub(r"\.+", "", text)
    text = re.sub(r"^\s+", "", text)
    return text.lower()

def document_generator(file):
    """
    Yield one dict per record of a JSON-lines file, ready for indexing.

    Each dict has doc_id (the zero-based position of the record in the
    file), url, and the cleaned title and desc of the record.
    Raises KeyError if a record lacks url, title or desc.
    """
    with jsonlines.open(file) as reader:
        for doc_id, obj in enumerate(reader):
            yield {
                'doc_id': doc_id,
                'url': obj['url'],
                'title': clean_str(obj['title']),
                'desc': clean_str(obj['desc']),
            }


In [None]:
# Build the multi-document inverted index {word: {doc_id: [locations]}}
# and a doc_id -> (url, text) lookup used to render search results.
inverted = {}
doc_map = {}

p = Preprocessor()
indexer = Indexer(p)

for doc in document_generator("solr.jsonl"):
    doc_id = doc['doc_id']
    text = doc['title'] + " " + doc['desc']

    # Keep the full indexed text (not just the title) so the character
    # offsets stored in the index can be sliced out of it as snippets.
    doc_map[doc_id] = (doc['url'], text)

    doc_index = indexer.inverted_index(text)
    indexer.inverted_index_add(inverted, doc_id, doc_index)

# Print a sample of the Inverted-Index (first 16 entries)
for i, (word, doc_locations) in enumerate(inverted.items()):
    if i > 15:
        break
    print(word, doc_locations)

In [136]:
# Search something and print results
queries = ['dolby', 'vim emacs', 'github week']
SNIPPET_LENGTH = 30  # characters of document text shown per hit


def extract_text(doc, position):
    """Return a one-line snippet of the stored text of doc, starting at position."""
    return doc_map[doc][1][position:position + SNIPPET_LENGTH].replace("\n", ' ')


for query in queries:

    tokenized_query = ' '.join(p.tokenize_string(query))
    result_docs = indexer.search(inverted, tokenized_query)
    print(f"Search for '{query}': doc_ids={result_docs}")

    # word_index is a method of the Preprocessor instance, not a global.
    for _, word in p.word_index(tokenized_query):
        if word not in inverted:
            # search() ignores query words missing from the index, so there
            # is nothing to show for them either.
            continue

        for doc_id in result_docs:
            for position in inverted[word][doc_id]:
                print(
                    f"\t - {extract_text(doc_id, position)}..."
                    f"\n\t -->{doc_map[doc_id][0]}"
                )

Search for 'dolby': doc_ids={11074}
	 - dolby digital expires at midni...
	 -->/r/programming/comments/60b7kv/the_last_patent_on_ac3_dolby_digital_expires_at/
Search for 'vim emacs': doc_ids={3942, 2058, 2672, 3315, 3732, 4221, 1431, 2587, 2877}
	 - vim running inside gnome termi...
	 -->https://askubuntu.com/questions/48299/what-ides-are-available-for-ubuntu
	 - vim splits or extra tabs you l...
	 -->https://askubuntu.com/questions/48299/what-ides-are-available-for-ubuntu
	 - vim emacs nano gedit kate to n...
	 -->https://askubuntu.com/questions/68918/how-do-i-restrict-my-kids-computing-time
	 - vim or emacs to write c code  ...
	 -->https://askubuntu.com/questions/61408/what-is-a-command-to-compile-and-run-c-programs
	 - vim is amazing! vim is a highl...
	 -->https://askubuntu.com/questions/10998/what-developer-text-editors-are-available-for-ubuntu
	 - vim is a highly configurable t...
	 -->https://askubuntu.com/questions/10998/what-developer-text-editors-are-available-for-ubuntu
	 -