<a href="https://colab.research.google.com/github/shivammehta007/Information-Retrieval/blob/master/MLinRanking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Application of ML in Ranking

# Application of ML in Ranking of Search Engines

## Downloading Data From Kaggle
#### This might take a while

In [1]:
from google.colab import drive

# Attach Google Drive under /content/drive; force_remount=True asks for a
# fresh mount even when one is already present.
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Download Data
# Install the kaggle command-line client used by the (currently disabled)
# download commands at the bottom of this cell.
!pip install kaggle
import json
import os

# Credentials are read from a kaggle.json stored on the mounted Drive so the
# key itself never appears in the notebook.
with open('/content/drive/My Drive/kaggle.json') as kc:
    kaggle_config = json.load(kc)

# Expose the credentials as environment variables for the kaggle client
# (presumably the names the CLI looks up -- confirm against the Kaggle docs).
os.environ['KAGGLE_USERNAME'] = kaggle_config['username'] 
os.environ['KAGGLE_KEY'] = kaggle_config['key'] 

# NOTE(review): the download/unzip steps below are commented out, so on a
# fresh runtime /content/docs.tsv must already exist before the indexing
# cells run. Uncomment to fetch the competition data.
# !kaggle competitions download -c text-relevance-competition-ir-2-itmo-fall-2019
# !unzip docs.tsv.zip
# !rm docs.tsv.zip



## Code

In [3]:
!pip install -U tqdm
import os
import gzip
import nltk
import pickle
import logging
import itertools
import re
import time

from nltk.corpus import stopwords
nltk.download("stopwords")

from string import punctuation
from collections import defaultdict
from tqdm.notebook import tqdm

# Configure the root logger to show DEBUG and above, then emit a single
# record so the cell output confirms that logging is wired up.
logging.basicConfig(level=logging.DEBUG)
logging.log(logging.DEBUG, 'Test Logger')

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.41.0)


DEBUG:root:Test Logger


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Tab-separated corpus file; the indexer reads a document id, a subject and a
# text field from each line.
indexing_data_location = '/content/docs.tsv'
# Location of the queries file (its format is not shown in this part of the
# notebook).
queries = '/content/queries.numerate.txt'

### Building Inverted Index

#### Inverted Indexer Class

In [0]:
class InvertedIndexer:
    """
    Builds and holds an inverted index mapping a stemmed term to the set of
    integer document ids whose subject or text contains that term.

    Attributes:
        filename: path the index was built from, or a falsy value when the
            index starts empty.
        stemmer_ru: NLTK Snowball stemmer for Russian, used on index keys.
        stopwords: union of NLTK's Russian and English stop words.
        punctuation: ``string.punctuation`` (kept for callers that read it).
        inverted_index: ``defaultdict(set)`` of term -> document ids.
    """

    # Built once at class-definition time instead of on every preprocess() call.
    _PUNCT_TABLE = str.maketrans('', '', punctuation)
    _WHITESPACE = re.compile(r'\s+')
    # NOTE(review): '#', '+' and '_' are listed as allowed here, but they are
    # part of string.punctuation and are therefore already stripped before
    # this pattern is applied -- confirm whether tokens such as "c++" were
    # meant to survive.
    _NONTEXT = re.compile('[^0-9 a-z#+_а-яё]')

    def __init__(self, filename=False):
        """
        filename : path to a tab-separated corpus to index immediately;
                   leave falsy to start with an empty index (e.g. before
                   calling ``load``).
        """
        self.filename = filename
        self.stemmer_ru = nltk.SnowballStemmer("russian")
        self.stopwords = set(stopwords.words("russian")) | set(stopwords.words("english"))
        self.punctuation = punctuation  # from string import punctuation
        if filename:
            self.inverted_index = self._build_index(self.filename)
        else:
            self.inverted_index = defaultdict(set)

    def preprocess(self, sentence):
        """
        Lower-case ``sentence``, drop punctuation and any character outside
        digits / latin / cyrillic letters, and return the remaining
        whitespace-separated tokens that are not stop words.

        Tokens are NOT stemmed here: stemming every token of the corpus was
        too slow, so index keys are stemmed once in ``stem_keys`` and query
        tokens are stemmed by the caller.
        """
        sentence = sentence.lower().translate(self._PUNCT_TABLE)
        # Turn tabs, newlines and other whitespace into plain spaces first;
        # otherwise the character filter below deletes them and glues the
        # neighbouring words into a single token.
        sentence = self._WHITESPACE.sub(' ', sentence)
        sentence = self._NONTEXT.sub('', sentence)

        return [token for token in sentence.split() if token not in self.stopwords]

    def stem_keys(self, inverted_index):
        """
        Return a new index whose keys are the stemmed forms of the keys of
        ``inverted_index``; posting sets of keys that share a stem are merged.

        The input mapping is consumed: each of its values is replaced with
        ``None`` as soon as it has been merged (presumably to release memory
        early), so it must not be used afterwards.
        """
        logging.debug('Indexing Complete will now Stem keys and remap indexes')
        stemmed_index = defaultdict(set)
        for word in tqdm(inverted_index):
            stemmed_index[self.stemmer_ru.stem(word)].update(inverted_index[word])
            # Overwriting an existing key does not resize the dict, so this
            # is safe while iterating over it.
            inverted_index[word] = None

        logging.debug('Done Stemming Indexes')
        return stemmed_index

    def _build_index(self, indexing_data_location):
        """
        Read the tab-separated file at ``indexing_data_location`` (document
        id, subject, text per line) and return the stemmed inverted index.

        Lines with fewer than three fields or a non-integer document id are
        skipped and counted in a single warning instead of aborting the build.
        """
        inverted_index = defaultdict(set)
        skipped = 0
        with open(indexing_data_location, "rb") as f:
            for raw_line in tqdm(f):
                fields = raw_line.decode().split('\t')
                try:
                    # Convert the id once per line rather than once per token.
                    doc_id = int(fields[0])
                    subject, text = fields[1], fields[2]
                except (IndexError, ValueError):
                    skipped += 1
                    continue

                for word in self.preprocess(subject + ' ' + text):
                    inverted_index[word].add(doc_id)

        if skipped:
            logging.warning('Skipped %d malformed line(s) while indexing', skipped)

        # An intermediate pickle of the unstemmed index was tried at this
        # point and crashed Colab, so it is deliberately not written.

        return self.stem_keys(inverted_index)

    def save(self, filename_to_save):
        """
        Pickle the current inverted index to ``filename_to_save``.
        """
        with open(filename_to_save, mode='wb') as f:
            pickle.dump(self.inverted_index, f)

    def load(self, filelocation_to_load):
        """
        Replace the current inverted index with the one pickled at
        ``filelocation_to_load``.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load index
        # files that were produced by ``save`` from a trusted source.
        with open(filelocation_to_load, mode='rb') as f:
            self.inverted_index = pickle.load(f)

#### SolutionPredictor Class

In [0]:
class SolutionPredictor:
    """
    Boolean (AND) search on top of an InvertedIndexer-like object.

    The indexer must provide ``preprocess(query)``, ``stemmer_ru.stem(word)``
    and an ``inverted_index`` mapping of stemmed term -> set of document ids.
    """
    def __init__(self, indexer):
        """
        indexer : object of class InvertedIndexer (or any object exposing
                  the same three attributes)
        """
        self.indexer = indexer

    def find_docs(self, query, limit=100):
        """
        Return the documents that contain every term of ``query``.

        query : string with text of query
        limit : maximum number of documents to return (default 100);
                ``None`` returns all matches
        Returns a Python set of document ids. The set is empty when the
        query has no usable terms or when any term is absent from the index.
        Which documents are kept when there are more than ``limit`` matches
        is arbitrary (set iteration order).
        """
        tokens = self.indexer.preprocess(query)
        stems = [self.indexer.stemmer_ru.stem(word) for word in tokens]

        index = self.indexer.inverted_index
        # .get() instead of [] so that looking up an unknown term does not
        # insert an empty entry into a defaultdict-based index.
        posting_sets = [index.get(stem) for stem in stems]

        # AND semantics: no terms, or any term without documents, means no
        # match. (Testing "is the running result empty" to detect the first
        # term would let a later term overwrite an empty intersection.)
        if not posting_sets or not all(posting_sets):
            return set()

        # intersection() returns a new set, leaving the index untouched.
        matches = posting_sets[0].intersection(*posting_sets[1:])
        return set(itertools.islice(matches, limit))

#### Generate Index

In [0]:
# Build the inverted index from the corpus file and report how long it took.
logging.debug('Index is creating...')
start = time.time()
new_index = InvertedIndexer(indexing_data_location)
end = time.time()
# The index is only held in memory at this point -- nothing in this cell
# writes it to disk -- so the message reports the build time only.
logging.debug('Index has been created in {:.4f}s'.format(end - start))

DEBUG:root:Index is creating...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [0]:
# Number of distinct keys (stemmed terms) in the freshly built index.
len(new_index.inverted_index)

In [0]:
test_sentence = 'бандиты боятся ли ментов'