<a href="https://colab.research.google.com/github/shivammehta007/Information-Retrieval/blob/master/RankingIntermediateChanges.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Application of ML in Ranking

# Application of ML in Ranking of Search Engines

## Downloading Data From Kaggle
#### Might Take a while

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Download Data
!pip install kaggle
import json
import os
from pprint import pprint

with open('/content/drive/My Drive/kaggle.json') as kc:
    kaggle_config = json.load(kc)

os.environ['KAGGLE_USERNAME'] = kaggle_config['username'] 
os.environ['KAGGLE_KEY'] = kaggle_config['key'] 

!kaggle competitions download -c text-relevance-competition-ir-2-itmo-fall-2019
!unzip docs.tsv.zip
!rm docs.tsv.zip

Downloading docs.tsv.zip to /content
100% 1.17G/1.17G [00:31<00:00, 39.1MB/s]
100% 1.17G/1.17G [00:31<00:00, 39.7MB/s]
Downloading sample.submission.txt to /content
  0% 0.00/434k [00:00<?, ?B/s]
100% 434k/434k [00:00<00:00, 59.8MB/s]
Downloading queries.numerate.txt to /content
  0% 0.00/31.9k [00:00<?, ?B/s]
100% 31.9k/31.9k [00:00<00:00, 32.2MB/s]
Archive:  docs.tsv.zip
  inflating: docs.tsv                


## Code

In [11]:
!pip install -U tqdm
!pip install rank_bm25
!pip install keyw
import os
import gzip
import nltk
import random
import pickle
import pandas as pd
import logging
import keyw
import itertools
import re
import time

import numpy as np
import pandas as pd

from rank_bm25 import BM25Okapi

from nltk.corpus import stopwords
nltk.download("stopwords")

from string import punctuation
from collections import defaultdict
from collections import OrderedDict
from tqdm.notebook import tqdm

logging.basicConfig(level=logging.DEBUG)
logging.debug('Test Logger')

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.41.1)


DEBUG:root:Test Logger


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
indexing_data_location = '/content/docs.tsv'
queries_list = '/content/queries.numerate.txt'
db_directory_name = 'database'

### Convert Docs from memory load to IO operations

In [0]:
# database = pd.read_csv(indexing_data_location, sep='\t', header=None,
#                        names=['id', 'subject', 'content'],
#                        dtype = {'id': int,'subject': str,
#                             'content': str})

In [0]:
os.mkdir(db_directory_name)

In [7]:
with open(indexing_data_location, "rb") as f:
            for line in tqdm(f):
                line = line.decode().split('\t')
                file_number = line[0]
                subject = line[1]
                text = line[2]
                line = subject + ' ' + text
                with open(os.path.join(db_directory_name, file_number), 'w') as output:
                    output.write(line)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### Building Inverted Index

#### Inverted Indexer Class

In [0]:
class InvertedIndexer:
    """
    This class makes inverted index
    """

    def __init__(self, filename=False):
        self.filename = filename
        self.stemmer_ru = nltk.SnowballStemmer("russian")
        self.stopwords = set(stopwords.words("russian")) | set(stopwords.words("english"))
        self.punctuation = punctuation # from string import punctuation
        if filename:
            self.inverted_index = self._build_index(self.filename)
        else:
            self.inverted_index = defaultdict(set)

    def preprocess(self, sentence):
        """
        Method to remove stop words and punctuations return tokens
        """
        NONTEXT = re.compile('[^0-9 a-z#+_а-яё]')

        sentence = sentence.lower()
        sentence = sentence.translate(str.maketrans('', '', punctuation))
        sentence = re.sub(NONTEXT,'',sentence)

        # Heavy Operation Taking lot of time will move it outside
        # tokens = [self.stemmer_ru.stem(word) for word in sentence.split()]

        tokens = [token for token in sentence.split() if token not in self.stopwords]

        return tokens

    def stem_keys(self, inverted_index):
        """
        Called after index is built to stem all the keys and normalize them
        """
        logging.debug('Indexing Complete will not Stem keys and remap indexes')
        temp_dict = defaultdict(set)
        i = 0
        for word in tqdm(inverted_index):
            stemmed_key = keyw.engrus(word)
            stemmed_key = self.stemmer_ru.stem(stemmed_key)
            temp_dict[stemmed_key].update(inverted_index[word])
            inverted_index[word] = None

        inverted_index = temp_dict
        logging.debug('Done Stemmping Indexes')
        return inverted_index

    def _build_index(self, indexing_data_location):
        """
        This method builds the inverted index and returns the invrted index dictionary
        """
        inverted_index = defaultdict(set)
        with open(indexing_data_location, "rb") as f:
            for line in tqdm(f):
                line = line.decode().split('\t')
                file_number = line[0]
                subject = line[1]
                text = line[2]
                line = subject + ' ' + text

                for word in self.preprocess(line):
                        inverted_index[word].add(int(file_number))

        inverted_index = self.stem_keys(inverted_index)

        return inverted_index

    def save(self, filename_to_save):
        """
        Save method to save the inverted indexes
        """
        with open(filename_to_save, mode='wb') as f:
            pickle.dump(self.inverted_index, f)

    def load(self, filelocation_to_load):
        """
        Load method to load the inverted indexes
        """
        with open(filelocation_to_load, mode='rb') as f:
            self.inverted_index = pickle.load(f)

#### SolutionPredictor Class

In [0]:
class SolutionPredictor:
    """
    This classes uses object of Hw1SolutionIndexer
    to make boolean search
    """
    def __init__(self, indexer):
        """
        indexer : object of class Hw1SolutionIndexer
        """
        self.indexer = indexer


    def find_docs(self, query):
        """
        This method provides booleaen search
        query : string with text of query
        Returns Python set with documents which contain query words
        Should return maximum 100 docs
        """
        query = keyw.engrus(query)
        tokens = self.indexer.preprocess(query)
        tokens = [self.indexer.stemmer_ru.stem(word) for word in tokens]
        docs_list = set()
        for word in tokens:
            if len(docs_list) > 0:
                docs_list.intersection_update(self.indexer.inverted_index[word]) # changed from intersection_update
            else:
                docs_list.update(self.indexer.inverted_index[word])
        
        # Handling case when no set is returned by intersection
        # if len(docs_list) < 10:
        #     for word in tokens:
        #         docs_list.update(self.indexer.inverted_index[word]) # changed from intersection_update

        return docs_list

#### Generate Index 
Run Only when Index is not generated Otherwise use the Loading code block

In [12]:
logging.debug('Index is creating...')
start = time.time()
new_index = InvertedIndexer(indexing_data_location) 
end = time.time()
logging.debug('Index has been created and saved as inverted_index.pickle in {:.4f}s'.format(end-start))

In [13]:
len(new_index.inverted_index)

2869381

In [0]:
new_index.save('inverted_index_all_rus.pickle')

In [0]:
!cp inverted_index.pickle '/content/drive/My Drive/Homeworks/InformationRetrieval/inverted_index_temp.pickle'

#### Load Index
Run when Index is not generated

In [0]:
index_location = '/content/drive/My Drive/Homeworks/InformationRetrieval/inverted_index_4.pickle'

In [13]:
logging.debug('Loading Index...')
start = time.time()
new_index = InvertedIndexer()
new_index.load(index_location)
end = time.time()
logging.debug('Index has been loaded from inverted_index.pickle in {:.4f}s'.format(end-start))

DEBUG:root:Loading Index...
DEBUG:root:Index has been loaded from inverted_index.pickle in 75.6462s


In [14]:
len(new_index.inverted_index)

2858862

##### Testing Block

In [0]:
test_sentence = 'бандиты боятся ли ментов'

In [0]:
boolean_model = SolutionPredictor(new_index)

In [0]:
start = time.time()
print(boolean_model.find_docs(test_sentence))
logging.debug('Search Time: {}'.format(time.time() - start))

DEBUG:root:Search Time: 0.0032525062561035156


{59393, 122885, 92166, 370697, 262166, 477206, 575512, 110616, 145432, 432154, 432166, 247847, 106540, 557108, 421941, 12351, 522305, 190534, 221260, 198735, 284753, 579665, 364632, 505945, 364635, 178274, 313443, 313451, 419951, 489583, 75893, 22646, 57466, 489595, 180351, 106626, 108677, 88199, 221320, 112780, 489618, 489620, 485525, 489622, 221335, 221336, 383136, 364705, 362658, 362662, 204967, 489639, 112810, 245933, 20656, 489651, 577718, 403642, 383163, 61629, 577727, 14529, 403650, 579780, 563400, 577738, 413900, 14540, 577743, 520399, 135378, 379091, 362708, 340183, 92376, 383196, 442590, 221414, 489710, 299250, 155891, 510199, 311545, 311553, 430350, 495888, 92435, 430358, 364825, 416028, 174377, 174380, 10544, 284977, 284978, 452915, 414004, 414006, 311610, 284990, 284993, 381257, 414037, 383317, 108887, 293207, 389467, 98656, 414049, 414054, 346471, 164202, 383339, 328049, 358783, 489860, 108939, 383372, 213389, 311694, 98709, 72093, 489886, 311712, 266657, 184738, 489897, 

In [0]:
test_docs = boolean_model.find_docs(test_sentence)

In [0]:
test_dict = {}

for doc in test_docs:
    with open(os.path.join(db_directory_name, str(doc))) as f:
        test_dict[doc] = f.readlines()[0]

### Loading File to DataFrame
We are doing this for faster search based on index numbers

In [0]:
# %%time
# database = pd.read_csv(indexing_data_location, sep='\t', header=None,
#                        names=['id', 'subject', 'content'],
#                        dtype = {'id': int,'subject': str,
#                             'content': str})

CPU times: user 55.1 s, sys: 8.91 s, total: 1min 3s
Wall time: 1min 52s


In [0]:
# database.head()

Unnamed: 0,id,subject,content
0,24,БОЛЕЕТ ЗА ФУТБОЛ И ТРАХАЕТ ДОЧЬ СОФТ ОРГАНАЙЗЕ...,ГЛАВНАЯ НОВОСТИ КАРТА СОФТ ОРГАНАЙЗЕР СКАЧАТЬ ...
1,26,МОНГОЛЬСКАЯ ТХАНКА СОФТФОН,ГЛАВНАЯ НОВОСТИ КАРТА СОФТФОН GT МОНГОЛЬСКАЯ Т...
2,30,СМОТРЕТЬ ФИЛЬМЫ ОНЛАЙН БЕСПЛАТНО СМОТРЕТЬ КОНЦ...,ЗАГРУЗКА ПОЖАЛУЙСТА ПОДОЖДИТЕ ВНИМАНИЕ ОБНАРУЖ...
3,35,КАК ПРИГОТОВИТЬ ТОРТ РАФАЭЛЛО В ДОМАШНИХ УСЛОВ...,ՀԱՅԵՐԵՆ РУССКИЙ ПОЛИТИКА ВНУТРЕННИЙ ВНЕШНИЙ ЭК...
4,39,МИФЫ О ГВ BREASTFEEDING AM,TOGGLE NAVIGATION BREASTFEEDING AM ГОРЯЧАЯ ЛИН...


### Implementing BM25 (When all data in memory)

In [0]:
# # boolean_model = SolutionPredictor(new_index)
# output=open('submission.csv', 'w')
# with open(queries_list) as queries:
#     output.write('QueryId,DocumentId\n')
#     for line in tqdm(queries.readlines()):
#         line = line.split('\t')
#         id = int(line[0])
#         query = line[1]
#         value_present = boolean_model.find_docs(query)
#         data = database[database['id'].isin(list(value_present))].copy()
#         data.loc[:, 'content'] = data[['subject', 'content']].astype(str).apply(' '.join, axis=1)
#         del data['subject']
#         data = OrderedDict(data.set_index('id')['content'].to_dict())
#         for key in data:
#             data[key] = new_index.preprocess(data[key])
#         if data:
#             bm25 = BM25Okapi(data.values())
#             rankings =  sorted(list(zip(data.keys(), bm25.get_scores(new_index.preprocess(query)))), key=lambda x: x[1], reverse=True)[:10]
#             for rank in rankings:
#                 output.write('{},{}\n'.format(id, rank[0]))
        
#         if (not data) or len(rankings) < 10:
#             if not data:
#                 rankings = []
            
#             logging.debug('Found Some values with no query results or less than 10 {} ranking'.format(id, len(rankings)))
#             print(id, query)
#             for i in range(10 - len(rankings)):
#                 output.write('{},{}\n'.format(id, random.randrange(50000)))
# output.close()

HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))

In [0]:
# !cp submission.csv '/content/drive/My Drive/Homeworks/InformationRetrieval/submission.csv'

### Implementing BM25 when all data on disk

In [0]:
corpus = {
    1: "Hello there good man!",
    2: "It is quite windy in London",
    3: "How is the weather today?"
}
tokenized_corpus = [doc.split(" ") for doc in corpus.values()]
print(tokenized_corpus)
bm25 = BM25Okapi(tokenized_corpus)
query = "windy London"
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)
print(sorted(list(zip(corpus.keys(),doc_scores)), key=lambda x: x[1], reverse=True))

[['Hello', 'there', 'good', 'man!'], ['It', 'is', 'quite', 'windy', 'in', 'London'], ['How', 'is', 'the', 'weather', 'today?']]
[(2, 0.9372947225064051), (1, 0.0), (3, 0.0)]


In [0]:
output=open('submission.csv', 'w')
with open(queries_list) as queries:
    output.write('QueryId,DocumentId\n')
    for line in tqdm(queries.readlines()):
        line = line.split('\t')
        id = int(line[0])
        query = line[1]
        value_present = boolean_model.find_docs(query)
        logging.debug('Found Value {}'.format(len(value_present)))
        
        data = OrderedDict()

        for doc in value_present:
            with open(os.path.join(db_directory_name, str(doc))) as f:
                data[doc] = new_index.preprocess(f.readlines()[0])
        logging.debug('Loaded Data')
        if data:
            bm25 = BM25Okapi(data.values())
            start = time.time()
            rankings =  sorted(list(zip(data.keys(), bm25.get_scores(new_index.preprocess(query)))), key=lambda x: x[1], reverse=True)[:100]
            for rank in rankings:
                output.write('{},{}\n'.format(id, rank[0]))
            logging.debug('Calculated ranking in {}'.format(time.time()- start))
        
        if (not data) or len(rankings) < 10:
            if not data:
                rankings = []
            
            logging.debug('Found Some values with no query results or less than 10 {} ranking'.format(id, len(rankings)))
            print(id, query)
            for i in range(10 - len(rankings)):
                output.write('{},{}\n'.format(id, random.randrange(50000)))
output.close()

HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))

DEBUG:root:Found Value 4739
DEBUG:root:Loaded Data


TypeError: ignored

In [17]:
data = OrderedDict()
with open(queries_list) as queries:
    for line in tqdm(queries.readlines()):
        line = line.split('\t')
        id = int(line[0])
        query = line[1]
        value_present = boolean_model.find_docs(query)
        data[id] = value_present


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [1]:
i = 0
for k in data:
    print(k, data[k])

    if i == 5:
        break
    i += 1

NameError: ignored

In [20]:
for key in data:
    if len(data[key]) < 10:
        print(key, data[key])

62 set()
111 set()
156 {496233, 210730, 75893}
352 {209731, 14823, 264205, 562898, 90583, 146172, 218014}
442 {33529}
604 {210663}
713 {360690, 127783, 8518, 274151}
924 {452171, 563515}
943 set()
947 {88609, 539908}
1021 set()
1044 {403425, 534403, 162692, 162695, 472585, 440084, 489110, 162742}
1098 {423421}
1099 {347648, 179299, 6278, 430762, 143436}
1272 {234208, 330820, 108110, 581043, 215604, 13560}
1443 set()
1583 {359991, 75893, 259174, 373647}
1616 set()
1646 {573692}
1688 set()
1755 set()
1826 {576075, 136860, 290117}
1991 set()
2226 {256800, 512517, 218366, 466929, 164923, 474942}
2415 {329123, 438313, 289611, 536302, 445470}
2422 {172867, 302524, 570037}
2425 set()
2443 {553540, 553558, 536921, 558458, 258685}
2600 set()
2681 {362760, 4202, 316603, 55118, 209044, 61624, 191195, 303288}
2806 set()
2851 set()
2858 {114644, 17061, 436862, 347991}
2906 {48920, 508507}
2983 {222865, 548819}
3118 {152097, 557508, 12941, 18070, 448735}
3164 {381173}
3263 set()
3304 set()
3327 set(