In [1]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import wordninja 

####### After importing nltk, run the following only once ######
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')
### pip install wordninja ###

In [2]:
def remove_htmlcodes (document):
    
    '''Removes HTML entity codes such as &amp from document and returns the clean document'''
    
    replacement = {
                    "&ampnbsp": ' ',
                    "&ampamp": '&',
                    "&ampquot": '\'',
                    "&ampldquo": '\"',
                    "&amprdquo": '\"',
                    "&amplsquo": '\'',
                    "&amprsquo": '\'',
                    "&amphellip": '...',
                    "&ampndash": '-',
                    "&ampmdash": '-'
                  }
    
    for str in replacement:
        document = document.replace(str, replacement[str])
        
    return document

In [3]:
def get_wordnet_pos (word):
    
    '''Returns the tag of usage of word depending on context'''
    
    tag=nltk.pos_tag([word])[0][1][0].upper()
    tag_dict={"J": wordnet.ADJ, 
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}
    
    return tag_dict.get(tag,wordnet.NOUN)

def lemma_stop (str):
    
    '''Returns the lemmatized document after tokenization and stop word removal'''
    
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer('\w+|\$]\d\[+|\S+,-')
    tokenized = tokenizer.tokenize(str)
    lemmatized = [lemmatizer.lemmatize(w,get_wordnet_pos(w)) for w in tokenized]
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in lemmatized if w.lower() not in stop_words]
    after_lemma_stop = ' '.join(w for w in filtered_sentence)
    
    return filtered_sentence

In [4]:
def is_not_credible (text):
    
    '''Returns true if text has no special characters, else returns false'''
    
    match = re.search(r'[!@#?&{}()]', text)
    
    if match:
        return TrueF
    else:
        return False

In [5]:
def scrub_words (text):
    
    '''Removes special characters from text and returns a clean string'''
    
    text = re.sub('[!@#?&{}()]', '', text)
    text=re.sub(r'[^\x00-\x7F]'," ",text)
    return text

In [6]:
def clean_document (document_string):
    
    '''Cleans document_string by splitting very long strings and identifying garbage JSON and HTML and discarding'''
    
    cleaned_doc = document_string
    for word in document_string.split():
                if is_not_credible(word):
                    temp= scrub_words(word)
                    split=wordninja.split(temp)
                    if len(split)>7:
                          cleaned_doc = cleaned_doc.replace(word,'')
                    else:
                        replace_with=' '.join(word for word in split)
                        cleaned_doc = cleaned_doc.replace(word, replace_with)
    return cleaned_doc

In [7]:
from datetime import datetime

count_dates = []

def replace_dates(documentString, docID):
    
    '''Replaces dates of the format MM/DD and MM/DD/YYYY with DDmmmYYYY inside documentString'''
    
    regEx = '(([0-9]+(/)[0-9]+(/)[0-9]+)|([0-9]+(/)[0-9]+))'
    iterator = re.finditer(regEx, documentString)
    listOfDates = [(m.start(0), m.end(0)) for m in iterator]
    tmp = []
    replace_with = []
    for indices in listOfDates:
        date = documentString[indices[0]:indices[1]]
        tmp.append(date)
        count = date.count('/')
        newDate = ''
        if count == 2:
            check_year = date[-3]
            
            if check_year == '/':
                YY = date[-2:]
                
                if int(YY) <= 19:
                    proper_date = date[:-2] + '20' + YY
                    date = date.replace(date,proper_date)
                else:
                    proper_date = date[:-2] + '19' + YY
                    date = date.replace(YY,('19'+YY))
                    
            try:
                newDate = datetime.strptime(date, '%m/%d/%Y').strftime('%d %b %Y')
            except ValueError as ve:
                newDate = date
        else:
            try:
                newDate = datetime.strptime(date, '%m/%d').strftime('%d %b')
            except ValueError as ve:
                newDate = date
                
        count_dates.append([docID, date])
        newDate = newDate.replace(' ', '')
        replace_with.append(newDate)
        
    for i in range(len(tmp)):
        documentString = documentString.replace(tmp[i], replace_with[i])
    
    return documentString

In [8]:
# Reading persistent files

import pickle
import trie

get_docID = {}
get_index = {}

data = np.load("datan.npy", allow_pickle = True)

print(len(data))

for i in range(0, len(data)) :
    get_docID[i] = data[i][0]
    get_index[data[i][0]] = i
collection = None
documentRoot = {}
max_tf = {}

with open('collection.pickle', 'rb') as handle:
    collection = pickle.load(handle)
with open('documentRoot.pickle', 'rb') as handle:
    documentRoot = pickle.load(handle)
with open('max_tf.pickle', 'rb') as handle:
    max_tf = pickle.load(handle)

148


In [9]:
# Processing query

import unidecode

query = "laptop india"
final_query = replace_dates(query, -1)
final_query = lemma_stop(final_query)

for i in range(len(final_query)):
    final_query[i] = unidecode.unidecode(final_query[i])
    # case-folding
    final_query[i] = final_query[i].lower()
print(final_query)

tf_query = {}
for w in final_query:
    if w not in tf_query:
        tf_query[w] = 1
    else:
        tf_query[w] += 1

['laptop', 'india']


***Ranked Retrieval based on TF-IDF Score :***


In [10]:
import queue

# scores[i] stores the dot product of the tf-idf score vectors of the query and document of docID i in the corpus
scores = {}
title_score = {}

# N is the total number of documents in the corpus
N = len(documentRoot)

# wordsInDoc[i] is a sorted list of (word, score) tuples where
# score is the tf-idf score for the (word, <ith doc>) pair
wordsInDoc = {}

factor = {}

import math
import bisect

for query_term in tf_query:
    
    docs_having_query_term = collection.get_doc_list(query_term, 0)
    df = len(docs_having_query_term)
    idf = 0
    
    print('-------------------------------------')
    print('Term in query = ', query_term)
    print()
    
    if df == 0:
        idf = 0
    else:
        idf = math.log10(N/df)
        
    docs_having_query_term_in_title = collection.get_title_list(query_term,0)
    
    for docID in docs_having_query_term_in_title:
        if docID in title_score:
            title_score[docID] += idf
        else:
            title_score[docID] = idf
        
    print('df = ',df)
    print('idf = ',idf)
    
    tfidf_query = tf_query[query_term] * idf
        
    for docID in docs_having_query_term:
        
        tf_doc = documentRoot[docID].count_words(query_term, 0)
        tf_doc = 0.5 + 0.5*tf_doc/max_tf[docID]
        tfidf_doc = (tf_doc)
        
        if docID not in scores:
            scores[docID] = (tfidf_query * tfidf_doc)
            wordsInDoc[docID] = []
            bisect.insort(wordsInDoc[docID], [-tfidf_query * tfidf_doc, query_term])
            factor[docID] = idf
        else:
            scores[docID] += (tfidf_query * tfidf_doc)
            bisect.insort(wordsInDoc[docID], [-tfidf_query * tfidf_doc, query_term])
            factor[docID] += idf
            
# print(title_score)

for docID in scores:
    
    #if documentLength[docID] != 0:
    scores[docID] *= factor[docID]
    if docID in title_score:
        scores[docID] *= 1 + title_score[docID]

sorted_scores = sorted(scores.items(), key = lambda kv : kv[1] , reverse = True)

maxshow = min(10, len(scores))
print('\n\n')
print('============================================')

for i in range(maxshow):
    
    print()
    docID = sorted_scores[i][0]
    print('doc ID = ', docID)
    cnt = 0
    print('Keywords:')
    print()
    print(data[get_index[sorted_scores[i][0]]][2])
    print()
    if sorted_scores[i][0] not in title_score:
        print('title score = ',0)
    else:
        print('title score = ',title_score[sorted_scores[i][0]])
    for j in range(len(wordsInDoc[docID])):
        print(wordsInDoc[docID][j][1], wordsInDoc[docID][j][0], end = ' ')
        print(documentRoot[docID].count_words(wordsInDoc[docID][j][1], 0))
    print()
    print()
    count = 0
    found = 0
    words_before=queue.Queue()
    at_start = 1
    display = ""
    
    for word in data[get_index[docID]][4].split():
            
        check_with=replace_dates(word, -1)
        check_with = check_with.lower()
        if len(lemma_stop(check_with)) > 0:
            check_with=lemma_stop(check_with)[0]
        else:
            check_with=word
        
        if check_with == wordsInDoc[docID][0][1]:
            found=1
            
        if found == 1:
            display = display + word + " "
            count += 1
            if count == 50:
                break
        if found == 0:
            words_before.put(word)
            if words_before.qsize()>20:
                remove=words_before.get()
                at_start=0
                
    if not at_start:
        print('...', end = ' ')
    while words_before.qsize() > 0:
        print(words_before.get(), end = ' ')
    print(display, end = ' ')
    print('...', end = ' ')
    print('\n')
    print('tf-idf score=', sorted_scores[i][1])
    print('\n')
    print('============================================')
#print(sorted_scores)
dates = []
from collections import Counter

for i in range(len(scores)):
    split_l = re.split(",|\s",data[get_index[sorted_scores[i][0]]][5])
    s = split_l[0] +" "+ split_l[1] + " " + split_l[3]
    #print(s)
    dates.append(s)
    #print(dates)
    
#dates.sort(key = lambda date: datetime.strptime(date, '%B %d %Y')) 
#print(dates)
count1 = 0
for keys in Counter(dates).most_common():
    print(keys[0],keys[1])
    count += 1
    if count == 10:
        break

-------------------------------------
Term in query =  laptop

df =  3
idf =  1.693140460675295
-------------------------------------
Term in query =  india

df =  96
idf =  0.18799048235538898




doc ID =  1605443331-390
Keywords:

Lenovo Yoga Slim 7i Carbon With 11th-Gen Intel Core Processors, QHD Display Launched

title score =  0
laptop -1.5520454222856872 10
india -0.10966111470731024 2


Lenovo Yoga Slim 7i Carbon has been launched as the latest model in the company's Yoga series. The new laptop comes with a lightweight, 966 grams, chassis made of an aero-grade carbon fibre material that is touted to meet MIL-STD-810G standards and is designed to withstand knocks and bumps. The Yoga Slim 7i Carbon also comes Intel's Evo badge that is meant for premium ultraportable laptops to highlight their  ... 

tf-idf score= 3.1258875849738894



doc ID =  1605443754-455
Keywords:

Huawei Said to Be in Talks to Sell Parts of Its Honor Smartphone Business

title score =  0
laptop -0.907039532

IndexError: index 5 is out of bounds for axis 0 with size 5