In [None]:
import pandas as pd
import numpy as np
import re
import string
import time
import math
import pickle
import sys



def dotProduct(D1, D2):
    """Return the dot product of two sparse document vectors.

    Both arguments are mappings from term to numeric weight; only terms
    present in both mappings contribute to the result. The result is
    always a float (``0.0`` when the vectors share no term).
    """
    total = 0.0
    for term, weight in D1.items():
        if term in D2:
            total += weight * D2[term]
    return total

def vector_angle(D1, D2):
    """Return the angle in radians between two sparse document vectors.

    D1 and D2 are term -> weight mappings as accepted by ``dotProduct``.
    The result lies in ``[0, pi]``.

    A vector with zero magnitude (for example an empty mapping) has no
    direction; in that case the vectors are reported as orthogonal
    (``pi / 2``) instead of raising ``ZeroDivisionError``.
    """
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1) * dotProduct(D2, D2))
    if denominator == 0:
        return math.pi / 2
    # Floating-point rounding can push the cosine marginally outside
    # [-1, 1], which would make math.acos raise "math domain error".
    cosine = max(-1.0, min(1.0, numerator / denominator))
    return math.acos(cosine)


def dictionary(query):
    """Turn raw text into the list of processed index terms.

    Pipeline: tokenise -> drop stop words -> stem -> drop stop words again
    (stemming can produce a new stop word or a token that is too short).
    """
    raw_tokens = tokenise(query)
    stems = stemmed_tokens(rem_stop_words(raw_tokens))
    return rem_stop_words(stems)

def make_vector(query):
    """Build a term-frequency vector for a piece of text.

    The text is run through ``dictionary`` (tokenise, stop-word removal,
    stemming) and every resulting term is counted. Keys are the
    lower-cased processed terms, values are the number of times that term
    occurs among the processed tokens.

    The count is taken over the processed tokens rather than by searching
    the raw text for the stem: a substring search reports 0 for words whose
    stem is not a literal prefix of the original word (``happy`` ->
    ``happi``) and over-counts stems embedded in other words.
    """
    new_vector = {}
    for token in dictionary(query):
        key = token.lower()
        new_vector[key] = new_vector.get(key, 0) + 1
    return new_vector

def log_term_frequency(frequency):
    """Return the log-scaled term frequency ``1 + ln(frequency)``.

    Any frequency that is not strictly positive yields 0.
    """
    if not frequency > 0:
        return 0
    return 1 + math.log(frequency)

def inverse_document(df,N):
    """Return the inverse document frequency ``ln(N / df)``.

    df is the number of documents containing the term and N the total
    number of documents.
    """
    ratio = N / df
    return math.log(ratio)

def weight_tf_idf(tf,df,N):
    """Return the tf-idf weight: log-scaled tf multiplied by idf.

    tf is the raw term frequency, df the document frequency of the term
    and N the total number of documents.
    """
    tf_component = log_term_frequency(tf)
    idf_component = inverse_document(df, N)
    return tf_component * idf_component

def score_sort(score_card,top_10_results):
    """Insert a score card into a descending ranking and keep the top ten.

    score_card is a ``[row_number, score]`` pair and top_10_results a list
    of such pairs ordered by descending score. The card is inserted in
    place into top_10_results; the returned value is a new list holding at
    most the first ten entries. A card tying with the current best score
    goes in front of it.
    """
    new_score = score_card[1]
    if not top_10_results:
        top_10_results.append(score_card)
    elif top_10_results[0][1] <= new_score:
        # at least as good as the current leader
        top_10_results.insert(0, score_card)
    elif top_10_results[-1][1] >= new_score:
        # no better than the current last entry
        top_10_results.append(score_card)
    else:
        # strictly between the first and last entry: find the first
        # neighbouring pair that brackets the new score
        neighbours = zip(top_10_results, top_10_results[1:])
        for position, (upper, lower) in enumerate(neighbours, start=1):
            if upper[1] >= new_score >= lower[1]:
                top_10_results.insert(position, score_card)
                break
    return top_10_results[:10]




def frequency_count(tokens):
    """Count how often each token occurs.

    Returns a dict mapping token -> number of occurrences, with keys in
    order of first appearance.
    """
    counts = {}
    for item in tokens:
        counts[item] = counts.get(item, 0) + 1
    return counts

def frequency_words(data):
    """Return the distinct tokens found across all documents.

    data maps a document id to its list of tokens. The result lists each
    token once, in order of first appearance when walking the documents in
    the mapping's iteration order.

    The vocabulary is collected in a single pass; concatenating every token
    list into one growing list first would copy the accumulated list for
    each document.
    """
    vocabulary = dict.fromkeys(
        token for token_list in data.values() for token in token_list
    )
    return list(vocabulary)


def inverted_index(preprocessed_data):
    """Build an inverted index from preprocessed documents.

    preprocessed_data maps a document id to its list of tokens. The result
    maps every token to the list of document ids that contain it. Each
    document id appears at most once per token, documents are listed in
    the mapping's iteration order, and tokens are keyed in order of first
    appearance.

    The index is built in one pass over the documents instead of scanning
    every document once per vocabulary word.
    """
    index = {}
    for doc, tokens in preprocessed_data.items():
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # token repeated inside one document registers that document once.
        for word in dict.fromkeys(tokens):
            index.setdefault(word, []).append(doc)
    return index


def vowel_or_consonant(word,i):
    """Return True when ``word[i]`` acts as a vowel, False for a consonant.

    a, e, i, o and u are always vowels. ``y`` counts as a vowel only when
    the letter directly before it is not one of a, e, i, o, u; a ``y`` at
    the start of the word is a consonant.

    i may be negative (counted from the end of the word, as callers in
    this file do); it is normalised first so the "letter before" lookup
    never wraps around to the end of the word and so the ``y`` rule also
    applies for negative positions. A ``y`` at index 1 is checked against
    the letter at index 0.
    """
    vowels = ("a", "e", "i", "o", "u")
    letter = word[i]
    if letter in vowels:
        return True
    if letter != "y":
        return False
    position = i if i >= 0 else len(word) + i
    return position > 0 and word[position - 1] not in vowels

def double_consonants(word):
    """Return True when the word ends in a doubled consonant (``tt``, ``ss``).

    This is the ``*d`` condition of the Porter stemmer: the last two
    letters must be the same letter and that letter must be a consonant.
    Two different trailing consonants (``lk`` in ``walk``) do not qualify;
    treating them as a double consonant would make the stemmer chop a
    letter off such stems.
    """
    if len(word) < 2:
        return False
    return word[-1] == word[-2] and not vowel_or_consonant(word, -1)

def o_form(word):
    """Return True when the word ends consonant-vowel-consonant.

    The final consonant must not be w, x or y. Words shorter than three
    letters never match.
    """
    if len(word) < 3:
        return False
    starts_with_consonant = not vowel_or_consonant(word, -3)
    middle_is_vowel = vowel_or_consonant(word, -2)
    ends_with_consonant = not vowel_or_consonant(word, -1)
    if not (starts_with_consonant and middle_is_vowel and ends_with_consonant):
        return False
    return word[-1] not in ["w", "x", "y"]

def check_vc(word):
    """Return the collapsed vowel/consonant pattern of a word.

    Every letter is classified with ``vowel_or_consonant`` and runs of the
    same class are collapsed into a single character, so the result is an
    alternating string of ``V`` and ``C`` (for example ``CVCV``). An empty
    word gives an empty string.

    A leading consonant is recorded as ``C``; recording it as ``V`` would
    add a spurious ``VC`` pair to every consonant-initial word.
    """
    pattern = []
    for i in range(len(word)):
        kind = "V" if vowel_or_consonant(word, i) else "C"
        # only record a change of class, not every letter
        if not pattern or pattern[-1] != kind:
            pattern.append(kind)
    return "".join(pattern)

def word_vowel_check(word):
    """Return True when at least one letter of the word acts as a vowel."""
    return any(vowel_or_consonant(word, i) for i in range(len(word)))

def check_mvalue(word):
    """Return the number of ``VC`` pairs in the word's vowel/consonant pattern."""
    return check_vc(word).count("VC")

def check_m_0_replace(word,remain,attach):
    """Swap the last occurrence of ``remain`` for ``attach`` when m > 0.

    The part of the word before the last occurrence of ``remain`` is the
    stem; the replacement only happens when ``check_mvalue(stem)`` is
    greater than 0, otherwise the word is returned unchanged.
    """
    stem = word[0:word.rfind(remain)]
    return stem + attach if check_mvalue(stem) > 0 else word

def check_m_1_replace(word,remain,attach):
    """Swap the last occurrence of ``remain`` for ``attach`` when m > 1.

    The part of the word before the last occurrence of ``remain`` is the
    stem; the replacement only happens when ``check_mvalue(stem)`` is
    greater than 1, otherwise the word is returned unchanged.
    """
    stem = word[0:word.rfind(remain)]
    return stem + attach if check_mvalue(stem) > 1 else word


def step1a(word):
    """Porter step 1a: normalise plural endings.

    sses -> ss, ies -> i, ss -> ss (unchanged), s -> (removed).

    Only the suffix is rewritten. ``str.replace`` would rewrite every
    occurrence inside the word (``seasons`` would lose all of its ``s``
    letters), so the ending is cut off by slicing instead.
    """
    if word.endswith('sses'):
        return word[:-2]
    if word.endswith('ies'):
        return word[:-2]
    if word.endswith('ss'):
        return word
    if word.endswith('s'):
        return word[:-1]
    return word

def step1b(word):
    """Porter step 1b: strip ``eed`` / ``ed`` / ``ing`` endings.

    * ``eed`` becomes ``ee`` when the stem has m > 0.
    * ``ed`` and ``ing`` are removed when the stem contains a vowel; the
      shortened stem is then tidied up: ``at``/``bl``/``iz`` get an ``e``
      back, a double consonant other than l, s, z loses one letter, and a
      short (m == 1) consonant-vowel-consonant stem gets an ``e``.
    """
    if word.endswith("eed"):
        stem = word[0:-3]
        if check_mvalue(stem) > 0:
            return stem + "ee"
        return word

    if word.endswith('ed'):
        stem = word[0:-2]
    elif word.endswith("ing"):
        stem = word[0:-3]
    else:
        return word

    # the ending is only dropped when something vowel-like remains
    if not word_vowel_check(stem):
        return word

    if stem.endswith(('at', 'bl', 'iz')):
        return stem + "e"
    if double_consonants(stem) and not stem.endswith(('l', 's', 'z')):
        return stem[:-1]
    if check_mvalue(stem) == 1 and o_form(stem):
        return stem + "e"
    return stem

def step1c(word):
    """Porter step 1c: turn a final ``y`` into ``i`` when the stem has a vowel."""
    if not word.endswith("y"):
        return word
    stem = word[0:-1]
    return stem + "i" if word_vowel_check(stem) else word

def step2(word):
    """Porter step 2: map double suffixes to single ones when m > 0.

    The first suffix in the table that the word ends with is the only one
    considered; the replacement itself is guarded by ``check_m_0_replace``.
    """
    # order matters: longer suffixes come before the shorter ones they contain
    suffix_table = (
        ("ational", "ate"),
        ("tional", "tion"),
        ("enci", "ence"),
        ("anci", "ance"),
        ("izer", "ize"),
        ("abli", "able"),
        ("alli", "al"),
        ("entli", "ent"),
        ("eli", "e"),
        ("ousli", "ous"),
        ("ization", "ize"),
        ("ation", "ate"),
        ("ator", "ate"),
        ("alism", "al"),
        ("iveness", "ive"),
        ("fulness", "ful"),
        ("ousness", "ous"),
        ("aliti", "al"),
        ("iviti", "ive"),
        ("biliti", "ble"),
    )
    for suffix, replacement in suffix_table:
        if word.endswith(suffix):
            return check_m_0_replace(word, suffix, replacement)
    return word

def step3(word):
    """Porter step 3: shorten or drop further suffixes when m > 0.

    The first suffix in the table that the word ends with is the only one
    considered; the replacement itself is guarded by ``check_m_0_replace``.
    """
    suffix_table = (
        ("icate", "ic"),
        ("ative", ""),
        ("alize", "al"),
        ("iciti", "ic"),
        ("ful", ""),
        ("ness", ""),
        ("ical", "ic"),
    )
    for suffix, replacement in suffix_table:
        if word.endswith(suffix):
            return check_m_0_replace(word, suffix, replacement)
    return word

def step4(word):
    """Porter step 4: drop residual suffixes when m > 1.

    The first listed suffix that the word ends with is the only one
    considered. ``ion`` is handled separately because it is only removed
    when the remaining stem also ends in ``s`` or ``t``.
    """
    # order matters: "ement" before "ment" before "ent"
    removable_suffixes = (
        "al", "ance", "ence", "er", "ic", "able", "ible", "ant",
        "ement", "ment", "ent", "ou", "ism", "ate", "iti", "ous",
        "ive", "ize",
    )
    for suffix in removable_suffixes:
        if word.endswith(suffix):
            return check_m_1_replace(word, suffix, "")

    if word.endswith("ion"):
        stem = word[0:-3]
        if check_mvalue(stem) > 1 and (stem.endswith("s") or stem.endswith("t")):
            return stem
    return word

def step5a(word):
    """Porter step 5a: drop a final ``e``.

    The ``e`` goes when the stem has m > 1, or when m == 1 and the stem
    does not end consonant-vowel-consonant (see ``o_form``).
    """
    if not word.endswith("e"):
        return word
    stem = word[0:-1]
    measure = check_mvalue(stem)
    if measure > 1 or (measure == 1 and not o_form(stem)):
        return stem
    return word

def step5b(word):
    """Porter step 5b: reduce a final ``ll`` to ``l`` when m > 1."""
    drop_last = (
        word.endswith("l")
        and double_consonants(word)
        and check_mvalue(word) > 1
    )
    return word[0:-1] if drop_last else word



def stemming(word):
    """Run a word through all stemmer steps, 1a to 5b, in order."""
    pipeline = (step1a, step1b, step1c, step2, step3, step4, step5a, step5b)
    for stage in pipeline:
        word = stage(word)
    return word



def stemmed_tokens(token_filter):
    """Stem every token with the hand-written stemmer (no nltk involved).

    Returns a new list with one stem per input token, in the same order.
    """
    return list(map(stemming, token_filter))


def rem_stop_words(tokens):
    """Drop stop words and very short tokens.

    A token is kept when it is not in the stop-word list and is longer than
    two characters. The comparison is case-sensitive; the stop words are
    all lower case. Returns a new list in the original order.
    """
    stop_words = frozenset((
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
        'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
        'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am',
        'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves',
        'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through',
        'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this',
        'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up',
        'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any',
        'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will',
        'on', 'does', 'yourselves', 'then', 'that', 'because', 'what',
        'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he',
        'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself',
        'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if',
        'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how',
        'further', 'was', 'here', 'than',
    ))
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        if len(candidate) > 2:
            kept.append(candidate)
    return kept

def tokenise(content):
    """Split text into tokens after removing punctuation and digits.

    Every character in ``string.punctuation`` and every digit character is
    deleted (not replaced by a space); the remainder is split on
    whitespace. Letter case is left as it is.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = content.translate(punctuation_table)
    without_digits = ''.join(
        ch for ch in without_punctuation if not ch.isdigit()
    )
    return without_digits.strip().split()


def preprocess_data(data):
    """Preprocess the ``headlines`` column of a data frame.

    Rows are addressed with ``data.loc[i]`` for i in ``range(len(data))``.
    Each headline is tokenised, filtered for stop words, stemmed and
    filtered again. Returns a dict mapping i to that row's list of
    processed tokens.
    """
    term_dictionary = {}
    for row in range(len(data)):
        headline = data.loc[row]["headlines"]
        stems = stemmed_tokens(rem_stop_words(tokenise(headline)))
        term_dictionary[row] = rem_stop_words(stems)
    return term_dictionary

def _ask_task(suggestions_option="2)To get the  5 suggestions for the given query\n"):
    """Print the task menu and return the task number the user typed.

    suggestions_option is the menu line shown for task 2. Input that is
    not a whole number is returned as 0, which the main loop treats like
    any other non-task choice (quit).
    """
    print("Which task do you want to execute\n")
    print("1)To get the top 10 relevant documents for your given query\n")
    print(suggestions_option)
    print("3)Quit")
    try:
        return int(input())
    except ValueError:
        return 0


def _rank_documents(query_vector, dataset, invertedindex, total_documents):
    """Score every headline against the query and return the ten best.

    The score of a document is the sum, over the terms it shares with the
    query, of (tf-idf weight in the query) * (tf-idf weight in the
    document). Returns a list of ``[row_number, score]`` pairs ordered by
    descending score, as maintained by ``score_sort``.
    """
    top_10_results = []
    for row_number in range(len(dataset)):
        doc_vector = make_vector(dataset.loc[row_number]['headlines'])
        # A term missing from the index has no document frequency and
        # therefore cannot be weighted; skip it instead of raising KeyError.
        search_tokens = [
            token for token in query_vector
            if token in doc_vector and token in invertedindex
        ]
        if not search_tokens:
            continue
        score = 0
        for token in search_tokens:
            document_frequency = len(invertedindex[token])
            score += (
                weight_tf_idf(query_vector[token], document_frequency, total_documents)
                * weight_tf_idf(doc_vector[token], document_frequency, total_documents)
            )
        # Rank the document once, with its complete score. Ranking inside
        # the token loop would list the same document several times with
        # partial scores.
        top_10_results = score_sort([row_number, score], top_10_results)
    return top_10_results


if __name__=="__main__":

    news_summary_more_ds = pd.read_csv("./news_summary_more.csv")
# -------------------loading and storing the created dictionary in pickle file---------------------------------------
#     preprocessed_data = preprocess_data(news_summary_more_ds)
#     invertedindex = inverted_index(preprocessed_data)
#     pickle_out = open("dict.pickle","wb")
#     pickle.dump(invertedindex, pickle_out)
#     pickle_out.close()
# ---------------------------------------------------------------------------------------------------------------------

# ----------------------loading the inverted index using pickle that is created by using above steps-------------------
    p = _ask_task()

    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # dict.pickle that was produced locally by the steps above.
    with open("dict.pickle","rb") as pickle_in:
        invertedindex = pickle.load(pickle_in)

    # N used for the idf weights.
    Total_documents = 98401
    # queries answered by task 1 so far; task 2 offers the related ones
    suggestions = []

    # Anything other than task 1 or 2 (including "3) Quit") ends the
    # session before a query is requested.
    while p in (1, 2):
        query = input("-----------Please Enter Your Phrasal Query that you want here--------------\n")
        query_vector = make_vector(query)
        print("The query in vector form is {}\n".format(query_vector))
        if p==1:
            print('.........................TASK 1...........relevant documents................wait....for.....a...few..seconds......\n')
            top_10_results = _rank_documents(
                query_vector, news_summary_more_ds, invertedindex, Total_documents
            )

            if top_10_results:
                for row_number, _score in top_10_results:
                    print('-----------------------------------------------------------------------------')
                    print(news_summary_more_ds.loc[row_number])
                    print('==============================================================================\n')
            else:
                print('No Search Results Found')

            suggestions.append(query)
            p = _ask_task()
        else:
            # earlier queries sharing at least one term with this query
            related_queries = [
                past_query for past_query in suggestions
                if dotProduct(make_vector(past_query), query_vector) > 0
            ]
            top_10_results = _rank_documents(
                query_vector, news_summary_more_ds, invertedindex, Total_documents
            )

            print("----------The suggestions for our query are---------------\n")
            for past_query in related_queries:
                print(past_query)
            if top_10_results:
                for row_number, _score in top_10_results:
                    print(news_summary_more_ds.loc[row_number]['headlines'])
                    print("\n")
            elif not related_queries:
                print('No Search Results Found\n')

            p = _ask_task("2)To get the  suggestions for the given query\n")

    print("..........you are quit from the search session............")
            
            

Which task do you want to execute

1)To get the top 10 relevant documents for your given query

2)To get the  5 suggestions for the given query

3)Quit
1
-----------Please Enter Your Phrasal Query that you want here--------------
upgrad is awesome
The query in vector form is {'upgrad': 1, 'awesom': 1}

.........................TASK 1...........relevant documents................wait....for.....a...few..seconds......

-----------------------------------------------------------------------------
headlines    Astronaut shares 'burrito of awesomeness' view...
text         American astronaut Jack David Fischer, who is ...
Name: 77887, dtype: object

-----------------------------------------------------------------------------
headlines    Priyanka is super awesome: Rumoured boyfriend ...
text         Priyanka Chopra's rumoured boyfriend, American...
Name: 38265, dtype: object

-----------------------------------------------------------------------------
headlines    Virat Kohli is awesome, s