In [94]:
import numpy as np
import nltk
import shelve
import copy
from nltk.corpus import wordnet as wn
from collections import Counter
import math
import operator

In [41]:
!pip install tqdm



In [20]:
# Create the on-disk shelf 'index' if it does not exist yet (flag 'c') and flush it.
with shelve.open('index','c') as index:
    index.sync()

In [21]:
# Re-open the shelf with writeback caching and display how many keys it holds.
# NOTE(review): this handle is never closed in the cells visible here, and the
# indexing cells further down fill plain dicts (index_dictionary / index_bigrams)
# instead of this shelf -- confirm whether it is still needed.
index = shelve.open('index', writeback = True)
len(index)

0

In [134]:
# Seed a small demo shelf; leaving the `with` block flushes and closes it.
with shelve.open('example', 'c') as example:
    example['x'] = [1,2,3]
    example['y'] = [4,5,6]
    example['z'] = [7,8,9]
    example['w'] = [10,11,12]

# writeback=True keeps fetched entries in a cache, so in-place list mutations are
# written back on sync()/close(). The context manager guarantees the handle is
# closed even if a cell is interrupted.
with shelve.open('example', writeback=True) as ex:
    for key in ex:
        ex[key].append(120)
    # One sync after all mutations instead of one per key.
    ex.sync()
    for key in ex:
        print(key, ex[key])

x [1, 2, 3, 120]
z [7, 8, 9, 120]
y [4, 5, 6, 120]
w [10, 11, 12, 120]


In [96]:
from nltk.corpus import gutenberg
# Number of files in NLTK's gutenberg corpus (the document collection indexed below).
len(gutenberg.fileids())


18

In [97]:
#unigrams
# Tokens dropped from documents and queries: English stop words, single
# punctuation characters (including the space), and a few leftover fragments.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted |= set('!"#$%&\'()*+,-./:;<=>? @[\\]`^_`{|}~£')
unwanted |= {"--", "'s", "mr", "mrs", "''", "``", "", "'t"}

def get_rid_of_underscores_and_dots_and_upper(word):
    """Normalise one token: strip a surrounding marker and lower-case it.

    Exactly one of the following is applied, in this order:
      * ``_word_``  -> both underscores removed
      * ``word.``   -> trailing dot removed
      * ```word``   -> leading back-tick removed
      * otherwise the token is kept whole

    The result is always lower-cased. An empty token is returned unchanged
    instead of raising ``IndexError``.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The normalised, lower-cased token (possibly empty).
    """
    if not word:
        # Nothing to inspect; indexing word[0] below would raise.
        return word
    if word[0] == "_" and word[-1] == "_":
        return word[1:-1].lower()
    if word[-1] == ".":
        return word[:-1].lower()
    if word[0] == "`":
        # Lower-case here too, so every branch yields a case-folded token.
        return word[1:].lower()
    return word.lower()

def tokenise_and_clean(corpus):
    """Return ``{document_id: [clean tokens]}`` for every file in ``corpus``.

    Each document's raw text is split with ``nltk.word_tokenize``; every token is
    normalised with ``get_rid_of_underscores_and_dots_and_upper`` and then dropped
    if it appears in the module-level ``unwanted`` set.
    """
    cleaned_by_doc = {}
    for doc_id in corpus.fileids():
        normalised = (
            get_rid_of_underscores_and_dots_and_upper(raw_token)
            for raw_token in nltk.word_tokenize(corpus.raw(doc_id))
        )
        cleaned_by_doc[doc_id] = [tok for tok in normalised if tok not in unwanted]
    return cleaned_by_doc

# Clean unigram token lists for every gutenberg file, keyed by file id.
docs_to_tokens = tokenise_and_clean(gutenberg)


In [98]:
#bigrams

def bigramise_and_clean(corpus):
    """Return ``{document_id: [(word, next_word), ...]}`` for every file in ``corpus``.

    Words come from ``corpus.words``; each is normalised with
    ``get_rid_of_underscores_and_dots_and_upper`` and filtered against the
    module-level ``unwanted`` set *before* pairing, so a bigram joins two
    surviving words even when unwanted tokens stood between them in the text.
    """
    bigrams_by_doc = {}
    for doc_id in corpus.fileids():
        normalised = (
            get_rid_of_underscores_and_dots_and_upper(raw_word)
            for raw_word in corpus.words(doc_id)
        )
        kept_words = [w for w in normalised if w not in unwanted]
        bigrams_by_doc[doc_id] = list(nltk.bigrams(kept_words))
    return bigrams_by_doc

# Clean bigram lists for every gutenberg file, keyed by file id.
docs_to_bigrams = bigramise_and_clean(gutenberg)

In [99]:
# Peek at the first ten bigrams of one document as a sanity check.
docs_to_bigrams["austen-emma.txt"][:10]

[('emma', 'jane'),
 ('jane', 'austen'),
 ('austen', '1816'),
 ('1816', 'volume'),
 ('volume', 'chapter'),
 ('chapter', 'emma'),
 ('emma', 'woodhouse'),
 ('woodhouse', 'handsome'),
 ('handsome', 'clever'),
 ('clever', 'rich')]

In [100]:
from tqdm import tqdm
# In-memory inverted indexes filled by index_files below:
# term -> list of (document_id, term_frequency) pairs.
index_dictionary = {}  # keyed by unigram (str)
index_bigrams = {}  # keyed by bigram (tuple of two str)
def index_files(index_filename, documents_to_tokens):
    """Add per-document relative term frequencies to an inverted index, in place.

    Parameters
    ----------
    index_filename : mutable mapping
        Despite its name this is the index object itself (a dict in this
        notebook), mapping each term to a list of
        ``(document_id, term_frequency)`` pairs. It is updated in place.
    documents_to_tokens : dict
        ``{document_id: [term, ...]}``; a term may be any hashable, e.g. a
        string or a bigram tuple.

    Returns
    -------
    None
    """
    for document_id in tqdm(documents_to_tokens.keys()):
        document_tokens = documents_to_tokens[document_id]
        document_len = len(document_tokens)
        # Visit each distinct term once: repeated occurrences would only
        # recompute the same (document_id, frequency) pair and rescan the
        # posting list to find it already present.
        for token, occurrences in Counter(document_tokens).items():
            posting = (document_id, occurrences / document_len)
            postings = index_filename.setdefault(token, [])
            # Membership check keeps re-indexing the same document idempotent.
            if posting not in postings:
                postings.append(posting)

# Fill the unigram index, spot-check the postings of one term, then fill the bigram index.
index_files(index_dictionary, docs_to_tokens)
print(index_dictionary['pretty'])
index_files(index_bigrams, docs_to_bigrams)


100%|██████████| 18/18 [00:01<00:00,  9.01it/s]
  6%|▌         | 1/18 [00:00<00:02,  7.53it/s]

[('austen-emma.txt', 0.0009302456694245162), ('austen-persuasion.txt', 0.0007447402718301992), ('austen-sense.txt', 0.0006775705332103669), ('blake-poems.txt', 0.002103049421661409), ('bryant-stories.txt', 0.0013802622498274672), ('burgess-busterbrown.txt', 0.0008984725965858042), ('carroll-alice.txt', 7.439369141496801e-05), ('chesterton-ball.txt', 0.00025145213608589607), ('chesterton-brown.txt', 0.0004815318377520961), ('chesterton-thursday.txt', 0.00017682215227923754), ('edgeworth-parents.txt', 0.0006108890981749688), ('melville-moby_dick.txt', 0.0002664021018207206), ('shakespeare-hamlet.txt', 0.00012543116964565694), ('shakespeare-macbeth.txt', 0.0002947244326554671), ('whitman-leaves.txt', 1.5311121999020088e-05)]


100%|██████████| 18/18 [00:04<00:00,  5.57it/s]


In [101]:
n_documents = len(gutenberg.fileids())
# rank_tf_idf takes index as {'keyword': [(document_id, term_frequency), ...]} and n_document as the number of documents in the corpus
def rank_tf_idf(index, n_document):
    """Weight every posting by inverse document frequency and sort best-first.

    For each term the weight is ``tf * (1 + log2(n_document / n_docs_with_term))``.

    Parameters
    ----------
    index : mapping
        ``{term: [(document_id, term_frequency), ...]}``.
    n_document : int
        Total number of documents in the corpus.

    Returns
    -------
    dict
        ``{term: [[(document_id, weight), ...]]}``. Note the extra list level:
        each value is a one-element list wrapping the sorted postings.
    """
    ranked_index = {}
    for term in index:
        postings = index[term]
        n_docs_with_term = len(postings)
        scored = [
            (posting[0], (1 + math.log(n_document / n_docs_with_term, 2)) * posting[1])
            for posting in postings
        ]
        # Highest weight first; ties keep their original relative order.
        scored.sort(key=lambda item: item[1], reverse=True)
        ranked_index[term] = [scored]
    return ranked_index

# TF-IDF-weighted, sorted postings for the unigram and bigram indexes.
# NOTE(review): search() below reads index_dictionary / index_bigrams directly;
# in the cells visible here these ranked tables are only inspected, not queried.
ranked = rank_tf_idf(index_dictionary, len(gutenberg.fileids()))
ranked_bigrams = rank_tf_idf(index_bigrams, len(gutenberg.fileids()))


In [104]:
# Inspect the ranked postings of one bigram (note the nested list returned by rank_tf_idf).
ranked_bigrams[('pretty', 'girl')]

[[('austen-sense.txt', 0.00032264985156341965),
  ('austen-persuasion.txt', 0.0002779184325330371),
  ('austen-emma.txt', 9.528646042902362e-05)]]

In [105]:
def cosine_similarity(query_tfidfs, tfidf_per_word):
    """Cosine similarity between a query vector and one document's vector.

    Only the terms present in ``query_tfidfs`` are considered: both norms and
    the dot product are accumulated over the query's keys, so document terms
    outside the query do not contribute to the document norm.

    Parameters
    ----------
    query_tfidfs : dict
        ``{term: weight}`` for the query.
    tfidf_per_word : dict
        ``{term: weight}`` for a single document. A query term missing from
        this mapping is treated as weight 0 rather than raising ``KeyError``.

    Returns
    -------
    float or int
        ``dot / (|query| * |doc|)``, or ``0`` when either norm is zero.
    """
    query_sum = 0
    doc_sum = 0
    dot_product = 0
    for query_word, query_weight in query_tfidfs.items():
        doc_weight = tfidf_per_word.get(query_word, 0)
        query_sum += query_weight**2
        doc_sum += doc_weight**2
        dot_product += query_weight * doc_weight
    norm_product = math.sqrt(query_sum) * math.sqrt(doc_sum)
    if norm_product == 0:
        # An empty query or a document sharing no query term: define similarity as 0.
        return 0
    return dot_product/norm_product

In [106]:
def get_tfidf_for_document_id(document_id, subset_from_index):
    """Pick out one document's weight for every term of an index subset.

    Parameters
    ----------
    document_id : hashable
        Identifier compared against the first element of each posting.
    subset_from_index : dict
        ``{term: [(document_id, weight), ...]}``.

    Returns
    -------
    dict
        ``{term: weight}``; the weight is 0 for terms with no posting for
        ``document_id``. If several postings match, the last one wins.
    """
    weights_by_term = {}
    for term, postings in subset_from_index.items():
        matching = [posting[1] for posting in postings if posting[0] == document_id]
        # Terms absent from this document get an explicit zero weight.
        weights_by_term[term] = matching[-1] if matching else 0
    return weights_by_term

In [198]:
def _print_snippet(raw_text, lowered_text, needle, window):
    """Print the first occurrence of ``needle`` with ``window`` characters of context.

    The match is located in ``lowered_text`` and sliced out of ``raw_text`` so the
    original casing is shown. Nothing is printed when ``needle`` does not occur.
    """
    occurence_index = lowered_text.find(needle)
    if occurence_index == -1:
        return
    if occurence_index < window:
        # Too close to the start for a full left-hand window: show from the beginning.
        print(raw_text[:occurence_index+window]+"...\n")
    else:
        print("..."+raw_text[occurence_index-window:occurence_index+window]+"...\n")


def _print_document_examples(document_id, needles, corpus, window):
    """Print the upper-cased document id, then one snippet per needle found in it."""
    print(document_id.upper())
    if not needles:
        return
    # Read and lower-case the document once instead of once per needle.
    raw_text = corpus.raw(document_id)
    lowered_text = raw_text.lower()
    for needle in needles:
        _print_snippet(raw_text, lowered_text, needle, window)


def find_example(results, query_unigrams, query_bigrams, corpus,
                 substring_window=20, min_bigram_results=4, max_results=8):
    """Print matching documents with a short text snippet for each query n-gram.

    Documents with a positive bigram score are listed first. If fewer than
    ``min_bigram_results`` were shown, documents with a positive unigram score
    are added (skipping those already shown) until ``max_results`` is reached.

    Parameters
    ----------
    results : tuple
        ``(unigram_ranking, bigram_ranking)``, each a list of
        ``(document_id, score)`` pairs in display order.
    query_unigrams : list of str
        Lower-cased query words searched for in the unigram fallback.
    query_bigrams : list of tuple
        Query bigrams; each is searched for as its words joined by one space.
    corpus : object
        Must provide ``raw(document_id)`` returning the document text.
    substring_window : int, optional
        Characters of context shown on each side of a match (default 20).
    min_bigram_results : int, optional
        Below this many bigram hits the unigram fallback is used (default 4,
        an arbitrary value that seemed most suitable).
    max_results : int, optional
        Cap on the number of documents once the fallback is active (default 8).

    Returns
    -------
    None
        Output is printed.
    """
    top_uni = results[0]
    top_bi = results[1]
    result_counter = 0  # number of documents already shown
    printed_docs = set()  # documents already shown, to avoid repeating them
    bigram_phrases = [" ".join(bigram) for bigram in query_bigrams]

    for document_id, score in top_bi:
        if score > 0:
            _print_document_examples(document_id, bigram_phrases, corpus, substring_window)
            result_counter += 1
            printed_docs.add(document_id)

    # Not enough bigram hits: fall back to documents matching parts of the query.
    if result_counter < min_bigram_results:
        for document_id, score in top_uni:
            if score > 0 and document_id not in printed_docs and result_counter < max_results:
                _print_document_examples(document_id, query_unigrams, corpus, substring_window)
                result_counter += 1
                printed_docs.add(document_id)
            

In [199]:
corpus = gutenberg


def _weigh_query_terms(query_terms, term_index, n_docs):
    """Compute query-side TF-IDF weights for the terms known to ``term_index``.

    Returns ``(weights, postings)`` where ``weights`` maps each known term to
    ``(count / len(query_terms)) * (1 + log2(n_docs / n_docs_with_term))`` and
    ``postings`` maps it to its posting list from ``term_index``.
    """
    weights = {}
    postings = {}
    total_terms = len(query_terms)
    for term, count in Counter(query_terms).items():
        if term in term_index:
            term_idf = 1+math.log(n_docs/len(term_index[term]), 2)
            weights[term] = (count/total_terms) * term_idf
            postings[term] = term_index[term]
    return weights, postings


def search(query, corpus):
    """Rank the documents of ``corpus`` against a free-text query and print examples.

    The query is tokenised and cleaned like the indexed documents, then scored
    twice: once on unigrams (``index_dictionary``) and once on bigrams
    (``index_bigrams``), using cosine similarity between the query's TF-IDF
    weights and each document's stored term frequencies. Snippets are printed
    through ``find_example``.

    Parameters
    ----------
    query : str
        The user's search phrase.
    corpus : object
        Must provide ``fileids()`` and ``raw(document_id)``.

    Returns
    -------
    tuple or None
        ``(unigram_ranking, bigram_ranking)``, each a list of
        ``(document_id, similarity)`` sorted best-first, or ``None`` when no
        query term occurs in either index. Returning the rankings replaces the
        earlier debug print, so callers can actually use the result.
    """
    #dividing query into unigrams and bigrams
    q_tokens = nltk.word_tokenize(query)
    cleaned = [get_rid_of_underscores_and_dots_and_upper(t) for t in q_tokens]
    list_of_tokens = [t for t in cleaned if t not in unwanted]
    query_bigrams = list(nltk.bigrams(list_of_tokens))

    #calculating tf-idfs for query terms, unigrams then bigrams
    frequency_of_query_terms, ranked_docs_per_term = _weigh_query_terms(
        list_of_tokens, index_dictionary, n_documents)
    bg_frequency_of_query_terms, bg_ranked_docs_per_term = _weigh_query_terms(
        query_bigrams, index_bigrams, n_documents)

    #if no query words exist in our index
    if not ranked_docs_per_term and not bg_ranked_docs_per_term:
        print("Please try another key phrase, we can't seem to find this one, sorry...\n")
        return None

    #comparing similarities of query and docs vectors
    ranked_docs_per_query = {}
    bg_ranked_docs_per_query = {}
    for document_id in corpus.fileids():
        tfidfs_per_word = get_tfidf_for_document_id(document_id, ranked_docs_per_term)
        bg_tfidfs_per_word = get_tfidf_for_document_id(document_id, bg_ranked_docs_per_term)
        cs = cosine_similarity(frequency_of_query_terms, tfidfs_per_word)
        bg_cs = cosine_similarity(bg_frequency_of_query_terms, bg_tfidfs_per_word)
        # A document is kept in both rankings as soon as either score is non-zero.
        if cs != 0 or bg_cs != 0:
            ranked_docs_per_query[document_id] = cs
            bg_ranked_docs_per_query[document_id] = bg_cs

    sorted_ranked_docs = sorted(ranked_docs_per_query.items(), key=operator.itemgetter(1), reverse=True)
    bg_sorted_ranked_docs = sorted(bg_ranked_docs_per_query.items(), key=operator.itemgetter(1), reverse=True)
    results = (sorted_ranked_docs, bg_sorted_ranked_docs)
    find_example(results, list_of_tokens, query_bigrams, corpus)
    return results
    
# Ask for a query interactively, run the search and display whatever search() returns.
# NOTE(review): input() makes this cell non-reproducible under Restart & Run All.
query = input("What would you like to search for?\n")        
results = search(query, gutenberg)
results

What would you like to search for?
Moby Dick is a great whale with fangs
([('melville-moby_dick.txt', 0.3941763281569343), ('blake-poems.txt', 0.29036753123472075), ('bryant-stories.txt', 0.1583008477092572), ('shakespeare-hamlet.txt', 0.14707595038229476), ('austen-persuasion.txt', 0.13549352295722214), ('chesterton-ball.txt', 0.12412064344595966), ('whitman-leaves.txt', 0.11811958663761601), ('bible-kjv.txt', 0.11719189749544677), ('shakespeare-macbeth.txt', 0.11625530846729783), ('austen-emma.txt', 0.11625530846729781), ('carroll-alice.txt', 0.11625530846729781), ('chesterton-brown.txt', 0.11625530846729781), ('edgeworth-parents.txt', 0.11625530846729781), ('shakespeare-caesar.txt', 0.11625530846729781), ('austen-sense.txt', 0.1162553084672978), ('burgess-busterbrown.txt', 0.1162553084672978), ('chesterton-thursday.txt', 0.1162553084672978), ('milton-paradise.txt', 0.1162553084672978)], [('melville-moby_dick.txt', 0.7784130394753296), ('austen-emma.txt', 0), ('austen-persuasion.txt'

In [144]:
# Sanity check: find the first 'Moby Dick' in the raw text and show everything
# from the start of the file up to 30 characters past the start of the match.
ind = gutenberg.raw('melville-moby_dick.txt').index('Moby Dick')
gutenberg.raw('melville-moby_dick.txt')[:ind+30]

'[Moby Dick by Herman Melville 1'

In [93]:
# NOTE(review): dead code. An earlier search that intersects the document sets of
# all query words, kept as a bare string literal so none of it executes.
# The final call inside the string targets search(...), not search_old(...).
# Consider deleting this cell.
'''def search_old(keywords):
    k_tokens = nltk.word_tokenize(keywords)
    clean_k_tokens = [get_rid_of_underscores_and_dots_and_upper(t) for t in k_tokens]
    clean_k_tokens = filter(lambda x: x not in unwanted, clean_k_tokens)
    
    docs_per_word = {}
    only_docs_per_word = []
    for word in list(clean_k_tokens):
        if word in index_dictionary:
            docs_per_word[word] = index_dictionary[word] #get document's name and tf-idf
            this_words_docs = map(operator.itemgetter(0), index_dictionary[word]) #get only document's name
            only_docs_per_word.append(set(this_words_docs)) 
    common_docs = set.intersection(*only_docs_per_word)
    
    all_docs = [item for sublist in docs_per_word.values() for item in sublist]
    final_doclist_for_query = []
    for item in all_docs:
        if item[0] in common_docs:
            final_doclist_for_query.append(item)
    return final_doclist_for_query

search("crazy baby")'''

'def search_old(keywords):\n    k_tokens = nltk.word_tokenize(keywords)\n    clean_k_tokens = [get_rid_of_underscores_and_dots_and_upper(t) for t in k_tokens]\n    clean_k_tokens = filter(lambda x: x not in unwanted, clean_k_tokens)\n    \n    docs_per_word = {}\n    only_docs_per_word = []\n    for word in list(clean_k_tokens):\n        if word in index_dictionary:\n            docs_per_word[word] = index_dictionary[word] #get document\'s name and tf-idf\n            this_words_docs = map(operator.itemgetter(0), index_dictionary[word]) #get only document\'s name\n            only_docs_per_word.append(set(this_words_docs)) \n    common_docs = set.intersection(*only_docs_per_word)\n    \n    all_docs = [item for sublist in docs_per_word.values() for item in sublist]\n    final_doclist_for_query = []\n    for item in all_docs:\n        if item[0] in common_docs:\n            final_doclist_for_query.append(item)\n    return final_doclist_for_query\n\nsearch("crazy baby")'