In [8]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import copy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

[nltk_data] Downloading package stopwords to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
data_dir = 'data/Humor,Hist,Media,Food'
file_names = os.listdir(data_dir) #reading the data directory to list all the files
file_paths = [(data_dir + '/' + fname) for fname in file_names] #forming file paths
docID_to_doc_mapping = {} #forming docID to doc name mapping
for i in range(len(file_names)):
    docID_to_doc_mapping[i] = file_names[i]

In [10]:
def remove_punct(tok):
    '''
        Removing punctations from tokens
    '''
    punctuations = string.punctuation
    tok = ''.join(x for x in tok if x not in punctuations)
    return tok
def remove_space(tok):
    '''
        Removing blank space toks
    '''
    tok = ''.join(x for x in tok if x != ' ')
    return tok

def preprocess_file(file_text):
    '''
        This function preprocesses the file text.
        Input: file_text in string form represting the text of a file
        Returns: cleaned_toks, word tokens present in the file after preprocessing
    '''

    #converting the text to lowercase
    ftext = file_text.lower()

    #performing word tokenization
    file_toks = word_tokenize(ftext)

    #removing the stopwords from tokens
    stop_words = list(set(stopwords.words('english')))
    file_toks = [tok for tok in file_toks if tok not in stop_words]

    #removing punctuation marks from tokens
    toks_no_punct = []
    for tok in file_toks:
        ctok = remove_punct(tok)
        if(ctok != ""):
            toks_no_punct.append(ctok)
    
    #Removing blank space tokens
    cleaned_toks = []
    for tok in toks_no_punct:
        ctok = remove_space(tok)
        if(ctok != ""):
            cleaned_toks.append(ctok)

    return cleaned_toks

def cleanQuery(query_text):
    '''
        Preprocessing the query text
        Input: query_text, string of the phrase query text
        Returns: cleaned_toks, an array containg the preprocessed query tokens
    '''

    #We perform the same preprocessing steps on the query as we did for the file text

    #converting the text to lowercase
    qtext = query_text.lower()
    
    #performing word tokenization
    query_toks = word_tokenize(qtext)
    
    #removing the stopwords from tokens
    stop_words = list(set(stopwords.words('english')))
    query_toks = [tok for tok in query_toks if tok not in stop_words]
    
    #removing punctuation marks from tokens
    toks_no_punct = []
    for tok in query_toks:
        ctok = remove_punct(tok)
        if(ctok != ""):
            toks_no_punct.append(ctok)
    
    #Removing blank space tokens
    cleaned_toks = []
    for tok in toks_no_punct:
        ctok = remove_space(tok)
        if(ctok != ""):
            cleaned_toks.append(ctok)
    
    return cleaned_toks

In [11]:
def read_file(fpaths):
    '''
        Reads the files and preprocess every file's text to form word tokens for every file.
        Returns a 2-D list containing word tokens for every file
    '''
    file_tokens = []
    for fpath in fpaths:
        f = open(fpath, 'r', encoding='utf-8', errors='ignore') #open the file
        ftxt_unprocessed = f.read() #read the text of the file
        ftoks = preprocess_file(ftxt_unprocessed) #preprocessing the text to form word tokens
        file_tokens.append(ftoks)
    return file_tokens


In [12]:
def getDocsFromID(docID_to_doc, doc_IDs):
    '''
        Given a list of document IDs, it outputs the document names corresponding to thos IDs.
        Input: docID_to_docs (mapping between docID -> doc_name), docIDs - list of input document IDs
        Returns: doc_names - list of doc_names corresponding to document IDs in doc_IDs
    '''
    doc_names = []
    for doc_ID in doc_IDs:
        doc_names.append(docID_to_doc[doc_ID])
    return doc_names

In [13]:
document_toks = read_file(file_paths)

In [14]:
vocabulary_set = set()
for doc_tok in document_toks:
    for tok in doc_tok:
        vocabulary_set.add(tok)
vocabulary_list = list(vocabulary_set)
id_to_term = {}
term_to_id = {}
for i in range(len(vocabulary_list)):
    id_to_term[i] = vocabulary_list[i]
    term_to_id[vocabulary_list[i]] = i

In [15]:
def compute_jaccard_coeff(query_toks, doc_toks):
    query_tok_set = set(query_toks)
    doc_toks_set = set(doc_toks)
    num_intersection = len(list(query_tok_set & doc_toks_set))
    num_union = len(list(query_tok_set | doc_toks_set))
    jaccard = num_intersection / num_union
    return jaccard

def perform_jaccard_scoring(query_toks, document_toks):
    jaccard_coeff_values = {}
    for i in range(len(document_toks)):
        jaccard_coeff_i = compute_jaccard_coeff(query_toks, document_toks[i])
        jaccard_coeff_values[i] = jaccard_coeff_i
    return jaccard_coeff_values

def get_relevant_by_jaccard(jaccard_coeff_values):
    ranked_order_by_jaccard = dict(sorted(jaccard_coeff_values.items(), key=lambda item: item[1], reverse=True))
    top_5_docID = list(ranked_order_by_jaccard.keys())[:5]
    top_5_jaccard = list(ranked_order_by_jaccard.values())[:5]
    return top_5_docID, top_5_jaccard

def run_jaccard(query, document_toks):
    query_toks = cleanQuery(query)
    if(len(query_toks) == 0):
        print("no. of query tokens after preprocessing is 0. Jaccard coefficient with all documents is equal to 0")
        for i in range(5):
            print(f"{i + 1} : {docID_to_doc_mapping[i]} (0)") 
    jaccard_scores = perform_jaccard_scoring(query_toks, document_toks)
    # print(jaccard_scores)
    top_5_doc_ids, top_5_jaccard_score = get_relevant_by_jaccard(jaccard_scores)
    top_5_doc_names = getDocsFromID(docID_to_doc_mapping, top_5_doc_ids)
    print(f"Query Text = {query}")
    print(f"Query tokens after preprocessing = {query_toks}")
    print(f"Top 5 relevant documents based on the value of the Jaccard coefficient : ")
    for i in range(len(top_5_doc_names)):
        print(f"{i + 1} : {top_5_doc_names[i]} ({top_5_jaccard_score[i]})") 

In [10]:
# run_jaccard("adjpasdps dalsdnal alsndlandl", document_toks)

In [23]:
# a = np.array([1, 2, 1, 2, 3, 4, 4, 5 ,1, 2])
# print(np.unique(a, return_counts=True))

(array([1, 2, 3, 4, 5]), array([3, 3, 1, 2, 1], dtype=int64))


In [42]:
def compute_raw_term_frequency(document_toks):

    raw_term_freq = {}
    for i in range(len(document_toks)):
        raw_term_freq[i] = {}
        unique_toks, tok_freq = np.unique(document_toks[i], return_counts=True)
        for j in range(len(unique_toks)):
            raw_term_freq[i][unique_toks[j]] = tok_freq[j]
    return raw_term_freq
def generate_term_postings(document_toks):

    term_posting_lists = {}
    for i in range(len(document_toks)):
        for tok in document_toks[i]:
            if(tok not in term_posting_lists.keys()):
                term_posting_lists[tok] = [i]
            else:
                if(i not in term_posting_lists[tok]):
                    term_posting_lists[tok].append(i)
    return term_posting_lists

def compute_document_frequency(term_posting_lists):

    term_df = {}
    for term in term_posting_lists.keys():
        term_df[term] = len(term_posting_lists[term])
    return term_df

def compute_IDF(term_df, num_total_docs):

    term_idf = {}
    for term in term_df.keys():
        idf_value = np.log10(num_total_docs / (term_df[term] + 1))
        term_idf[term] = idf_value
    return term_idf

def compute_tf_weight(scheme, term, doc_tfs):

    if(scheme == "binary"):
        if(term in doc_tfs.keys()):
            return 1
        else:
            return 0
    elif(scheme == "raw_count"):
        if(term in doc_tfs.keys()):
            return doc_tfs[term]
        else:
            return 0
    elif(scheme == "term_frequency"):
        if(term in doc_tfs.keys()):
            total_terms = sum(doc_tfs.values())
            return doc_tfs[term] / total_terms
        else:
            return 0
    elif(scheme == "log_normalization"):
        if(term in doc_tfs.keys()):
            return np.log10(1 + doc_tfs[term])
        else:
            return 0
    elif(scheme == "double_normalization"):
        if(term in doc_tfs.keys()):
            t1 = 0.5
            t2 = (0.5)*(doc_tfs[term] / max(doc_tfs.values()))
            return t1 + t2
        else:
            return 0.5

def generate_tf_idf_matrices(document_toks, vocabulary_list):
    num_docs = len(list(document_toks))
    print(f"Num docs = {num_docs}")
    raw_tfs = compute_raw_term_frequency(document_toks)
    term_wise_postings = generate_term_postings(document_toks)
    term_document_freq = compute_document_frequency(term_wise_postings)
    term_idfs = compute_IDF(term_document_freq, num_docs)
    num_words = len(vocabulary_list)
    tf_idf_matrix_by_scheme = {}
    schemes_list = ['binary', 'raw_count', 'term_frequency', 'log_normalization', 'double_normalization']
    for scheme in schemes_list:
        tf_idf_matrix_by_scheme[scheme] = np.zeros((num_docs, num_words))
    for scheme in schemes_list:
        print(f"Generating for scheme : {scheme}")
        for i in tqdm(range(num_docs)):
            for j in range(num_words):
                tf_weight = compute_tf_weight(scheme, id_to_term[j], raw_tfs[i])
                idf = term_idfs[id_to_term[j]]
                tf_idf_matrix_by_scheme[scheme][i][j] = tf_weight * idf
    return raw_tfs, term_idfs, tf_idf_matrix_by_scheme
# def generate_tf_idf_matrices(document_toks, vocabulary_list):

#     num_docs = len(list(document_toks))
#     raw_term_freqs = compute_raw_term_frequency(document_toks)
#     term_wise_postings = generate_term_postings(document_toks)
#     term_document_freq = compute_document_frequency(term_wise_postings)
#     term_idfs = compute_IDF(term_document_freq, num_docs)
#     num_words = len(vocabulary_list)
#     tf_idf_by_scheme = {}
#     schemes_list = ['binary', 'raw_count', 'term_frequency', 'log_normalization', 'double_normalization']
#     for scheme in schemes_list:
#         print(f"Generating for scheme : {scheme}")
#         tf_idf_mat = pd.DataFrame(np.zeros((num_docs, num_words)), columns=vocabulary_list)
#         for i in tqdm(range(num_docs)):
#             # print(f"for doc {i}")
#             for term in vocabulary_list:
#                 tf_weight = compute_tf_weight(scheme, term, raw_term_freqs[i])
#                 idf = term_idfs[term]
#                 tf_idf_mat.iloc[i][term] = tf_weight * idf
#         tf_idf_by_scheme[scheme] = tf_idf_mat
#     return tf_idf_by_scheme

def get_query_vector(query_toks, scheme, term_idfs, vocab_len):
    num_query_toks = len(query_toks)
    query_vector = [0] * vocab_len
    query_tfs = {}
    for i in range(num_query_toks):
        query_tfs[query_toks[i]] = 0
    for i in range(num_query_toks):
        # term_id = term_to_id[query_toks[i]]
        # query_vector[term_id] += 1
        query_tfs[query_toks[i]] += 1
    for i in range(len(query_vector)):
        term_tf_weight = compute_tf_weight(scheme, id_to_term[i], query_tfs)
        # print(id_to_term[i])
        if(id_to_term[i] not in term_idfs.keys()):
            idf_val = 0
        else:
            idf_val = term_idfs[id_to_term[i]]
        # print(f"idf_val = {idf_val}")
        query_vector[i] = term_tf_weight * idf_val
    a, b = np.unique(query_vector, return_counts=True)
    # print(f"Query vector = {a} | {b}")
    return query_vector
def process_tf_idf_query(query, tf_idf_matrix_dict, idf_values, vocab_len):
    query_toks = cleanQuery(query)
    print(f"Query : {query}")
    print(f"Query tokens = {query_toks}\n")
    schemes_list = ['binary', 'raw_count', 'term_frequency', 'log_normalization', 'double_normalization']
    # schemes_list = ['binary']
    for scheme in schemes_list:
        print(f"\n---------------- Scheme : {scheme} -------------------\n") 
        query_vector = np.array(get_query_vector(query_toks, scheme, idf_values, vocab_len)).reshape((vocab_len, 1)) # v x 1
        tf_idf_matrix = np.array(tf_idf_matrix_dict[scheme]) #d x v
        document_scores = np.dot(tf_idf_matrix, query_vector)
        document_id_to_score = {}
        for i in range(document_scores.shape[0]):
            document_id_to_score[i] = document_scores[i][0]
        document_id_to_score = dict(sorted(document_id_to_score.items(), key=lambda item: item[1], reverse=True))
        top_5_doc_ids = list(document_id_to_score.keys())[:5]
        top_5_doc_names = getDocsFromID(docID_to_doc_mapping, top_5_doc_ids)
        for i in range(5):
            print(f"{i} -> {top_5_doc_names[i]} | Score = {document_id_to_score[top_5_doc_ids[i]]}")
        print("-----------------------------------------------------------")

In [20]:
raw_tf_docs, term_idfs, tf_idf_matrix_dict  = generate_tf_idf_matrices(document_toks, vocabulary_list)

Num docs = 1133
Generating for scheme : binary


100%|██████████| 1133/1133 [01:34<00:00, 12.01it/s]


Generating for scheme : raw_count


100%|██████████| 1133/1133 [01:27<00:00, 12.92it/s]


Generating for scheme : term_frequency


100%|██████████| 1133/1133 [01:51<00:00, 10.18it/s]


Generating for scheme : log_normalization


100%|██████████| 1133/1133 [01:27<00:00, 12.92it/s]


Generating for scheme : double_normalization


100%|██████████| 1133/1133 [01:34<00:00, 12.04it/s]


In [21]:
pickle.dump(tf_idf_matrix_dict, open('./tf_idf_matrices_q1.pkl', 'wb'))

In [22]:
file_to_read = open("./tf_idf_matrices_q1.pkl", "rb")
tf_idf_matrix_dict = pickle.load(file_to_read)

In [29]:
np.array(tf_idf_matrix_dict['binary']).shape

(1133, 82779)

In [43]:
input_query = input()
process_tf_idf_query(input_query, tf_idf_matrix_dict, term_idfs, len(vocabulary_list))

Query : good
Query tokens = ['good']


---------------- Scheme : binary -------------------

0 -> 1st_aid.txt | Score = 0.10409730823826448
1 -> abbott.txt | Score = 0.10409730823826448
2 -> acronyms.txt | Score = 0.10409730823826448
3 -> ads.txt | Score = 0.10409730823826448
4 -> adt_miam.txt | Score = 0.10409730823826448
-----------------------------------------------------------

---------------- Scheme : raw_count -------------------

0 -> manners.txt | Score = 8.848271200252482
1 -> practica.txt | Score = 5.621254644866283
2 -> oldtime.sng | Score = 4.684378870721901
3 -> mlverb.hum | Score = 4.059795021292315
4 -> vegan.rcp | Score = 3.8516004048157857
-----------------------------------------------------------

---------------- Scheme : term_frequency -------------------

0 -> oldtime.sng | Score = 0.007173627673387291
1 -> f_tang.txt | Score = 0.004053346515472245
2 -> popmach | Score = 0.0033579776851053057
3 -> bless.bc | Score = 0.0023658479145060108
4 -> beer.hum | Score = 