In [184]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import copy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

[nltk_data] Downloading package stopwords to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [150]:
data_dir = 'data/Humor,Hist,Media,Food'
file_names = os.listdir(data_dir) #reading the data directory to list all the files
file_paths = [(data_dir + '/' + fname) for fname in file_names] #forming file paths
docID_to_doc_mapping = {} #forming docID to doc name mapping
for i in range(len(file_names)):
    docID_to_doc_mapping[i] = file_names[i]

In [151]:
def remove_punct(tok):
    '''
        Removing punctations from tokens
    '''
    punctuations = string.punctuation
    tok = ''.join(x for x in tok if x not in punctuations)
    return tok
def remove_space(tok):
    '''
        Removing blank space toks
    '''
    tok = ''.join(x for x in tok if x != ' ')
    return tok

def preprocess_file(file_text):
    '''
        This function preprocesses the file text.
        Input: file_text in string form represting the text of a file
        Returns: cleaned_toks, word tokens present in the file after preprocessing
    '''

    #converting the text to lowercase
    ftext = file_text.lower()

    #performing word tokenization
    file_toks = word_tokenize(ftext)

    #removing the stopwords from tokens
    stop_words = list(set(stopwords.words('english')))
    file_toks = [tok for tok in file_toks if tok not in stop_words]

    #removing punctuation marks from tokens
    toks_no_punct = []
    for tok in file_toks:
        ctok = remove_punct(tok)
        if(ctok != ""):
            toks_no_punct.append(ctok)
    
    #Removing blank space tokens
    cleaned_toks = []
    for tok in toks_no_punct:
        ctok = remove_space(tok)
        if(ctok != ""):
            cleaned_toks.append(ctok)

    return cleaned_toks

def cleanQuery(query_text):
    '''
        Preprocessing the query text
        Input: query_text, string of the phrase query text
        Returns: cleaned_toks, an array containg the preprocessed query tokens
    '''

    #We perform the same preprocessing steps on the query as we did for the file text

    #converting the text to lowercase
    qtext = query_text.lower()
    
    #performing word tokenization
    query_toks = word_tokenize(qtext)
    
    #removing the stopwords from tokens
    stop_words = list(set(stopwords.words('english')))
    query_toks = [tok for tok in query_toks if tok not in stop_words]
    
    #removing punctuation marks from tokens
    toks_no_punct = []
    for tok in query_toks:
        ctok = remove_punct(tok)
        if(ctok != ""):
            toks_no_punct.append(ctok)
    
    #Removing blank space tokens
    cleaned_toks = []
    for tok in toks_no_punct:
        ctok = remove_space(tok)
        if(ctok != ""):
            cleaned_toks.append(ctok)
    
    return cleaned_toks

In [152]:
def read_file(fpaths):
    '''
        Reads the files and preprocess every file's text to form word tokens for every file.
        Returns a 2-D list containing word tokens for every file
    '''
    file_tokens = []
    for fpath in fpaths:
        f = open(fpath, 'r', encoding='utf-8', errors='ignore') #open the file
        ftxt_unprocessed = f.read() #read the text of the file
        ftoks = preprocess_file(ftxt_unprocessed) #preprocessing the text to form word tokens
        file_tokens.append(ftoks)
    return file_tokens


In [153]:
def getDocsFromID(docID_to_doc, doc_IDs):
    '''
        Given a list of document IDs, it outputs the document names corresponding to thos IDs.
        Input: docID_to_docs (mapping between docID -> doc_name), docIDs - list of input document IDs
        Returns: doc_names - list of doc_names corresponding to document IDs in doc_IDs
    '''
    doc_names = []
    for doc_ID in doc_IDs:
        doc_names.append(docID_to_doc[doc_ID])
    return doc_names

In [154]:
document_toks = read_file(file_paths)

In [155]:
vocabulary_set = set()
for doc_tok in document_toks:
    for tok in doc_tok:
        vocabulary_set.add(tok)
vocabulary_list = list(vocabulary_set)

In [156]:
def compute_jaccard_coeff(query_toks, doc_toks):
    query_tok_set = set(query_toks)
    doc_toks_set = set(doc_toks)
    num_intersection = len(list(query_tok_set & doc_toks_set))
    num_union = len(list(query_tok_set | doc_toks_set))
    jaccard = num_intersection / num_union
    return jaccard

def perform_jaccard_scoring(query_toks, document_toks):
    jaccard_coeff_values = {}
    for i in range(len(document_toks)):
        jaccard_coeff_i = compute_jaccard_coeff(query_toks, document_toks[i])
        jaccard_coeff_values[i] = jaccard_coeff_i
    return jaccard_coeff_values

def get_relevant_by_jaccard(jaccard_coeff_values):
    ranked_order_by_jaccard = dict(sorted(jaccard_coeff_values.items(), key=lambda item: item[1], reverse=True))
    top_5_docID = list(ranked_order_by_jaccard.keys())[:5]
    top_5_jaccard = list(ranked_order_by_jaccard.values())[:5]
    return top_5_docID, top_5_jaccard

def run_jaccard(query, document_toks):
    query_toks = cleanQuery(query)
    if(len(query_toks) == 0):
        print("no. of query tokens after preprocessing is 0. Jaccard coefficient with all documents is equal to 0")
        for i in range(5):
            print(f"{i + 1} : {docID_to_doc_mapping[i]} (0)") 
    jaccard_scores = perform_jaccard_scoring(query_toks, document_toks)
    print(jaccard_scores)
    top_5_doc_ids, top_5_jaccard_score = get_relevant_by_jaccard(jaccard_scores)
    top_5_doc_names = getDocsFromID(docID_to_doc_mapping, top_5_doc_ids)
    print(f"Query Text = {query}")
    print(f"Query tokens after preprocessing = {query_toks}")
    print(f"Top 5 relevant documents based on the value of the Jaccard coefficient : ")
    for i in range(len(top_5_doc_names)):
        print(f"{i + 1} : {top_5_doc_names[i]} ({top_5_jaccard_score[i]})") 

In [187]:
def compute_raw_term_frequency(document_toks):

    raw_term_freq = {}
    for i in range(len(document_toks)):
        raw_term_freq[i] = {}
        unique_toks, tok_freq = np.unique(document_toks[i], return_counts=True)
        for j in range(len(unique_toks)):
            raw_term_freq[i][unique_toks[j]] = tok_freq[j]
    return raw_term_freq
        # for term in document_toks[i]:
        #     if(term in raw_term_freq[i].keys()):
        #         raw_term_freq[i][term] += 1
        #     else:
        #         raw_term_freq[i][term] = 1
def generate_term_postings(document_toks):

    term_posting_lists = {}
    for i in range(len(document_toks)):
        for tok in document_toks[i]:
            if(tok not in term_posting_lists.keys()):
                term_posting_lists[tok] = [i]
            else:
                if(i not in term_posting_lists[tok]):
                    term_posting_lists[tok].append(i)
    return term_posting_lists

def compute_document_frequency(term_posting_lists):

    term_df = {}
    for term in term_posting_lists.keys():
        term_df[term] = len(term_posting_lists[term])
    return term_df

def compute_IDF(term_df, num_total_docs):

    term_idf = {}
    for term in term_df.keys():
        idf_value = np.log10(num_total_docs / (term_df[term] + 1))
        term_idf[term] = idf_value
    return term_idf

def compute_tf_weight(scheme, term, doc_tfs):

    if(scheme == "binary"):
        if(term in doc_tfs.keys()):
            return 1
        else:
            return 0
    elif(scheme == "raw_count"):
        if(term in doc_tfs.keys()):
            return doc_tfs[term]
        else:
            return 0
    elif(scheme == "term_frequency"):
        if(term in doc_tfs.keys()):
            total_terms = sum(doc_tfs.values())
            return doc_tfs[term] / total_terms
        else:
            return 0
    elif(scheme == "log_normalization"):
        if(term in doc_tfs.keys()):
            return np.log10(1 + doc_tfs[term])
        else:
            return 0
    elif(scheme == "double_normalization"):
        if(term in doc_tfs.keys()):
            t1 = 0.5
            t2 = (0.5)*(doc_tfs[term] / max(doc_tfs.values()))
            return t1 + t2
        else:
            return 0.5

def generate_tf_idf_matrices(document_toks, vocabulary_list):

    num_docs = len(list(document_toks))
    raw_term_freqs = compute_raw_term_frequency(document_toks)
    term_wise_postings = generate_term_postings(document_toks)
    term_document_freq = compute_document_frequency(term_wise_postings)
    term_idfs = compute_IDF(term_document_freq, num_docs)
    num_words = len(vocabulary_list)
    tf_idf_by_scheme = {}
    schemes_list = ['binary', 'raw_count', 'term_frequency', 'log_normalization', 'double_normalization']
    for scheme in schemes_list:
        print(f"Generating for scheme : {scheme}")
        tf_idf_mat = pd.DataFrame(np.zeros((num_docs, num_words)), columns=vocabulary_list)
        for i in tqdm(range(num_docs)):
            # print(f"for doc {i}")
            for term in vocabulary_list:
                tf_weight = compute_tf_weight(scheme, term, raw_term_freqs[i])
                idf = term_idfs[term]
                tf_idf_mat.iloc[i][term] = tf_weight * idf
        tf_idf_by_scheme[scheme] = tf_idf_mat
    return tf_idf_by_scheme
        


In [188]:
tf_idf_mat_by_scheme = generate_tf_idf_matrices(document_toks, vocabulary_list)

Generating for scheme : binary


100%|██████████| 1133/1133 [1:33:53<00:00,  4.97s/it]


Generating for scheme : raw_count


100%|██████████| 1133/1133 [1:16:28<00:00,  4.05s/it]


Generating for scheme : term_frequency


100%|██████████| 1133/1133 [1:31:42<00:00,  4.86s/it]


Generating for scheme : log_normalization


100%|██████████| 1133/1133 [1:18:30<00:00,  4.16s/it]


Generating for scheme : double_normalization


100%|██████████| 1133/1133 [1:13:18<00:00,  3.88s/it]


In [189]:
tf_idf_file = open("tf_idf_matrices", "wb")
pickle.dump(tf_idf_mat_by_scheme, tf_idf_file)

In [195]:
tf_idf_mat_by_scheme['raw_count'].head(10)

Unnamed: 0,unrecycled,gripology,sysadmin,perchlorate,waterfountain,fungus,samwich,41364,meatandtwoveg,bosix,...,barnum,recalling,012287,superbabs,devoisters,resister,wpa,18914,44inch,freeware
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# def perform_ranking_by_tf_idf(tf_idf_mat_by_scheme, query):
#     schemes_list = ['binary', 'raw_count', 'term_frequency', 'log_normalization', 'double_normalization']
#     query_toks = cleanQuery(query)
#     for scheme in schemes_list:
#         score_with_doc = {}
#         for i in range(len(document_toks)):
            

In [196]:
num_docs = len(list(document_toks))
raw_term_freqs = compute_raw_term_frequency(document_toks)
term_wise_postings = generate_term_postings(document_toks)
term_document_freq = compute_document_frequency(term_wise_postings)
term_idfs = compute_IDF(term_document_freq, num_docs)
num_words = len(vocabulary_list)
# tf_idf_by_scheme = {}
schemes_list = ['raw_count']
for scheme in schemes_list:
    print(f"Generating for scheme : {scheme}")
    tf_idf_mat = pd.DataFrame(np.zeros((num_docs, num_words)), columns=vocabulary_list)
    for i in tqdm(range(num_docs)):
        # print(f"for doc {i}")
        for term in vocabulary_list:
            tf_weight = compute_tf_weight(scheme, term, raw_term_freqs[i])
            idf = term_idfs[term]
            tf_idf_mat.iloc[i][term] = tf_weight * idf
    tf_idf_mat_by_scheme[scheme] = tf_idf_mat
# return tf_idf_by_scheme

Generating for scheme : raw_count


100%|██████████| 1133/1133 [1:19:06<00:00,  4.19s/it]
