This notebook is to test how to use the function in library.

In [1]:
import pandas as pd
import numpy as np
import os
import random
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from association_mining import *
import utils


In [2]:
data_root = os.path.join(os.getcwd(),'input')

In [3]:
df = utils.load_data_from_local(data_root)

Meta data size: 9022
c:\Users\pywong\Desktop\PolyU\COMP5434 Big Data Computing\Assignment\COMP5434-Big-Data-Computing\input\subset\subset\document_parses\pdf_json
total json files: 12000


100%|██████████| 12000/12000 [00:53<00:00, 225.63it/s]
100%|██████████| 8083/8083 [00:39<00:00, 207.11it/s]
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pywong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
feature_size = 2**12
random.seed(10)

In [5]:
count_vectorizer = CountVectorizer(max_features=feature_size)
X_count = count_vectorizer.fit_transform(df['processed_text'].values).toarray().T

Sample script to use the function

In [16]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import gensim.downloader as api



def get_target_document_index(X,token_list,target_word_list,model_name='glove-wiki-gigaword-100',top_feature=20,top_document=100):
    
    '''
    Compare each target word to all token used as feature and find the top related document index.
    Using gensim model for semantic analysis by default using glove-wiki-gigaword-100.
    Return a dictionary of most related documents index for each target word.
    
    Argument:
    
    - X:
    A numpy array of characteristic matrix with row as number of feature and column as number of document. 
    Recommend to use the token count as feature.
    
    - token_list:
    A list of string that use to build the feature of the characteristic matrix.
    
    - target_word_list:
    A list of string that all token compare with.
    Each target word can be any string that is not in the token_list but must be in the model.
    
    - model_name:
    A string of model name that use to give a similarity of score between the target worad and the token used.
    Must be a valid model name that can be called by using Gensim API.
    
    - top_feature:
    An integer number of most related features compare with the target word.
    
    - top_document:
    An integer number of most related documents compare with the target word.
    '''
    
    print(f'Using Model: {model_name}\n')
    
    model = api.load(model_name)
    
    # Check all target words are in the model
    for target_word in target_word_list:
        if target_word not in model:
            ValueError(f'Argument: target_word is not in model {model_name}')
            
    # Check input top_feature and token_list size
    if len(token_list) <= top_feature:
        print(f'Warning: Argument top_feature ({top_feature}) is greater than or equal to token_list size ({len(token_list)})')
        print('No feature filtering will be performed.\n')
        
    # Check the proportion of token that is not in model.
    token_not_exist = 0
    for i in token_list:
        if i not in model:
            token_not_exist += 1
            
    print(f'Token not in model: {token_not_exist}/{len(token_list)} ({round(token_not_exist/(len(token_list))*100,1)}%)')
    

    # Create a result dictionary for all target words
    
    result_dict = {}
    for i in range(0,len(target_word_list)):
        
        target_word = target_word_list[i]
        print(f'Comparing target word {i}/{len(target_word_list)}: {target_word}')
        
        token_score_df = get_target_word_similarity(model,token_list,target_word)
        doc_to_keep = get_most_similar_document(X,token_score_df,token_list,top_feature=top_feature,top_document=top_document)
        result_dict[target_word] = doc_to_keep
        
    return result_dict

        
def get_target_word_similarity(model,token_list,target_word):
    '''
    Return the similarity score for all tokens used as feature compare with the target word.
    Using cosine similarity by default.
    '''
    
    target_word_feature = model[target_word]
    target_word_norm = np.linalg.norm(target_word_feature)
    
    # List of tokne used to build feature
    # token_list = list(vectorizer.get_feature_names_out())
    
    token_score_list = []
    
    # token_exist = 0
    # token_not_exist = 0
    
    # Use cosine similarity between the target word and the token using feature created from gensim model.
    for token in token_list:
        if token in model:
            token_norm = np.linalg.norm(model[token])
            score = (model[token] @ target_word_feature.T)/(token_norm*target_word_norm)
            # token_exist += 1
        else:
            score = None
            # token_not_exist += 1
            
        token_score_list.append(score)

    # print(f'Number of token exists in model:{token_exist}/{token_exist+token_not_exist} ({round(token_exist/(token_exist+token_not_exist)*100,1)}%)')

    df = pd.DataFrame({'token':token_list,'score':token_score_list})
    df = df.sort_values(by=['score'],ascending=False)
    df = df.fillna(0)
    
    return df

    
def get_most_similar_document(X,token_score_df,token_list,top_feature=20,top_document=100):
    
    '''
    - X:
    A 2D numpy array characteristic matrix with number of token as row, number of document as column.
    Prefer using count of token occurrence as feature.
    
    - token_score_df:
    A pandas dataframe with the similarity score for each token compare with the target word.
    
    - top_feature:
    An integer number of most related features compare with the target word.
    
    - top_document:
    An integer number of most related documents compare with the target word.
    
    Return a list of document number with zero based index.
    The list is truncated by the size of top_document.
    The document index is in the descending order of similarity score followed by the document index in the characteristic matrix.
    '''
    
    keep_feature = list(token_score_df.head(top_feature)['token'])
    
    # Loop through all token in characteristic matrix to find the feature to keep
    token_index_list = []
    for i in range(0,len(token_list)):
        if token_list[i] in keep_feature:
            token_index_list.append(i)
            
    # Filter to only contain the most related features
    X_modify = X[token_index_list,:]
    
    # Sum all remaining feature values to create a score for each document
    doc_score = np.sum(X_modify,axis=0)
    doc_rank = ss.rankdata(doc_score,method='min')
    
    distinct_rank = list(set(doc_rank))
    distinct_rank.sort()
    
    doc_to_keep = []

    # When multiple documents have the same rank, it may exceed the number of top document limit.
    # Put the document with the lowest rank into the list first.
    # Cap the list size with the number of top document limit.
    for j in distinct_rank:

        for i in range(0,len(doc_rank)):
            rank = doc_rank[i]

            if rank == j and len(doc_to_keep) < top_document:
                doc_to_keep.append(i)
                if len(doc_to_keep) >= top_document:
                    break

    return doc_to_keep

In [8]:
target_word_list = ['vaccination','symptom','critical','infection','bitcoin'] # not in feature but in model
token_list = list(count_vectorizer.get_feature_names_out())

In [17]:
index_dict = get_target_document_index(X_count,token_list,target_word_list)
index_dict

Using Model: glove-wiki-gigaword-100

Token not in model: 28/4096 (0.7%)
Comparing target word 0/5: vaccination
Comparing target word 1/5: symptom
Comparing target word 2/5: critical
Comparing target word 3/5: infection
Comparing target word 4/5: bitcoin


{'vaccination': [2,
  11,
  16,
  42,
  72,
  92,
  97,
  110,
  127,
  139,
  146,
  149,
  152,
  153,
  175,
  185,
  194,
  229,
  236,
  248,
  262,
  264,
  272,
  277,
  279,
  289,
  306,
  307,
  322,
  338,
  354,
  361,
  376,
  381,
  390,
  409,
  410,
  411,
  434,
  441,
  456,
  468,
  473,
  478,
  481,
  483,
  492,
  495,
  499,
  506,
  511,
  521,
  525,
  530,
  544,
  559,
  560,
  562,
  566,
  582,
  583,
  585,
  605,
  609,
  613,
  617,
  636,
  650,
  651,
  659,
  665,
  675,
  691,
  715,
  733,
  745,
  760,
  761,
  765,
  772,
  773,
  774,
  778,
  788,
  799,
  801,
  807,
  808,
  809,
  818,
  831,
  842,
  843,
  846,
  847,
  877,
  894,
  902,
  903,
  906],
 'symptom': [8,
  11,
  47,
  49,
  72,
  91,
  110,
  111,
  117,
  143,
  151,
  174,
  185,
  194,
  203,
  206,
  212,
  215,
  229,
  243,
  246,
  252,
  272,
  279,
  289,
  290,
  297,
  306,
  316,
  324,
  334,
  338,
  344,
  354,
  360,
  361,
  376,
  377,
  381,
  387,
  394,
 

In [15]:
index_dict['bitcoin']

[1,
 4,
 5,
 8,
 9,
 10,
 11,
 13,
 15,
 18,
 19,
 30,
 31,
 37,
 38,
 40,
 44,
 47,
 49,
 50,
 56,
 57,
 58,
 59,
 60,
 64,
 68,
 69,
 72,
 75,
 76,
 81,
 83,
 84,
 95,
 96,
 97,
 101,
 102,
 103,
 104,
 107,
 108,
 110,
 115,
 122,
 123,
 124,
 126,
 127,
 132,
 133,
 134,
 137,
 140,
 142,
 143,
 149,
 151,
 153,
 155,
 160,
 161,
 163,
 165,
 166,
 167,
 169,
 170,
 171,
 172,
 174,
 176,
 177,
 178,
 181,
 185,
 186,
 188,
 189,
 192,
 193,
 195,
 198,
 199,
 200,
 204,
 207,
 212,
 213,
 214,
 215,
 216,
 222,
 224,
 226,
 229,
 231,
 236,
 238]