# Gensim LDA Topic Modelling Evaluation on Link data

In [1]:
from pre_process import *

Loading Libraries...Just a moment please!...


[nltk_data] Downloading package stopwords to /Users/gnx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/gnx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Path for the Source data files

In [4]:
path = '/Users/gnx/MDS/Project-Indeed/data/'

### Evaluation specific variable declaration

In [5]:
#Loading the data
art_train = load_data(f"{path}article.json")
art_test = load_data(f"{path}pred_article.json")
all_article = pd.concat([art_train, art_test])
cp_train = load_data(f"{path}careerpathpage.json")
cl_train = load_data(f"{path}coverletter.json")
res_train = load_data(f"{path}resumesamplepage.json")

#Creating content title column which is a combination of title and content
all_article["cont_title"] = all_article["contentTitle"] + " " +all_article["content"]
art_test["cont_title"] = art_test["contentTitle"] + " " +art_test["content"]
cp_train["cont_title"] = cp_train["h1"] + " " + cp_train["primaryContent"]
cl_train["cont_title"] = cl_train["title"] + " " + cl_train["contentA"] + " " +cl_train["contentB"]
res_train["cont_title"] = res_train["title"] + " " + res_train["contentA"] +" "+res_train["contentB"]

#Removing hyperlinks and html tags from the corpus
rltd_arts, all_article = get_corpus(all_article, "cont_title")
art_test = remove_hyperlinks_html_tags(art_test, "cont_title")
cp_train = remove_hyperlinks_html_tags(cp_train, "cont_title")
cl_train = remove_hyperlinks_html_tags(cl_train, "cont_title")
res_train  = remove_hyperlinks_html_tags(res_train, "cont_title")

In [6]:

#Creating a list of cont_title, titles and url_routes
cont = all_article["cont_title"].tolist() + cp_train["cont_title"].tolist() + cl_train["cont_title"].tolist() + res_train["cont_title"].tolist()
all_title = all_article["contentTitle"].tolist() + cp_train["h1"].tolist() + cl_train["title"].tolist() + res_train["title"].tolist()
url_route = all_article["urlRoute"].tolist() + cp_train["urlRoute"].tolist() + cl_train["urlRoute"].tolist() + res_train["urlRoute"].tolist()

#Getting all document ids
art_train_ids = [id_["$oid"] for id_ in art_train["_id"].tolist()]
art_test_ids = [id_["$oid"] for id_ in art_test["_id"].tolist()]
cp_train_ids = [id_["$oid"] for id_ in cp_train["_id"].tolist()]
cl_train_ids = [id_["$oid"] for id_ in cl_train["_id"].tolist()]
res_train_ids = [id_["$oid"] for id_ in res_train["_id"].tolist()]
all_ids = art_train_ids + art_test_ids + cp_train_ids + cl_train_ids + res_train_ids

#We are concatenating category and url column for loading link data and avoiding any duplicates 
all_article["cat_url"] = all_article["category"]+ "|" + all_article["urlRoute"] 
cp_train["cat_url"] = "careers|" + cp_train["urlRoute"]
cl_train["cat_url"] = "career-advice|" + cl_train["urlRoute"]
res_train["cat_url"] = res_train["category"]+ "|" + res_train["urlRoute"] 

#Creating a list of all category url data
cat_url = all_article["cat_url"].tolist() + cp_train["cat_url"].tolist() + cl_train["cat_url"].tolist() + res_train["cat_url"].tolist()

#Creating respective dictionary required for processing the link data
# i2title_lookup = {i:title for i, title in enumerate(all_title)}
url2i_lookup = {url:i for i, url in enumerate(cat_url)}
id2index = {id:i for i, id in enumerate(all_ids)}
id2index_rev = {i:id for i, id in enumerate(all_ids)}
id2title = {id:title for id, title in zip(all_ids, all_title)}
# title2i_lookup = {title:i for i, title in enumerate(all_title)}

In [7]:
assert len(set(all_ids)) == len(set(cat_url)) == len(set(url2i_lookup)) == len(set(id2title))

In [9]:
print("# of related articles:", len(rltd_arts))

# of related articles: 14862


## Topic distribution embeddings

In [15]:
f = open(f"{path}pred_article.json", encoding="utf-8")
test_train = []
for line in f:
    test_train.append(json.loads(line))
test_train = pd.DataFrame(pd.DataFrame(test_train))

In [23]:
rltd_arts, art_train = get_corpus(art_train, "content")
cl_train = remove_hyperlinks_html_tags(cl_train, "contentA")
res_train  = remove_hyperlinks_html_tags(res_train, "contentA")

In [24]:
cp_train  = remove_hyperlinks_html_tags(cp_train, "primaryContent")

In [34]:
from gensim.utils import simple_preprocess
def strip_newline(series):
    """
    stripping out new line
    """                    
    return [review.replace('\n','') for review in series]
def sent_to_words(sentences):
    """
    Converting sentence to words
    """                
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    """
    Used for removing stopwords
    """            
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    """
    Find the bigrams in the content of the article
    """        
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def get_bigram(df):
    if 'content' in df:
        df_content = strip_newline(df.content)
    elif 'contentA' in df:
        df_content = df['contentA'] + ' ' + df['contentB']
    elif 'primaryContent' in df:
        df_content = df['primaryContent']
    words = list(sent_to_words(df_content))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram

In [35]:
art_bigram = get_bigram(art_train)
cp_bigram = get_bigram(cp_train)
cl_bigram = get_bigram(cl_train)
res_bigram = get_bigram(res_train)

In [39]:
# prediction on new articles
test_art_bigram = get_bigram(test_train)

### Get the model (same parameters we used for best models)

In [38]:
from gensim.corpora.dictionary import Dictionary
train_dict = Dictionary(art_bigram)
train_corpus = [train_dict.doc2bow(text) for text in art_bigram]
lda_30 = gensim.models.LdaModel(train_corpus,
                            num_topics=30,
                            id2word=train_dict,
                            chunksize=100,
                            random_state=10,
                            passes=20,
                            alpha=0.01)

In [40]:
cp_corpus = [train_dict.doc2bow(text) for text in cp_bigram]
cl_corpus = [train_dict.doc2bow(text) for text in cl_bigram]
res_corpus = [train_dict.doc2bow(text) for text in res_bigram]

In [41]:
# prediction on new articles
test_corpus = [train_dict.doc2bow(text) for text in test_art_bigram]

### Prepare for Topic distribution embeddings

In [42]:
urlRoute_set = set()
urlRoute_list = []
for urlRoute in art_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in cp_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in cl_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in res_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in test_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)

We used the best model from above to generate the document distributions over the 30 topics. Frist thing we need to do is to pad those topics with 0 probability, so that the matrix we come up with will have consistant dimension.

In [43]:
# Padding the document distributions over all the (30) topics
def padding_matrix(model, train_corpus):
    padding_dist_matrix = []
    for i, dist in enumerate(model.get_document_topics(train_corpus)):
        padding_dist_row = []
        if len(dist) < 30:
            dist_id = [topic_id for topic_id, prob in dist]
            dist_id_prob_dict = {topic_id:prob for topic_id, prob in dist}
            for i in range(30):
                if i not in dist_id:
                    padding_dist_row.append(0)
                elif i in dist_id:
                    padding_dist_row.append(dist_id_prob_dict[i])
            padding_dist_matrix.append(padding_dist_row)
        else:
            prob_list = [prob for topic_id, prob in dist]
            padding_dist_matrix.append(prob_list)
    return padding_dist_matrix

In [44]:
art_padding_matrix = padding_matrix(lda_30, train_corpus)
cp_padding_matrix = padding_matrix(lda_30, cp_corpus)
cl_padding_matrix = padding_matrix(lda_30, cl_corpus)
res_padding_matrix = padding_matrix(lda_30, res_corpus)

In [45]:
# prediction on new articles
test_padding_matrix = padding_matrix(lda_30, test_corpus)

In [46]:
general_urlRoute = [art_train['urlRoute'].tolist(), cp_train['urlRoute'].tolist(), cl_train['urlRoute'].tolist(), res_train['urlRoute'].tolist(), test_train['urlRoute'].tolist()]
general_embedding_dict = [art_padding_matrix, cp_padding_matrix, cl_padding_matrix, res_padding_matrix, test_padding_matrix]

In [47]:
urlRoute_embedding_dict = defaultdict(list)
count = 0
for i, matrix in enumerate(general_embedding_dict):
    for j, doc_embedding in enumerate(matrix):
        urlRoute_embedding_dict[general_urlRoute[i][j]].append(general_embedding_dict[i][j])

### make sure the embedding is on the right order

In [48]:
from tqdm import tqdm
import sklearn

In [49]:
urlRoute_embeddings = defaultdict(list)
count = 0
for urlRoute in urlRoute_list:
    
    if len(urlRoute_embedding_dict[urlRoute]) == 1:
        urlRoute_embeddings[urlRoute] = urlRoute_embedding_dict[urlRoute]
        count += 1
    elif len(urlRoute_embedding_dict[urlRoute]) == 2:
        continue

In [50]:
embedding_rep = []
for embeddings in urlRoute_embedding_dict:
    row_embedding = []
    for embedding in urlRoute_embedding_dict[embeddings]:
        row_embedding.append(embedding)
    embedding_rep.extend(row_embedding)

In [51]:
embedding_rep = np.array(embedding_rep)
embedding_rep.shape

(16281, 30)

### Cosine similarity scores for topic distribution embeddings

In [52]:
np_rep = np.zeros([len(embedding_rep), len(embedding_rep)]) 
# numpy vector representation on cos_sim
for i, row in tqdm(enumerate(np_rep)):

    sent1 = embedding_rep[i]
    sent2_list = embedding_rep
    cosine_sim = sklearn.metrics.pairwise.cosine_similarity([sent1], sent2_list)
    np_rep[i] = cosine_sim[0]

16281it [01:25, 190.45it/s]


In [53]:
np_rep

array([[1.        , 0.14060841, 0.2673521 , ..., 0.14129131, 0.5961785 ,
        0.90125181],
       [0.14060841, 1.        , 0.03289206, ..., 0.05985334, 0.19436421,
        0.16844442],
       [0.2673521 , 0.03289206, 1.        , ..., 0.05523276, 0.44670024,
        0.19337641],
       ...,
       [0.14129131, 0.05985334, 0.05523276, ..., 1.        , 0.07953608,
        0.16975831],
       [0.5961785 , 0.19436421, 0.44670024, ..., 0.07953608, 1.        ,
        0.58775112],
       [0.90125181, 0.16844442, 0.19337641, ..., 0.16975831, 0.58775112,
        1.        ]])

### Load Link Data

1. Group by `sent1`
2. Padding the dataframe with 0 visit articles.
3. take the average cosine similarity score.

In [55]:
all_files = [f'{path}Pageview_matrix_20210511.csv']
print(f"Link files loaded to the system are:\n {all_files[0]}")

linkdf = pd.read_csv(all_files[0], index_col=None, header=0)
link1 = linkdf["link_1"].apply(lambda x: "/".join(x.split("/")[-2:]))
link2 = linkdf["link_2"].apply(lambda x: "/".join(x.split("/")[-2:]))
visit = linkdf["visitor_count"]

linkdf = pd.DataFrame(link1)
linkdf["link_2"] = link2
linkdf["visitor_count"] = visit

Link files loaded to the system are:
 /Users/gnx/MDS/Project-Indeed/data/Pageview_matrix_20210511.csv


In [56]:
def link_csv(df, type="ids"):
    """
    Transforming links in the link dataframe to title names
    """
    links1 = []
    links2 = []    
    link_dict = defaultdict(list)
    for i, (l1, l2, v) in enumerate(zip(df["link_1"], df["link_2"], df["visitor_count"])):
        if i % 1 == 0:
            print(f"Number of records processed : {i+1}", end="\r")  
        l1 = "|".join(l1.split("/")[-2:])
        l2 = "|".join(l2.split("/")[-2:])
        if l1 in cat_url and l2 in cat_url:
            index = url2i_lookup[l1]
            link_dict["link1"].append(id2index_rev[index]) if type == "ids" else link_dict["link1"].append(id2title[id2index_rev[index]])

            index = url2i_lookup[l2]            
            link_dict["link2"].append(id2index_rev[index]) if type == "ids" else link_dict["link2"].append(id2title[id2index_rev[index]])
            link_dict["visit"].append(int(v))                    
        else:
            continue
    return link_dict

In [57]:
print("Load Link file ids:\n")
final_dict = link_csv(linkdf, "ids")

Load Link file ids:

Number of records processed : 194374

In [58]:
print("Load Link file titles:\n")
final_dict_ = link_csv(linkdf, "title")

Load Link file titles:

Number of records processed : 194374

In [59]:
link_df = pd.DataFrame.from_dict(final_dict)
link_df

Unnamed: 0,link1,link2,visit
0,5ceeecc3f6610e04a9091b36,5ceeecc3f6610e04a9091a90,2083
1,5ceeecc3f6610e04a9091b1f,5ceeecc3f6610e04a9091ad8,428
2,5ceeecc3f6610e04a9091b36,5ceeecc3f6610e04a9091ac2,322
3,5ceeecc3f6610e04a9091b1f,5ceeecc3f6610e04a9091a90,307
4,5ceeecc3f6610e04a9091b1f,5ceeecc3f6610e04a9091ac2,295
...,...,...,...
190681,5ceeecc3f6610e04a9091a6a,5e68fba1ca80a8004009e2e7,2
190682,60944b81bd2003003b6cbac1,6094460cbd2003003b6cb96c,2
190683,60944783bd2003003b6cb9d6,6094475b01df0700449da41f,2
190684,5e0fbce6faa4a8dd50bf3ca9,5e0fbbb2faa4a8dd50bf3990,2


In [60]:
link_df_titles = pd.DataFrame.from_dict(final_dict_)
link_df_titles

Unnamed: 0,link1,link2,visit
0,List of Weaknesses: 10 Things To Say in an Interview,39 Strengths and Weaknesses to Discuss in a Job Interview,2083
1,125 Common Interview Questions and Answers (With Tips),21 Job Interview Tips: How To Make a Great Impression,428
2,List of Weaknesses: 10 Things To Say in an Interview,"How to Answer ""Tell Me About Yourself"" (Tips and Example Answers)",322
3,125 Common Interview Questions and Answers (With Tips),39 Strengths and Weaknesses to Discuss in a Job Interview,307
4,125 Common Interview Questions and Answers (With Tips),"How to Answer ""Tell Me About Yourself"" (Tips and Example Answers)",295
...,...,...,...
190681,"Interview Question: ""What are You Passionate About?""",10 Closing Statements to Use After an Interview,2
190682,How to Get an Undergraduate Law Internship,8 Steps To Write a Daily Construction Report (With Tips),2
190683,How To Become Construction Estimator in 3 Steps,How To Become an Art Therapist (With 7 Steps),2
190684,How To Write an Application Letter (With Examples),Self-Introduction Tips and Tricks (with Examples),2


In [61]:
def compute_ratios(data, cosine_scores):
    """
    Compute Visit to non-visit ratios
    """
    visit = cosine_scores * data
    visit_avg = np.sum(visit)/np.sum(data)
    data_inv = 1 - data
    non_visit = cosine_scores * data_inv
    non_visit_avg = np.sum(non_visit)/np.sum(data_inv)

    return visit_avg/non_visit_avg

In [62]:
avg_np_rep = np.mean(np_rep, axis = 0)

In [63]:
avg_np_rep

array([0.22591075, 0.16661671, 0.2104535 , ..., 0.20449436, 0.25713239,
       0.24102602])

In [64]:
len(np_rep[0])

16281

### Compute ratio for topic distribution and Tf-idf

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
def generate_link_data(title, all_title, filter_cond, df, lookup1, lookup2):
    """
    Generate link data in matrix for vectorization
    """    
    link_data = np.zeros((len(title),len(all_title)))
    print(link_data.shape)
    df = df[df["visit"]>filter_cond]
    for i, (s1, s2, v) in enumerate(zip(df["link1"], df["link2"], df["visit"])):
        if i % 1 == 0:
            print(f"Number of records processed : {i+1}", end="\r")  
        link_data[lookup1[s1]][lookup2[s2]]=1
    return link_data

In [68]:
content = []
general_train_df = [art_train, cp_train, cl_train, res_train, test_train]
for df in general_train_df:
    if 'content' in df:
        content.extend(df['contentTitle'] + ' ' + df['content'])
    elif 'contentA' in df:
        content.extend(df['title'] + ' ' + df['contentA'] + ' ' + df['contentB'])
    elif 'primaryContent' in df:
        content.extend(df['h1'] + ' ' + df['primaryContent'])

In [69]:
tfidf = TfidfVectorizer()
vecs = tfidf.fit_transform(content)
tfidf_cosine_score = sklearn.metrics.pairwise.cosine_similarity(vecs)

In [70]:
tfidf_cosine_score.shape

(16281, 16281)

In [71]:
np.random.seed(seed=42)

np.random.shuffle(all_ids)

train = all_ids[:int(len(all_ids)*0.8)]
test = all_ids[int(len(all_ids)*0.8):]

print(f"Number of ids allocated to train: {len(train)}")
print(f"Number of ids allocated to test: {len(test)}")

Number of ids allocated to train: 13024
Number of ids allocated to test: 3257


In [72]:
train_links = set(link_df["link1"]) & set(train)
test_links = set(link_df["link1"]) & set(test)

print(f"Number of train title in link dataframe: {len(train_links)}")
print(f"Number of test title in link dataframe: {len(test_links)}")

Number of train title in link dataframe: 2975
Number of test title in link dataframe: 707


In [73]:
assert len(train_links) + len(test_links) == len(set(link_df["link1"]) & set(train)) + len(set(link_df["link1"]) & set(test))

In [75]:
tcs = np.array([id2index[t]for t in train])
print(f"Train ids : {tcs}")
assert tcs.shape[0] == len(train)

train_cs = np_rep[tcs]
print(f"Train cosine scores shape: {train_cs.shape}")
assert tcs.shape[0] == train_cs.shape[0]

train_tf = tfidf_cosine_score[tcs]
print(f"Train TF-IDF scores shape: {train_tf.shape}")
assert tcs.shape[0] == train_cs.shape[0]

Train ids : [13633  1921 12140 ... 15436 15768  2376]
Train cosine scores shape: (13024, 16281)
Train TF-IDF scores shape: (13024, 16281)


In [76]:
tcs = np.array([id2index[t]for t in test])
print(f"Test ids : {tcs}")
assert tcs.shape[0] == len(test)

test_cs = np_rep[tcs]
print(f"Test cosine scores shape: {test_cs.shape}")
assert tcs.shape[0] == test_cs.shape[0]

test_tf = tfidf_cosine_score[tcs]
print(f"Test TF-IDF scores shape: {test_tf.shape}")
assert tcs.shape[0] == test_tf.shape[0]

Test ids : [ 8551  3839 15427 ...   860 15795  7270]
Test cosine scores shape: (3257, 16281)
Test TF-IDF scores shape: (3257, 16281)


In [77]:
dict_ = defaultdict(list)

for i in train_links:
    l2 = link_df[link_df["link1"]==i]["link2"]
    visit = link_df[link_df["link1"]==i]["visit"]
    for j, (l2, v) in enumerate(zip(l2, visit)):
        dict_["link1"].append(i)
        dict_["link2"].append(l2)
        dict_["visit"].append(v)     
train_df = pd.DataFrame.from_dict(dict_)
train_df

Unnamed: 0,link1,link2,visit
0,5ddc2ce62961cc24bf00cc13,5e501b5ff3fb2a9df936da88,16
1,5ddc2ce62961cc24bf00cc13,5e5021f9f3fb2a9df936e4ce,4
2,5ddc2ce62961cc24bf00cc13,5d4afe1eb58182000db71723,2
3,5ddc2ce62961cc24bf00cc13,5d9cf3b96ad6fb21399ac7c8,2
4,5ceeea9e2102d07b1f75d98e,5ceeeaa02102d07b1f75da2a,5
...,...,...,...
156779,5e973a16818be7003da8987a,5e28993f245be46e19a862f8,2
156780,5e973a16818be7003da8987a,5ceeecc3f6610e04a9091ac2,2
156781,5e973a16818be7003da8987a,5ddc2ce62961cc24bf00cc15,2
156782,5e973a16818be7003da8987a,5ceeecc3f6610e04a9091ad8,2


In [78]:
dict_ = defaultdict(list)

for i in test_links:
    l2 = link_df[link_df["link1"]==i]["link2"]
    visit = link_df[link_df["link1"]==i]["visit"]
    for j, (l2, v) in enumerate(zip(l2, visit)):
        dict_["link1"].append(i)
        dict_["link2"].append(l2)
        dict_["visit"].append(v)    
test_df = pd.DataFrame.from_dict(dict_)
test_df

Unnamed: 0,link1,link2,visit
0,5ddc2ce82961cc24bf00cc65,5fca8edd77a23d004b756717,4
1,5ceeeaa02102d07b1f75da34,5ceeeaa02102d07b1f75da01,4
2,5ceeeaa02102d07b1f75da34,5ceeea9e2102d07b1f75d959,3
3,5ceeeaa02102d07b1f75da34,5ceeeaa12102d07b1f75da78,3
4,5ceeeaa02102d07b1f75da34,5ceeeaa22102d07b1f75daab,3
...,...,...,...
33897,5df3e978ac70c8d478e7e9e9,5ceeecc3f6610e04a9091afc,2
33898,5e0fbc83faa4a8dd50bf3bb1,5df29f07ac70c8d478e7debc,4
33899,5e0fbc83faa4a8dd50bf3bb1,5ceeecc3f6610e04a9091b03,3
33900,5e0fbc83faa4a8dd50bf3bb1,5ceeecc3f6610e04a9091a5e,2


In [79]:
assert set(train_df["link1"]) != set(test_df["link1"])

In [84]:
i=5
train_lookup = {id:i for i, id in enumerate(train)}

train_link_data = generate_link_data(train, all_ids, i, train_df, train_lookup, id2index)

print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Topic distribution: {compute_ratios(train_link_data, train_cs)}
TF-IDF: {compute_ratios(train_link_data, train_tf)}
""")

(13024, 16281)
Number of records processed : 4767

filter condition: visit count > 5
Ratio of visit to non visit:
Topic distribution: 1.1785510766180431
TF-IDF: 2.3204042911456417



In [85]:
i=5
test_lookup = {id:i for i, id in enumerate(test)}

test_link_data = generate_link_data(test, all_ids, i, test_df, test_lookup, id2index)
print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Topic distribution: {compute_ratios(test_link_data, test_cs)}
TF-IDF: {compute_ratios(test_link_data, test_tf)}
""")

(3257, 16281)
Number of records processed : 1365

filter condition: visit count > 5
Ratio of visit to non visit:
Topic distribution: 1.2579866110610374
TF-IDF: 2.3235231681007384



In [86]:
i=0
train_lookup = {id:i for i, id in enumerate(train)}

train_link_data = generate_link_data(train, all_ids, i, train_df, train_lookup, id2index)

print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Topic distribution: {compute_ratios(train_link_data, train_cs)}
TF-IDF: {compute_ratios(train_link_data, train_tf)}
""")

(13024, 16281)
Number of records processed : 156784

filter condition: visit count > 0
Ratio of visit to non visit:
Topic distribution: 1.4147146175431986
TF-IDF: 1.1535592876764207



In [87]:
i=0
# test_lookup = {title:i for i, title in enumerate(test)}
test_lookup = {id:i for i, id in enumerate(test)}

test_link_data = generate_link_data(test, all_ids, i, test_df, test_lookup, id2index)
print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Topic distribution: {compute_ratios(test_link_data, test_cs)}
TF-IDF: {compute_ratios(test_link_data, test_tf)}
""")


(3257, 16281)
Number of records processed : 33902

filter condition: visit count > 0
Ratio of visit to non visit:
Topic distribution: 1.3395919218947
TF-IDF: 1.2363551866243445

