Evaluations of Sentence Transformers model, TF-IDF model and Gensim LDA topic model representation on Link dataset
--

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!cp /content/gdrive/MyDrive/indeed/import_lib_n_functions.py .
!cp /content/gdrive/MyDrive/indeed/pre_process.py .

Importing Libraries
--

In [5]:
from pre_process import *

Loading Libraries...Just a moment please!...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Path for the Source data files
--

In [6]:
path = "/content/gdrive/MyDrive/Career Guide Public Content/"

Evaluation specific variable declaration
--

In [7]:
#Loading the data
art_train = load_data(f"{path}article.json")
art_test = load_data(f"{path}article_20210503_to_20210525.json")
all_article = pd.concat([art_train, art_test])
cp_train = load_data(f"{path}careerpathpage.json")
cl_train = load_data(f"{path}coverletter.json")
res_train = load_data(f"{path}resumesamplepage.json")

#Creating content title column which is a combination of title and content
all_article["cont_title"] = all_article["contentTitle"] + " " +all_article["content"]
art_test["cont_title"] = art_test["contentTitle"] + " " +art_test["content"]
cp_train["cont_title"] = cp_train["h1"] + " " + cp_train["primaryContent"]
cl_train["cont_title"] = cl_train["title"] + " " + cl_train["contentA"] + " " +cl_train["contentB"]
res_train["cont_title"] = res_train["title"] + " " + res_train["contentA"] +" "+res_train["contentB"]

#Removing hyperlinks and html tags from the corpus
rltd_arts, all_article = get_corpus(all_article, "cont_title")
art_test = remove_hyperlinks_html_tags(art_test, "cont_title")
cp_train = remove_hyperlinks_html_tags(cp_train, "cont_title")
cl_train = remove_hyperlinks_html_tags(cl_train, "cont_title")
res_train  = remove_hyperlinks_html_tags(res_train, "cont_title")

In [8]:
#Creating a list of cont_title, titles and url_routes
cont = all_article["cont_title"].tolist() + cp_train["cont_title"].tolist() + cl_train["cont_title"].tolist() + res_train["cont_title"].tolist()
all_title = all_article["contentTitle"].tolist() + cp_train["h1"].tolist() + cl_train["title"].tolist() + res_train["title"].tolist()
url_route = all_article["urlRoute"].tolist() + cp_train["urlRoute"].tolist() + cl_train["urlRoute"].tolist() + res_train["urlRoute"].tolist()

#Getting all document ids
art_train_ids = [id_["$oid"] for id_ in art_train["_id"].tolist()]
art_test_ids = [id_["$oid"] for id_ in art_test["_id"].tolist()]
cp_train_ids = [id_["$oid"] for id_ in cp_train["_id"].tolist()]
cl_train_ids = [id_["$oid"] for id_ in cl_train["_id"].tolist()]
res_train_ids = [id_["$oid"] for id_ in res_train["_id"].tolist()]
all_ids = art_train_ids + art_test_ids + cp_train_ids + cl_train_ids + res_train_ids

#We are concatenating category and url column for loading link data and avoiding any duplicates 
all_article["cat_url"] = all_article["category"]+ "|" + all_article["urlRoute"] 
cp_train["cat_url"] = "careers|" + cp_train["urlRoute"]
cl_train["cat_url"] = "career-advice|" + cl_train["urlRoute"]
res_train["cat_url"] = res_train["category"]+ "|" + res_train["urlRoute"] 

#Creating a list of all category url data
cat_url = all_article["cat_url"].tolist() + cp_train["cat_url"].tolist() + cl_train["cat_url"].tolist() + res_train["cat_url"].tolist()

#Creating respective dictionary required for processing the link data
url2i_lookup = {url:i for i, url in enumerate(cat_url)}
id2index = {id:i for i, id in enumerate(all_ids)}
id2index_rev = {i:id for i, id in enumerate(all_ids)}
id2title = {id:title for id, title in zip(all_ids, all_title)}

In [9]:
assert len(set(all_ids)) == len(set(cat_url)) == len(set(url2i_lookup)) == len(set(id2title))

In [10]:
print("# of related articles:", len(rltd_arts))

# of related articles: 14862


Perform Sentence transformer encoding
--

In [12]:
embedding_model = "msmarco-MiniLM-L-12-v3"
device = "cuda" if torch.cuda.is_available else "cpu"
sent_embeddings = Sentence_Encoder(embedding_model).embed_documents(cont, device=device, verbose=True)

Load Link Data
--

In [13]:
all_files = [f'{path}Pageview_matrix_20210511.csv']
print(f"Link files loaded to the system are:\n {all_files[0]}")

linkdf = pd.read_csv(all_files[0], index_col=None, header=0)
link1 = linkdf["link_1"].apply(lambda x: "/".join(x.split("/")[-2:]))
link2 = linkdf["link_2"].apply(lambda x: "/".join(x.split("/")[-2:]))
visit = linkdf["visitor_count"]

linkdf = pd.DataFrame(link1)
linkdf["link_2"] = link2
linkdf["visitor_count"] = visit

Link files loaded to the system are:
 /content/gdrive/MyDrive/Career Guide Public Content/Pageview_matrix_20210511.csv


In [14]:
def link_csv(df, type="ids"):
    """
    Transforming links in the link dataframe to title names
    """
    links1 = []
    links2 = []    
    link_dict = defaultdict(list)
    for i, (l1, l2, v) in enumerate(zip(df["link_1"], df["link_2"], df["visitor_count"])):
        if i % 1 == 0:
            print(f"Number of records processed : {i+1}", end="\r")  
        l1 = "|".join(l1.split("/")[-2:])
        l2 = "|".join(l2.split("/")[-2:])
        if l1 in cat_url and l2 in cat_url:
            index = url2i_lookup[l1]
            link_dict["link1"].append(id2index_rev[index]) if type == "ids" else link_dict["link1"].append(id2title[id2index_rev[index]])

            index = url2i_lookup[l2]            
            link_dict["link2"].append(id2index_rev[index]) if type == "ids" else link_dict["link2"].append(id2title[id2index_rev[index]])
            link_dict["visit"].append(int(v))                    
        else:
            continue
    return link_dict

In [15]:
print("Load Link file ids:\n")
final_dict = link_csv(linkdf, "ids")

Load Link file ids:



In [16]:
print("Load Link file titles:\n")
final_dict_ = link_csv(linkdf, "title")

Load Link file titles:



Link dataframe by id's
--

In [17]:
link_df = pd.DataFrame.from_dict(final_dict)
link_df

Unnamed: 0,link1,link2,visit
0,5ceeecc3f6610e04a9091b36,5ceeecc3f6610e04a9091a90,2083
1,5ceeecc3f6610e04a9091b1f,5ceeecc3f6610e04a9091ad8,428
2,5ceeecc3f6610e04a9091b36,5ceeecc3f6610e04a9091ac2,322
3,5ceeecc3f6610e04a9091b1f,5ceeecc3f6610e04a9091a90,307
4,5ceeecc3f6610e04a9091b1f,5ceeecc3f6610e04a9091ac2,295
...,...,...,...
190681,5ceeecc3f6610e04a9091a6a,5e68fba1ca80a8004009e2e7,2
190682,60944b81bd2003003b6cbac1,6094460cbd2003003b6cb96c,2
190683,60944783bd2003003b6cb9d6,6094475b01df0700449da41f,2
190684,5e0fbce6faa4a8dd50bf3ca9,5e0fbbb2faa4a8dd50bf3990,2


Link dataframe by titles
--

In [18]:
link_df_titles = pd.DataFrame.from_dict(final_dict_)
link_df_titles

Unnamed: 0,link1,link2,visit
0,List of Weaknesses: 10 Things To Say in an Interview,39 Strengths and Weaknesses to Discuss in a Job Interview,2083
1,125 Common Interview Questions and Answers (With Tips),21 Job Interview Tips: How To Make a Great Impression,428
2,List of Weaknesses: 10 Things To Say in an Interview,"How to Answer ""Tell Me About Yourself"" (Tips and Example Answers)",322
3,125 Common Interview Questions and Answers (With Tips),39 Strengths and Weaknesses to Discuss in a Job Interview,307
4,125 Common Interview Questions and Answers (With Tips),"How to Answer ""Tell Me About Yourself"" (Tips and Example Answers)",295
...,...,...,...
190681,"Interview Question: ""What are You Passionate About?""",10 Closing Statements to Use After an Interview,2
190682,How to Get an Undergraduate Law Internship,8 Steps To Write a Daily Construction Report (With Tips),2
190683,How To Become Construction Estimator in 3 Steps,How To Become an Art Therapist (With 7 Steps),2
190684,How To Write an Application Letter (With Examples),Self-Introduction Tips and Tricks (with Examples),2


Perform Cosine similarity for Sentence transformer neural model embeddings
--

In [19]:
def calc_sim(embeddings):
    """
    Calculate Cosine Similarity for the embeddings recieved
    """
    cosine_scores = cosine_similarity(embeddings, embeddings)
    return cosine_scores

sent_cosine_scores = calc_sim(sent_embeddings)

Perform TF-IDF and calculate Cosine similarity for TF-IDF embeddings
--

In [20]:
tfidf = TfidfVectorizer()
vecs = tfidf.fit_transform(cont)
tfidf_cosine_score = calc_sim(vecs)

Generate Link data using function
--

In [21]:
def generate_link_data(title, all_title, filter_cond, df, lookup1, lookup2):
    """
    Generate link data in matrix for vectorization
    """    
    link_data = np.zeros((len(title),len(all_title)))
    print(link_data.shape)
    df = df[df["visit"]>filter_cond]
    for i, (s1, s2, v) in enumerate(zip(df["link1"], df["link2"], df["visit"])):
        if i % 1 == 0:
            print(f"Number of records processed : {i+1}", end="\r")  
        link_data[lookup1[s1]][lookup2[s2]]=1
    return link_data

Function for computing the ratios
--

In [22]:
def compute_ratios(data, cosine_scores):
    """
    Compute Visit to non-visit ratios
    """
    visit = cosine_scores * data
    visit_avg = np.sum(visit)/np.sum(data)
    data_inv = 1 - data
    non_visit = cosine_scores * data_inv
    non_visit_avg = np.sum(non_visit)/np.sum(data_inv)

    return visit_avg/non_visit_avg

Randomly allocating article titles to train and test dataset
--

In [23]:
np.random.seed(seed=42)

np.random.shuffle(all_ids)

train = all_ids[:int(len(all_ids)*0.8)]
test = all_ids[int(len(all_ids)*0.8):]

print(f"Number of ids allocated to train: {len(train)}")
print(f"Number of ids allocated to test: {len(test)}")

Number of ids allocated to train: 13024
Number of ids allocated to test: 3257


Checking the overlapping articles with Link data as Link data does not have the complete set of articles
--

In [24]:
train_links = set(link_df["link1"]) & set(train)
test_links = set(link_df["link1"]) & set(test)

print(f"Number of train title in link dataframe: {len(train_links)}")
print(f"Number of test title in link dataframe: {len(test_links)}")

Number of train title in link dataframe: 2975
Number of test title in link dataframe: 707


In [25]:
assert len(train_links) + len(test_links) == len(set(link_df["link1"]) & set(train)) + len(set(link_df["link1"]) & set(test))

Allocating Cosine Similarity and TF-IDF scores according to Train dataset
--

In [73]:
tcs = np.array([id2index[t]for t in train])
print(f"Train ids : {tcs}")
assert tcs.shape[0] == len(train)

train_td_cs = np_rep[tcs]
print(f"Train Topic distribution cosine scores shape: {train_td_cs.shape}")
assert tcs.shape[0] == train_td_cs.shape[0]

train_cs = sent_cosine_scores[tcs]
print(f"Train Sentence Transformer cosine scores shape: {train_cs.shape}")
assert tcs.shape[0] == train_cs.shape[0]

train_tf = tfidf_cosine_score[tcs]
print(f"Train TF-IDF scores shape: {train_tf.shape}")
assert tcs.shape[0] == train_cs.shape[0]

Train ids : [13633  1921 12140 ... 15436 15768  2376]
Train Topic distribution cosine scores shape: (13024, 16281)
Train Sentence Transformer cosine scores shape: (13024, 16281)
Train TF-IDF scores shape: (13024, 16281)


Allocating Cosine Similarity and TF-IDF scores according to Test dataset
--

In [74]:
tcs = np.array([id2index[t]for t in test])
print(f"Test ids : {tcs}")
assert tcs.shape[0] == len(test)

test_td_cs = np_rep[tcs]
print(f"Test Topic distribution cosine scores shape: {test_td_cs.shape}")
assert tcs.shape[0] == test_td_cs.shape[0]

test_cs = sent_cosine_scores[tcs]
print(f"Test cosine scores shape: {test_cs.shape}")
assert tcs.shape[0] == test_cs.shape[0]

test_tf = tfidf_cosine_score[tcs]
print(f"Test TF-IDF scores shape: {test_tf.shape}")
assert tcs.shape[0] == test_tf.shape[0]

Test ids : [ 8551  3839 15427 ...   860 15795  7270]
Test Topic distribution cosine scores shape: (3257, 16281)
Test cosine scores shape: (3257, 16281)
Test TF-IDF scores shape: (3257, 16281)


Create Train and Test dictionary to view the visitor count against each document id
--

In [29]:
def create_train_n_test_dictionary(links):
    dict_ = defaultdict(list)    
    for i in links:
        l2 = link_df[link_df["link1"]==i]["link2"]
        visit = link_df[link_df["link1"]==i]["visit"]
        for j, (l2, v) in enumerate(zip(l2, visit)):
            dict_["link1"].append(i)
            dict_["link2"].append(l2)
            dict_["visit"].append(v)   
    return dict_

In [30]:
dict_ = create_train_n_test_dictionary(train_links)
train_df = pd.DataFrame.from_dict(dict_)
train_df

Unnamed: 0,link1,link2,visit
0,5df11e27ac70c8d478e7dc3b,5df11e28ac70c8d478e7dc81,2
1,5df11e27ac70c8d478e7dc3b,5df11e26ac70c8d478e7dbb9,2
2,5df11e27ac70c8d478e7dc3b,5df11e28ac70c8d478e7dc4c,2
3,5df11e27ac70c8d478e7dc3b,5df11e4aac70c8d478e7dd36,2
4,5e0fbc61faa4a8dd50bf3b5c,5ceeecc3f6610e04a9091b13,18
...,...,...,...
156779,601c06510f45990044676115,5df11e27ac70c8d478e7dc3e,2
156780,601c06510f45990044676115,5df11e28ac70c8d478e7dc90,2
156781,601c06510f45990044676115,5e971a77818be7003da8948a,2
156782,601c06510f45990044676115,5fa339d602425d004adbbcd0,2


In [31]:
dict_ = create_train_n_test_dictionary(test_links)
test_df = pd.DataFrame.from_dict(dict_)
test_df

Unnamed: 0,link1,link2,visit
0,5e0fbd28faa4a8dd50bf3d53,5d9cf3ba6ad6fb21399ac7de,2
1,5ceeeaa02102d07b1f75da0e,5ceeea9f2102d07b1f75d9e1,5
2,5ceeeaa02102d07b1f75da0e,5ceeeaa02102d07b1f75d9f0,4
3,5ceeeaa02102d07b1f75da0e,5ceeeaa22102d07b1f75da97,4
4,5ceeeaa02102d07b1f75da0e,5ceeeaa02102d07b1f75d9fd,3
...,...,...,...
33897,60944c8997a70a004864f510,609445a04f2070004708c049,2
33898,60944c8997a70a004864f510,609448cd97a70a004864f43a,2
33899,60944c8997a70a004864f510,609449c401df0700449da4c4,2
33900,60944c8997a70a004864f510,60944a1e4b7dd8004874dfb5,2


In [32]:
assert set(train_df["link1"]) != set(test_df["link1"])

Computing averge visit to non-visit ratio for Train dataset
--

In [34]:
i=5
train_lookup = {id:i for i, id in enumerate(train)}

train_link_data = generate_link_data(train, all_ids, i, train_df, train_lookup, id2index)
print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Sentence Transformers: {compute_ratios(train_link_data, train_cs)}
TF-IDF: {compute_ratios(train_link_data, train_tf)}
""")

(13024, 16281)


filter condition: visit count > 5
Ratio of visit to non visit:
Sentence Transformers: 2.395377987870224
TF-IDF: 2.331400598354878



Computing averge visit to non-visit ratio for Test dataset
--

In [35]:
i=5
test_lookup = {id:i for i, id in enumerate(test)}

test_link_data = generate_link_data(test, all_ids, i, test_df, test_lookup, id2index)
#Gensim LDA topic modelling: {compute_ratios(test_link_data, test_td_cs)}
print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Sentence Transformers: {compute_ratios(test_link_data, test_cs)}
TF-IDF: {compute_ratios(test_link_data, test_tf)}
""")

(3257, 16281)
Number of records processed : 1365

filter condition: visit count > 5
Ratio of visit to non visit:
Sentence Transformers: 2.3925465744957153
TF-IDF: 2.3483899753857873



**As expected, Sentence Transformers embeddings are more semantically coherent than TF-IDF. But the TF-IDF scores are high as well seems to be something wrong with the link data, therefore have to look for visitor count less than where we could find bad examples.**

Some of the examples for visitor count less than 5 having urelated article link pairs
--

In [36]:
link_df_titles[link_df_titles["visit"]<5].iloc[6:].head(30)

Unnamed: 0,link1,link2,visit
8600,Digital Marketing Specialist,Supply Chain Manager,4
8601,The Five Step Nursing Process,How To Write an Action Plan To Achieve Your Goals,4
8602,5 Common Interview Questions About Conflict (With Example Answers),8 Team-Building Activities for Improving Communication (With Tips),4
8603,How to Answer “What Motivates You?” (With Examples),6 Examples of Critical Thinking Skills,4
8604,12 Tough Interview Questions and Answers,4 Ways To Improve Organizational Climate,4
8605,31 Tips for Working in a Call Center,Learn About Being an Accounting Assistant,4
8606,Leadership Skills: Definitions and Examples,"Functional Resume: Definition, Tips and Examples",4
8607,Journeyman Electrician,Quality Assurance Analyst,4
8608,10 Key Areas of Development for Employees (with Examples and Tips),How to Write a Self-Appraisal,4
8609,Medical Assistant Vs. Nurse: Differences and Similarities,9 Ways To Improve Your Personal Development Skills,4


Evaluations of Representations of New Articles with Reduced Dimensionality used for clustering
--

In [37]:
#Using neighbors=15 and n_components=2
np.random.seed(seed=42)
umap_model = umap.UMAP(n_neighbors=15,
                       n_components=2,
                       metric='cosine')

umap_data = umap_model.fit_transform(sent_embeddings)

In [38]:
umap_data.shape

(16281, 2)

In [39]:
test_ids = np.array([id2index[t] for t in test])
train_ids = np.array([id2index[t] for t in train])
train_embeddings = umap_data[train_ids]
test_embeddings = umap_data[test_ids]

In [40]:
train_embeddings.shape, test_embeddings.shape

((13024, 2), (3257, 2))

In [41]:
umap_scores = calc_sim(umap_data)

In [42]:
train_cosine_scores = umap_scores[train_ids, :]
test_cosine_scores = umap_scores[test_ids, :]

In [43]:
train_cosine_scores.shape, test_cosine_scores.shape 

((13024, 16281), (3257, 16281))

In [79]:
i=5
train_lookup = {id:i for i, id in enumerate(train)}

train_link_data = generate_link_data(train, all_ids, i, train_df, train_lookup, id2index)

print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Sentence Transformers: {compute_ratios(train_link_data, train_cosine_scores)}
TF-IDF: {compute_ratios(train_link_data, train_tf)}
""")

(13024, 16281)


filter condition: visit count > 5
Ratio of visit to non visit:
Sentence Transformers: 2.747890680721183
TF-IDF: 2.331400598354878



In [80]:
i=5
test_lookup = {id:i for i, id in enumerate(test)}

test_link_data = generate_link_data(test, all_ids, i, test_df, test_lookup, id2index)
print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Sentence Transformers: {compute_ratios(test_link_data, test_cosine_scores)}
TF-IDF: {compute_ratios(test_link_data, test_tf)}
""")

(3257, 16281)
Number of records processed : 1365

filter condition: visit count > 5
Ratio of visit to non visit:
Sentence Transformers: 2.557969728604866
TF-IDF: 2.3483899753857873



**Sentence transformers embeddings are doing comparitively better than TF-IDF embeddings and sentence transformers embeddings even after dimensionality reduction performs better  as UMAP seems to maintain the local structure of the embeddings at reduces dimension.**

Gensim Lda Topic Distribution Embedding
--

In [46]:
f = open(f"{path}article_20210503_to_20210525.json", encoding="utf-8")
test_train = []
for line in f:
    test_train.append(json.loads(line))
test_train = pd.DataFrame(pd.DataFrame(test_train))

In [47]:
rltd_arts, art_train = get_corpus(art_train, "content")
cl_train = remove_hyperlinks_html_tags(cl_train, "contentA")
res_train  = remove_hyperlinks_html_tags(res_train, "contentA")

In [48]:
cp_train  = remove_hyperlinks_html_tags(cp_train, "primaryContent")

In [49]:
from gensim.utils import simple_preprocess
def strip_newline(series):
    """
    stripping out new line
    """                    
    return [review.replace('\n','') for review in series]
def sent_to_words(sentences):
    """
    Converting sentence to words
    """                
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    """
    Used for removing stopwords
    """            
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    """
    Find the bigrams in the content of the article
    """        
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def get_bigram(df):
    if 'content' in df:
        df_content = strip_newline(df.content)
    elif 'contentA' in df:
        df_content = df['contentA'] + ' ' + df['contentB']
    elif 'primaryContent' in df:
        df_content = df['primaryContent']
    words = list(sent_to_words(df_content))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram

In [50]:
art_bigram = get_bigram(art_train)
cp_bigram = get_bigram(cp_train)
cl_bigram = get_bigram(cl_train)
res_bigram = get_bigram(res_train)

In [51]:
# prediction on new articles
test_art_bigram = get_bigram(test_train)

### Gensim LDA model

In [52]:
from gensim.corpora.dictionary import Dictionary
train_dict = Dictionary(art_bigram)
train_corpus = [train_dict.doc2bow(text) for text in art_bigram]
lda_30 = gensim.models.LdaModel(train_corpus,
                            num_topics=30,
                            id2word=train_dict,
                            chunksize=100,
                            random_state=10,
                            passes=20,
                            alpha=0.01)

In [53]:
cp_corpus = [train_dict.doc2bow(text) for text in cp_bigram]
cl_corpus = [train_dict.doc2bow(text) for text in cl_bigram]
res_corpus = [train_dict.doc2bow(text) for text in res_bigram]

In [54]:
# prediction on new articles
test_corpus = [train_dict.doc2bow(text) for text in test_art_bigram]

### Embedding

In [55]:
urlRoute_set = set()
urlRoute_list = []
for urlRoute in art_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in cp_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in cl_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in res_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)
for urlRoute in test_train['urlRoute'].tolist():
    urlRoute_set.add(urlRoute)
    urlRoute_list.append(urlRoute)

We used the best model from above to generate the document distributions over the 30 topics. Frist thing we need to do is to pad those topics with 0 probability, so that the matrix we come up with will have consistant dimension.

In [56]:
# Padding the document distributions over all the (30) topics
def padding_matrix(model, train_corpus):
    padding_dist_matrix = []
    for i, dist in enumerate(model.get_document_topics(train_corpus)):
        padding_dist_row = []
        if len(dist) < 30:
            dist_id = [topic_id for topic_id, prob in dist]
            dist_id_prob_dict = {topic_id:prob for topic_id, prob in dist}
            for i in range(30):
                if i not in dist_id:
                    padding_dist_row.append(0)
                elif i in dist_id:
                    padding_dist_row.append(dist_id_prob_dict[i])
            padding_dist_matrix.append(padding_dist_row)
        else:
            prob_list = [prob for topic_id, prob in dist]
            padding_dist_matrix.append(prob_list)
    return padding_dist_matrix

In [57]:
art_padding_matrix = padding_matrix(lda_30, train_corpus)
cp_padding_matrix = padding_matrix(lda_30, cp_corpus)
cl_padding_matrix = padding_matrix(lda_30, cl_corpus)
res_padding_matrix = padding_matrix(lda_30, res_corpus)

In [58]:
# prediction on new articles
test_padding_matrix = padding_matrix(lda_30, test_corpus)

In [59]:
general_urlRoute = [art_train['urlRoute'].tolist(), cp_train['urlRoute'].tolist(), cl_train['urlRoute'].tolist(), res_train['urlRoute'].tolist(), test_train['urlRoute'].tolist()]
general_embedding_dict = [art_padding_matrix, cp_padding_matrix, cl_padding_matrix, res_padding_matrix, test_padding_matrix]

In [60]:
urlRoute_embedding_dict = defaultdict(list)
count = 0
for i, matrix in enumerate(general_embedding_dict):
    for j, doc_embedding in enumerate(matrix):
        urlRoute_embedding_dict[general_urlRoute[i][j]].append(general_embedding_dict[i][j])

In [61]:
from tqdm import tqdm
import sklearn

In [62]:
urlRoute_embeddings = defaultdict(list)
count = 0
for urlRoute in urlRoute_list:
    
    if len(urlRoute_embedding_dict[urlRoute]) == 1:
        urlRoute_embeddings[urlRoute] = urlRoute_embedding_dict[urlRoute]
        count += 1
    elif len(urlRoute_embedding_dict[urlRoute]) == 2:
        continue

In [63]:
embedding_rep = []
for embeddings in urlRoute_embedding_dict:
    row_embedding = []
    for embedding in urlRoute_embedding_dict[embeddings]:
        row_embedding.append(embedding)
    embedding_rep.extend(row_embedding)

In [64]:
embedding_rep = np.array(embedding_rep)
embedding_rep.shape

(16281, 30)

### Cosine similarity scores for topic distribution embeddings

In [65]:
np_rep = np.zeros([len(embedding_rep), len(embedding_rep)]) 
# numpy vector representation on cos_sim
for i, row in tqdm(enumerate(np_rep)):

    sent1 = embedding_rep[i]
    sent2_list = embedding_rep
    cosine_sim = sklearn.metrics.pairwise.cosine_similarity([sent1], sent2_list)
    np_rep[i] = cosine_sim[0]

16281it [00:49, 330.29it/s]


In [66]:
np_rep

array([[1.        , 0.08913597, 0.20630425, ..., 0.09824708, 0.32353552,
        0.81087554],
       [0.08913597, 1.        , 0.01117008, ..., 0.06316776, 0.22067469,
        0.13377808],
       [0.20630425, 0.01117008, 1.        , ..., 0.02399454, 0.34985838,
        0.08831937],
       ...,
       [0.09824708, 0.06316776, 0.02399454, ..., 1.        , 0.10508609,
        0.21629528],
       [0.32353552, 0.22067469, 0.34985838, ..., 0.10508609, 1.        ,
        0.45623018],
       [0.81087554, 0.13377808, 0.08831937, ..., 0.21629528, 0.45623018,
        1.        ]])

In [75]:
i=5
train_lookup = {id:i for i, id in enumerate(train)}

train_link_data = generate_link_data(train, all_ids, i, train_df, train_lookup, id2index)

print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
Gensim LDA topic modelling: {compute_ratios(train_link_data, train_td_cs)}
TF-IDF: {compute_ratios(train_link_data, train_tf)}
""")

(13024, 16281)


filter condition: visit count > 5
Ratio of visit to non visit:
Gensim LDA topic modelling: 1.2146355850895947
TF-IDF: 2.331400598354878



In [76]:
i=5
test_lookup = {id:i for i, id in enumerate(test)}

test_link_data = generate_link_data(test, all_ids, i, test_df, test_lookup, id2index)
print(f"""
\nfilter condition: visit count > {i}
Ratio of visit to non visit:
#Gensim LDA topic modelling: {compute_ratios(test_link_data, test_td_cs)}
TF-IDF: {compute_ratios(test_link_data, test_tf)}
""")

(3257, 16281)
Number of records processed : 1365

filter condition: visit count > 5
Ratio of visit to non visit:
#Gensim LDA topic modelling: 1.3232963309402246
TF-IDF: 2.3483899753857873

