In [24]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import FastText
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from huggingface_hub import hf_hub_download
from gensim.models import KeyedVectors
import fasttext
import nltk

import requests
import os
import dotenv
import base64
import json
import faiss

from app.text_preprocessing import preprocessing

nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Getting Input Text

In this project, we are using careers recommended from O*NET's API as the "query" to search for similar jobs, courses, and university programs. Thus, to evaluate the performance of each method, we will fetch and get the top 1 career recommendation from an arbitary RIASEC scores, and then map the similarity against jobs, courses, and university programs.

In [4]:
def recommend_careers(r:int, i:int, a:int, s:int, e:int, c:int, top_n:int = 3) -> dict:    
    url = f"https://services.onetcenter.org/ws/mnm/interestprofiler/careers?Realistic={r}&Investigative={i}&Artistic={a}&Social={s}&Enterprising={e}&Conventional={c}"
    
    dotenv.load_dotenv()
    headers={'User-Agent': 'python-OnetWebService/1.00 (bot)',
            'Authorization': 'Basic ' + base64.standard_b64encode((os.getenv('ONET_USERNAME') + ':' + os.getenv('ONET_PASSWORD')).encode()).decode(),
            'Accept': 'application/json' }
    
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        raise Exception(f"Error fetching data from O*NET: {r.status_code} - {r.text}")
    
    data = r.json()
    
    top_n_careers = []
    
    for career in data['career'][:top_n]:
        report_url = f"https://services.onetcenter.org/ws/mnm/careers/{career['code']}/report"
        r = requests.get(report_url, headers=headers)
        if r.status_code != 200:
            raise Exception(f"Error fetching career report: {r.status_code} - {r.text}")
        print("Successfully fetched career report for:", career['code'])
        report_data = r.json()
        code = report_data.get('career', {}).get('code', '')
        title = report_data.get('career', {}).get('title', '')
        also_called = ", ".join(report_data.get('career', {}).get('also_called', {}).get('title', []))
        what_they_do = report_data.get('career', {}).get('what_they_do', '')
        on_the_job = ", ".join(report_data.get('career', {}).get('on_the_job', {}).get('task', []))

        
        c_knowledges = []
        knowledges = report_data.get('knowledge', {}).get('group', [])

        for knowledge in knowledges:
            c_knowledges.append(knowledge['title']['name'])
            elements = knowledge['element']
            for element in elements:
                c_knowledges.append(element['name'])
        
        c_abilities = []
        abilities = report_data.get('abilities', {}).get('group', [])

        for ability in abilities:
            c_abilities.append(ability['title']['name'])
            elements = ability['element']
            for element in elements:
                c_abilities.append(element['name'])
                
        c_skills = []
        skills = report_data.get('skills', {}).get('group', [])

        for skill in skills:
            c_skills.append(skill['title']['name'])
            elements = skill['element']
            for element in elements:
                c_skills.append(element['name'])
                
        c_technologies = []
        technologies = report_data.get('technology', {}).get('category', [])

        for tech in technologies:
            c_technologies.append(tech['title']['name'])
            examples = tech['example']
            for ex in examples:
                c_technologies.append(ex['name'])

        outlook = report_data.get('job_outlook', {}).get('outlook', '')

        title_preprocessed = preprocessing(title)
        also_called_preprocessed = preprocessing(also_called)
        what_they_do_preprocessed = preprocessing(what_they_do)
        on_the_job_preprocessed = preprocessing(on_the_job)
        preprocessed_text = title_preprocessed + " " + also_called_preprocessed + " " + what_they_do_preprocessed + " " + on_the_job_preprocessed
        
        top_n_careers.append({
            'code': code,
            'title': title,
            'also_called': also_called,
            'what_they_do': what_they_do,
            'on_the_job': on_the_job,
            'knowledges': c_knowledges,
            'skills': c_skills,
            'abilities': c_abilities,
            'outlook': outlook,
            'preprocessed_text': preprocessed_text,
        })
        
    return top_n_careers

In [5]:
# Random input for RIASEC
career = recommend_careers(r=5, i=15, a=0, s=0, e=0, c=15, top_n=1)
career = career[0]

Successfully fetched career report for: 15-2011.00


In [6]:
career

{'code': '15-2011.00',
 'title': 'Actuaries',
 'also_called': 'Actuarial Analyst, Actuary, Consulting Actuary, Pricing Actuary',
 'what_they_do': 'Analyze statistical data, such as mortality, accident, sickness, disability, and retirement rates and construct probability tables to forecast risk and liability for payment of future benefits. May ascertain insurance rates required and cash reserves necessary to ensure payment of future benefits.',
 'on_the_job': 'Ascertain premium rates required and cash reserves and liabilities necessary to ensure payment of future benefits., Collaborate with programmers, underwriters, accounts, claims experts, and senior management to help companies develop plans for new lines of business or improvements to existing business., Analyze statistical information to estimate mortality, accident, sickness, disability, and retirement rates.',
 'knowledges': ['Math and Science',
  'arithmetic, algebra, geometry, calculus, or statistics',
  'Engineering and Techn

In [25]:
# BoW
def bow_embeddings(corpus):
    count_vectorizer = CountVectorizer()
    embeddings_bow = count_vectorizer.fit_transform(corpus)
    return embeddings_bow, count_vectorizer

# TF-IDF
def tfidf_embeddings(corpus):
    tfidf_vectorizer = TfidfVectorizer()
    embeddings_tfidf = tfidf_vectorizer.fit_transform(corpus)
    return embeddings_tfidf, tfidf_vectorizer

# Word2Vec
def word2vec_model(corpus, save_path):
    kv = KeyedVectors.load_word2vec_format("wiki-news-300d-1M.vec")
    
    # only retrieving necessary tokens, to save memory
    all_tokens = set()
    
    for text in corpus:
        tokens = nltk.word_tokenize(text)
        all_tokens.update(tokens)
        
    filtered_tokens = [token for token in all_tokens if token in kv]
    
    kv_subset = KeyedVectors(vector_size=kv.vector_size)
    kv_subset.add_vectors(filtered_tokens, [kv[token] for token in filtered_tokens])
    
    kv.save(save_path)
    
    return kv_subset
    
# FastText
def fasttext_model(corpus, save_path):
    model = FastText(corpus, vector_size=100, window=5, min_count=1, workers=-1)
    model.wv.save(save_path)
    
    return model.wv

    # Wanted to use this, but not enough RAM
    
    # model_path = hf_hub_download(repo_id="facebook/fasttext-en-vectors", filename="model.bin")
    # ft_model = fasttext.load_model(model_path)
    
    # vector_size = ft_model.get_dimension()
    # kv = KeyedVectors(vector_size=vector_size)
    
    # all_tokens = set()
    
    # for text in corpus:
    #     tokens = nltk.word_tokenize(text)
    #     all_tokens.update(tokens)

    # word_vectors = []
    # words_in_model = []
    # for token in all_tokens:
    #     try:
    #         word_vectors.append(ft_model.get_word_vector(token))
    #         words_in_model.append(token)
    #     except:
    #         pass

    # kv.add_vectors(words_in_model, word_vectors)
    # return kv

# Sentence Transformers
def sentence_transformers_embeddings(corpus, save_model_path, save_index_path):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    model.save(save_model_path)
    embeddings = model.encode(corpus)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    faiss.write_index(index, save_index_path)
    return embeddings, model

We will simulate user's input on major, interests, and skills

In [None]:
# Because the result is actuary, let's simulate that the user's major is related to finance and interested in finance & math

major = "Actuarial Science"
interests = "Financial risk analysis\nInsurance modeling\nMathematical modeling in finance\nForecasting and simulation\nPension & retirement planning\nData-driven decision making"
skills = "Statistical analysis\nRisk assessment\nFinancial modeling\nData interpretation\nProblem-solving\nAttention to detail"

major = preprocessing(major)
interests = preprocessing(interests)
skills = preprocessing(skills)

text = f"{career['preprocessed_text']} {major} {interests} {skills}"

In [20]:
text

# This is the text that we will use to find similar jobs and courses

'actuary actuarial analyst actuary consulting actuary pricing actuary analyze statistical data mortality accident sickness disability retirement rate construct probability table forecast risk liability payment future benefit may ascertain insurance rate required cash reserve necessary ensure payment future benefit ascertain premium rate required cash reserve liability necessary ensure payment future benefit collaborate programmer underwriter account claim expert senior management help company develop plan new line business improvement existing business analyze statistical information estimate mortality accident sickness disability retirement rate actuarial science financial risk analysis insurance modeling mathematical modeling finance forecasting simulation pension retirement planning datadriven decision making statistical analysis risk assessment financial modeling data interpretation problemsolving attention detail'

In [10]:
with open("preprocessed/edx_courses.json", 'r', encoding='utf-8') as f:
        courses_data= json.load(f)
        
with open("preprocessed/linkedin_jobs.json", 'r', encoding='utf-8') as f:
        jobs_data= json.load(f)
        
courses_text = [course['text'] for course in courses_data]
jobs_text = [job['text'] for job in jobs_data]

# BoW x Cosine

In [11]:
corpus_embeddings, vectorizer = bow_embeddings(courses_text)
# to reverse the cos similarity value, so that 0 is the most similar
dist = 1-cosine_similarity(vectorizer.transform([text]), corpus_embeddings).flatten()
bow_cos_dist = pd.DataFrame(courses_data).iloc[np.argsort(dist), :]
bow_cos_dist.loc[:,['cos_dist']] = np.sort(dist)
bow_cos_dist[:11]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,0.686791
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.704315
486,Statistical Inference and Modeling for High-th...,[Harvard University],A focus on the techniques commonly used to per...,\nOrganizing high throughput data\nMultiple co...,In this course you’ll learn various statistics...,[Available now],"[Data Analysis & Statistics, Biology & Life Sc...",[Intermediate],[English],Course,"[XSeries, Professional Certificate]","[michael-love, rafael-irizarry]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Hindi, Telugu, Indonesian, Swahili, P...",915,https://www.edx.org/learn/statistics/harvard-u...,4.0,"[Life Sciences, Matrix Algebra, Exploratory Da...",statistical inference modeling highthroughput ...,0.708139
1282,Essential quantitative business skills,[Tecnológico de Monterrey],Gain a solid foundation of essential quantitat...,"In today's work environment, it is vital for p...",You will develop your capacity for quantitativ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Program,[Professional Certificate],"[ivan-adolfo-valdovinos-hernandez, jose-antoni...",[],[],909,https://www.edx.org/certificates/professional-...,,"[Financial Modeling, Data Science, Finance, Fi...",essential quantitative business skill today wo...,0.710327
415,Risk Management and Credit Principles,[New York Institute of Finance],"Get introduced to risk management, credit prin...",\nIntroduction to Risk Management and Credit P...,This course is Part 1 of the Credit and Credit...,[Available now],[Economics & Finance],[Advanced],[English],Course,"[Professional Certificate, Professional Certif...",[tracy-williams],"[Arabic, English, Spanish (Latin America), Ind...","[German, Spanish, Swahili, Portuguese - Brazil...",1137,https://www.edx.org/learn/risk-management/new-...,6.0,"[Credit Risk, Credit Analysis, Loss Given Defa...",risk management credit principle introduction ...,0.740077
752,Introduction to Data Science and Basic Statist...,[Tecnológico de Monterrey],In this course you will acquire statistical me...,"Through this course, participants will be able...",This course allows you to develop skills of a ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Course,[Professional Certificate],[jose-antonio-nunez-mora],"[Arabic, English, Spanish (Latin America), Ind...",[],517,https://www.edx.org/learn/data-analysis/tecnol...,4.0,"[Probability Distribution, Decision Making, Da...",introduction data science basic statistic busi...,0.740102
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.740977
317,Financial Analysis for Decision Making,[Babson College],Learn how to analyze business opportunities fo...,\nTools and techniques for funding a growing b...,How do you find the money necessary to effecti...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Course,"[XSeries, Professional Certificate, Profession...",[mark-potter],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Portuguese - Brazil, Indones...",1659,https://www.edx.org/learn/financial-analysis/b...,4.0,"[Financial Analysis, Securities (Finance), Fin...",financial analysis decision making tool techni...,0.746832
383,Analytics for Decision Making,[Babson College],Discover the foundational concepts that suppor...,\nVariability in the real world and implicatio...,Want to know how to avoid bad decisions with d...,[Available now],"[Business & Management, Data Analysis & Statis...",[Introductory],[English],Course,"[XSeries, Professional Certificate, Profession...","[davit-khachatryan, nathan-karst, george-recck...","[Arabic, English, Spanish (Latin America), Ind...","[Spanish, Portuguese - Brazil, Arabic, Indones...",1268,https://www.edx.org/learn/decision-making/babs...,4.0,"[Statistical Thinking, Quantitative Models Of ...",analytics decision making variability real wor...,0.747009
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,0.747626


# TF-IDF x Cosine

In [12]:
corpus_embeddings, vectorizer = tfidf_embeddings(courses_text)
# to reverse the cos similarity value, so that 0 is the most similar
dist = 1-cosine_similarity(vectorizer.transform([text]), corpus_embeddings).flatten()
tfidf_cos_dist = pd.DataFrame(courses_data).iloc[np.argsort(dist), :]
tfidf_cos_dist.loc[:,['cos_dist']] = np.sort(dist)
tfidf_cos_dist[:11]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.652402
1292,Actuarial Science: Financial Math and Probability,[The University of Wisconsin-Madison],This certificate program consists of four cour...,This certificate program consists of four cour...,How to perform calculations relating to the pr...,[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Program,[Professional Certificate],[gordon-enderle-6],[],[],839,https://www.edx.org/certificates/professional-...,,"[Cash Flows, Mathematical Finance, Random Vari...",actuarial science financial math probability c...,0.736053
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.740282
1230,The FinTech: Future of Payments,[University of Toronto],Learn how financial innovations impact the glo...,"Over the last decade, the world has seen a tra...","Gain insight into the past, present, and futur...",[Available now],[Economics & Finance],[Introductory],[English],Program,[Professional Certificate],[andreas-park-3],[],[],1325,https://www.edx.org/certificates/professional-...,,"[Blockchain, Finance, Financial Technology (Fi...",fintech future payment last decade world seen ...,0.77866
486,Statistical Inference and Modeling for High-th...,[Harvard University],A focus on the techniques commonly used to per...,\nOrganizing high throughput data\nMultiple co...,In this course you’ll learn various statistics...,[Available now],"[Data Analysis & Statistics, Biology & Life Sc...",[Intermediate],[English],Course,"[XSeries, Professional Certificate]","[michael-love, rafael-irizarry]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Hindi, Telugu, Indonesian, Swahili, P...",915,https://www.edx.org/learn/statistics/harvard-u...,4.0,"[Life Sciences, Matrix Algebra, Exploratory Da...",statistical inference modeling highthroughput ...,0.8177
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,0.819472
938,Financial Analysis of Insurance Companies – In...,[New York Institute of Finance],"Take a deep dive into the operating practices,...",\nDescribe major industry trends and challenge...,Want to gain a solid understanding of the uniq...,[Available now],[Economics & Finance],[Advanced],[English],Course,[Professional Certificate],[jack-farmer],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, French, Hindi, Indonesian, Telugu, Po...",376,https://www.edx.org/learn/financial-analysis/n...,3.0,"[Generally Accepted Accounting Principles, Sol...",financial analysis insurance company industry ...,0.819794
1391,Financial Analysis of Insurance Companies,[New York Institute of Finance],,Financial Analysis of Insurance Companies Prof...,Describe major industry trends and challenges....,[Available now],"[Economics & Finance, Business & Management]",[Advanced],[English],Program,[Professional Certificate],[jack-farmer],[],[],410,https://www.edx.org/certificates/professional-...,,"[Financial Analysis, Casualty Insurance]",financial analysis insurance company financial...,0.829323
1153,Personal Finance,[Indiana University],The Personal Finance Professional Certificate ...,Create Your Financial Plan: The Personal Finan...,This certificate is part of the Brian D. Jelli...,[Available now],[Economics & Finance],[Introductory],[English],Program,[Professional Certificate],"[kenneth-carrow, todd-roberson]",[],[],3197,https://www.edx.org/certificates/professional-...,,"[Financial Services, Financial Literacy, Perso...",personal finance create financial plan persona...,0.832927
75,Foundations of Finance,[University of Cambridge],"This course provides a rigorous, but straightf...",\nMoney and Capital\nCash flows and Cash flow ...,"This course provides a rigorous, but straightf...","[Available now, Archived]",[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],[doug-williamson],[],[],8863,https://www.edx.org/learn/finance/university-o...,5.0,"[Corporate Finance, Financial Management, Prob...",foundation finance money capital cash flow cas...,0.844056


# Word2Vec x WMD

In [None]:
kv = word2vec_model(courses_text, save_path="app/models/w2v_courses.kv")

In [16]:
dists = []
for text_ in courses_text:
    # distance between two list of word
    dist = kv.wmdistance(nltk.word_tokenize(text_), nltk.word_tokenize(text))
    dists.append(dist)

w2v_wmd_dist = pd.DataFrame(courses_data).iloc[np.argsort(dists), :]
w2v_wmd_dist.loc[:,['wmd_dist']] = np.sort(dists)
w2v_wmd_dist[:11]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,wmd_dist
1282,Essential quantitative business skills,[Tecnológico de Monterrey],Gain a solid foundation of essential quantitat...,"In today's work environment, it is vital for p...",You will develop your capacity for quantitativ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Program,[Professional Certificate],"[ivan-adolfo-valdovinos-hernandez, jose-antoni...",[],[],909,https://www.edx.org/certificates/professional-...,,"[Financial Modeling, Data Science, Finance, Fi...",essential quantitative business skill today wo...,0.806005
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.810218
1146,Fundamentals of Financial Analysis,[Babson College],Learn the foundational accounting and finance ...,Making decisions based on financial data is es...,How primary financial statements are construct...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter]",[],[],3269,https://www.edx.org/certificates/professional-...,,"[Financial Statements, Financial Analysis, Fin...",fundamental financial analysis making decision...,0.825016
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,0.830947
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.843338
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,0.847774
108,Mathematical Methods for Quantitative Finance,[Massachusetts Institute of Technology],Learn the mathematical foundations essential f...,\nProbability distributions in finance\nTime-s...,Modern finance is the science of decision maki...,[Available now],"[Economics & Finance, Math, Engineering]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, paul-f-mende]","[Arabic, English, Spanish (Latin America), Ind...","[Portuguese - Brazil, Spanish, Arabic, Indones...",5782,https://www.edx.org/learn/finance/massachusett...,12.0,"[Planning, Financial Market, Chartered Financi...",mathematical method quantitative finance proba...,0.848932
850,Financial Statement Analysis: Company Forecast...,[Rice University],Strengthen your skills as an investor or an an...,You will be able to:\n\n\nAnalyze and effectiv...,Rice University’s online business courses offe...,[Available now],[Business & Management],[Introductory],[English],Course,[],[brian-akins],"[Arabic, English, Spanish (Latin America), Ind...","[Spanish, Portuguese - Brazil]",436,https://www.edx.org/learn/business-management/...,5.0,[],financial statement analysis company forecast ...,0.852087
1016,Financial Decision-Making for Leaders,[Babson College],Learn financial and quantitative analysis for ...,"Literacy in business financials, regardless of...",Practical skills for evaluating and forecastin...,[Available now],"[Business & Management, Economics & Finance, D...",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter, rick-cleary, natha...",[],[],5061,https://www.edx.org/certificates/professional-...,,"[Finance, Performance Metric, Financial Foreca...",financial decisionmaking leader literacy busin...,0.854624
296,Derivatives Markets: Advanced Modeling and Str...,[Massachusetts Institute of Technology],Financial derivatives are ubiquitous in global...,\nAdvanced derivatives pricing approaches adap...,Financial derivatives are ubiquitous in global...,[Available now],"[Economics & Finance, Data Analysis & Statistics]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, deborah-j-lucas]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Portuguese - Brazil, Indones...",1784,https://www.edx.org/learn/finance/massachusett...,12.0,"[Capital Markets, Commercial Banking, Hedge Fu...",derivative market advanced modeling strategy a...,0.85647


# Word2Vec x Cosine

Instead of calculating distance by word, we can try to calculate distance by sentence. We need to convert a list of vectors for each document into a single vector by averaging the vectors value.

In [17]:
def avg_vector(vectors:list, n_words:int) -> list:
    # Calculate the average vector from a list of vectors
    vector_sum = np.sum(vectors, axis=0)
    if n_words != 0:
        return np.divide(vector_sum, n_words)

In [21]:
corpus_vectors = []

# Create a list of vectors for each text in the corpus
for text_ in courses_text:
    tokens = nltk.word_tokenize(text_)
    vectors = []
    for token in tokens:
        if token in kv:
            vectors.append(kv[token])
        else:
            # if the token is not in the model, we use a vector of ones
            vectors.append(np.ones(kv.vector_size, dtype='float32'))
    corpus_vectors.append(vectors)

# Average the word vectors for each text
corpus_vectors = [avg_vector(vector, len(vector)) for vector in corpus_vectors]

text_tokens = nltk.word_tokenize(text)
text_vector = [avg_vector([kv[token] if token in kv else np.ones(kv.vector_size, dtype='float32') for token in text_tokens], len(text_tokens))]

# reverse the cos similarity value, so that 0 is the most similar
dist = 1-cosine_similarity(text_vector, corpus_vectors).flatten()
w2v_cos_dist = pd.DataFrame(courses_data).iloc[np.argsort(dist), :]
w2v_cos_dist.loc[:,['cos_dist']] = np.sort(dist)
w2v_cos_dist[:11]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
1049,Finance,[Massachusetts Institute of Technology],Accelerate your finance career. Fast-track you...,The skills and expertise required for a career...,"Comprehend the foundations of modern finance.,...","[Available now, Archived, Upcoming]","[Economics & Finance, Data Analysis & Statisti...",[Advanced],[English],Program,[MicroMasters],"[egor-matveyev, paul-f-mende, leonid-kogan, ji...",[],[],33254,https://www.edx.org/masters/micromasters/mitx-...,,"[Financial Analysis, Derivatives Markets, Fina...",finance skill expertise required career financ...,0.037091
1028,Data Science,[Harvard University],"Learn key data science essentials, including R...",The demand for skilled data science practition...,"Fundamental R programming skills,Statistical c...","[Available now, Upcoming]","[Data Analysis & Statistics, Computer Science,...",[Introductory],[English],Program,[Professional Certificate],[rafael-irizarry],[],[],125573,https://www.edx.org/certificates/professional-...,,"[Github, Data Science, Unix, Git (Version Cont...",data science demand skilled data science pract...,0.038733
1312,Certified Forecaster and Demand Planner (CFDP),[International Supply Chain Education Alliance],ISCEA’s Certified Forecaster and Demand Planne...,Demand planning has always been an essential p...,"Supply chain management essentials, inventory ...","[Available now, Archived, Upcoming]","[Business & Management, Data Analysis & Statis...",[Introductory],[English],Program,[Professional Certificate],[jorge-morales],[],[],705,https://www.edx.org/certificates/professional-...,,"[Demand Planning, Inventory Management, Sales ...",certified forecaster demand planner cfdp deman...,0.043071
1269,Data Analytics for Digital Transformation,[Dartmouth College],The Data Analytics for Digital Transformation ...,Dartmouth Engineering’s Data Analytics for Dig...,Predictive Analytics: Build and validate model...,[Available now],"[Data Analysis & Statistics, Computer Science,...","[Introductory, Intermediate]",[English],Program,[Professional Certificate],"[reed-harder, vikrant-vaze]",[],[],991,https://www.edx.org/certificates/professional-...,,"[Digital Transformation, Planning, Resource Al...",data analytics digital transformation dartmout...,0.043092
1044,Statistics and Data Science (Methods Track),[Massachusetts Institute of Technology],Master different data science methods such as ...,Data scientists bring value to organizations a...,"Master the foundations of data science, statis...","[Available now, Archived, Upcoming]","[Data Analysis & Statistics, Math, Computer Sc...","[Advanced, Intermediate]",[English],Program,[MicroMasters],"[dimitri-bertsekas, philippe-rigollet, jan-chr...",[],[],38446,https://www.edx.org/masters/micromasters/mitx-...,,"[Decision Theories, Data Science, Time Series ...",statistic data science method track data scien...,0.043323
1249,Certified Lifestyle Medicine Executive,[Doane University],Get the skills you need to be a transformative...,Health systems around the world are confrontin...,Assess how to lead healthcare organizations in...,[Available now],"[Business & Management, Medicine, Economics & ...",[Intermediate],[English],Program,[MicroMasters],"[nicholas-king, alice-kindschuh, kimberley-mei...",[],[],1147,https://www.edx.org/masters/micromasters/doane...,,"[Healthcare Delivery Models, Management, Commu...",certified lifestyle medicine executive health ...,0.04413
1223,Corporate Finance and Valuation Methods,[New York Institute of Finance],This introductory program helps students under...,Develop an understanding of how the key princi...,Fundamental of Financial Mathematics and Capit...,[Available now],[Economics & Finance],[Introductory],[English],Program,[Professional Certificate],[douglas-carroll],[],[],1393,https://www.edx.org/certificates/professional-...,,"[Mathematical Finance, Discounted Cash Flow, C...",corporate finance valuation method develop und...,0.044678
1426,Machine Learning Operations with Amazon Web Se...,[Statistics.com],Machine Learning Operations (MLOps) is the cor...,Machine Learning Operations (MLOps) lies at th...,How to create a pipeline using AWS to ingest d...,"[Available now, Archived]","[Computer Science, Data Analysis & Statistics,...",[Intermediate],[English],Program,[Professional Certificate],"[evan-wimpey, vic-diloreto, laura-lancheros, g...",[],[],330,https://www.edx.org/certificates/professional-...,,"[Data Pipeline, Data Science, Data Engineering...",machine learning operation amazon web service ...,0.046392
1033,Data Analyst,[IBM],,This eight-course Professional Certificate fro...,"Create charts and plots in Excel, and work wit...","[Available now, Archived]","[Data Analysis & Statistics, Computer Science,...","[Introductory, Intermediate, Advanced]",[English],Program,[Professional Certificate],"[rav-ahuja, sandip-sasha-joy, steve-ryan, rame...",[],[],81813,https://www.edx.org/certificates/professional-...,,"[Data Manipulation, Python (Programming Langua...",data analyst eightcourse professional certific...,0.04695
1342,Business and Operations for a Circular Bio-Eco...,[Wageningen University & Research],Learn more about business and operations in a ...,The world is moving from a linear ‘cradle-to-g...,"Understand and evaluate strategic, technologic...",[Available now],"[Energy & Earth Sciences, Economics & Finance,...",[Advanced],[English],Program,[MicroMasters],"[maria-barbosa, jan-vreeburg, jos-bijman, argy...",[],[],584,https://www.edx.org/masters/micromasters/wagen...,,"[Business Strategies, Influencing Skills, Supp...",business operation circular bioeconomy world m...,0.047149


# FastText x WMD

In [22]:
kv_ft = fasttext_model(courses_text, save_path="app/models/ft_courses.kv")

dists = []
for text_ in courses_text:
    # distance between two list of word
    dist = kv_ft.wmdistance(nltk.word_tokenize(text), nltk.word_tokenize(text_))
    dists.append(dist)

ft_wmd_dist = pd.DataFrame(courses_data).iloc[np.argsort(dists), :]
ft_wmd_dist.loc[:,['wmd_dist']] = np.sort(dists)
ft_wmd_dist[:11]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,wmd_dist
1282,Essential quantitative business skills,[Tecnológico de Monterrey],Gain a solid foundation of essential quantitat...,"In today's work environment, it is vital for p...",You will develop your capacity for quantitativ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Program,[Professional Certificate],"[ivan-adolfo-valdovinos-hernandez, jose-antoni...",[],[],909,https://www.edx.org/certificates/professional-...,,"[Financial Modeling, Data Science, Finance, Fi...",essential quantitative business skill today wo...,0.987855
1146,Fundamentals of Financial Analysis,[Babson College],Learn the foundational accounting and finance ...,Making decisions based on financial data is es...,How primary financial statements are construct...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter]",[],[],3269,https://www.edx.org/certificates/professional-...,,"[Financial Statements, Financial Analysis, Fin...",fundamental financial analysis making decision...,1.025044
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,1.031466
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,1.034075
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,1.035142
108,Mathematical Methods for Quantitative Finance,[Massachusetts Institute of Technology],Learn the mathematical foundations essential f...,\nProbability distributions in finance\nTime-s...,Modern finance is the science of decision maki...,[Available now],"[Economics & Finance, Math, Engineering]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, paul-f-mende]","[Arabic, English, Spanish (Latin America), Ind...","[Portuguese - Brazil, Spanish, Arabic, Indones...",5782,https://www.edx.org/learn/finance/massachusett...,12.0,"[Planning, Financial Market, Chartered Financi...",mathematical method quantitative finance proba...,1.035565
850,Financial Statement Analysis: Company Forecast...,[Rice University],Strengthen your skills as an investor or an an...,You will be able to:\n\n\nAnalyze and effectiv...,Rice University’s online business courses offe...,[Available now],[Business & Management],[Introductory],[English],Course,[],[brian-akins],"[Arabic, English, Spanish (Latin America), Ind...","[Spanish, Portuguese - Brazil]",436,https://www.edx.org/learn/business-management/...,5.0,[],financial statement analysis company forecast ...,1.043995
296,Derivatives Markets: Advanced Modeling and Str...,[Massachusetts Institute of Technology],Financial derivatives are ubiquitous in global...,\nAdvanced derivatives pricing approaches adap...,Financial derivatives are ubiquitous in global...,[Available now],"[Economics & Finance, Data Analysis & Statistics]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, deborah-j-lucas]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Portuguese - Brazil, Indones...",1784,https://www.edx.org/learn/finance/massachusett...,12.0,"[Capital Markets, Commercial Banking, Hedge Fu...",derivative market advanced modeling strategy a...,1.046231
1016,Financial Decision-Making for Leaders,[Babson College],Learn financial and quantitative analysis for ...,"Literacy in business financials, regardless of...",Practical skills for evaluating and forecastin...,[Available now],"[Business & Management, Economics & Finance, D...",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter, rick-cleary, natha...",[],[],5061,https://www.edx.org/certificates/professional-...,,"[Finance, Performance Metric, Financial Foreca...",financial decisionmaking leader literacy busin...,1.053278
1276,Big Data,[University of Adelaide],Learn how to transform big data into business ...,Big data is changing the way businesses operat...,"How to design algorithms,Understand fundamenta...",[Available now],"[Computer Science, Data Analysis & Statistics]","[Intermediate, Introductory, Advanced]",[English],Program,[MicroMasters],"[aneta-neumann, dr-katrina-falkner, dr-claudia...",[],[],954,https://www.edx.org/masters/micromasters/adela...,,"[Analytical Techniques, Data Science, Computat...",big data big data changing way business operat...,1.060081


# Sentence Transformers x Cosine

In [None]:
corpus_embeddings, model = sentence_transformers_embeddings(courses_text, save_model_path = "app/models/st_model", save_index_path = "app/embeddings/st_courses.index")
# reverse the cos similarity value, so that 0 is the most similar
dists = 1-cosine_similarity([model.encode(text)], corpus_embeddings).flatten()
st_cos_dist = pd.DataFrame(courses_data).iloc[np.argsort(dists), :]
st_cos_dist.loc[:,['cos_dist']] = np.sort(dists)
st_cos_dist[:11]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
1292,Actuarial Science: Financial Math and Probability,[The University of Wisconsin-Madison],This certificate program consists of four cour...,This certificate program consists of four cour...,How to perform calculations relating to the pr...,[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Program,[Professional Certificate],[gordon-enderle-6],[],[],839,https://www.edx.org/certificates/professional-...,,"[Cash Flows, Mathematical Finance, Random Vari...",actuarial science financial math probability c...,0.352453
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.379374
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.388441
938,Financial Analysis of Insurance Companies – In...,[New York Institute of Finance],"Take a deep dive into the operating practices,...",\nDescribe major industry trends and challenge...,Want to gain a solid understanding of the uniq...,[Available now],[Economics & Finance],[Advanced],[English],Course,[Professional Certificate],[jack-farmer],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, French, Hindi, Indonesian, Telugu, Po...",376,https://www.edx.org/learn/financial-analysis/n...,3.0,"[Generally Accepted Accounting Principles, Sol...",financial analysis insurance company industry ...,0.39166
871,Finance for Non-finance Professionals,[University of Cambridge],This course covers the essential skills that n...,\nEssential financial acumen and accounting fu...,Finance and accounting.\nThis module covers th...,[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],[doug-williamson],[],[],424,https://www.edx.org/learn/economics-finance/un...,5.0,[],finance nonfinance professional essential fina...,0.398511
1016,Financial Decision-Making for Leaders,[Babson College],Learn financial and quantitative analysis for ...,"Literacy in business financials, regardless of...",Practical skills for evaluating and forecastin...,[Available now],"[Business & Management, Economics & Finance, D...",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter, rick-cleary, natha...",[],[],5061,https://www.edx.org/certificates/professional-...,,"[Finance, Performance Metric, Financial Foreca...",financial decisionmaking leader literacy busin...,0.406254
625,Developing the Risk Management Plan with Exper...,"[The University of Maryland, College Park]",Every project faces risk. The project risk man...,● The fundamentals of risk management and thei...,"Project risk management involves identifying, ...",[Available now],"[Engineering, Business & Management, Philosoph...",[Intermediate],[English],Course,[Professional Certificate],[gregory-baecher],"[Arabic, English, Spanish (Latin America), Ind...","[French, Arabic, Portuguese - Brazil, German, ...",644,https://www.edx.org/learn/risk-management/the-...,5.0,"[Planning, Project Risk Management, Cognitive ...",developing risk management plan expert judgeme...,0.429286
1146,Fundamentals of Financial Analysis,[Babson College],Learn the foundational accounting and finance ...,Making decisions based on financial data is es...,How primary financial statements are construct...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter]",[],[],3269,https://www.edx.org/certificates/professional-...,,"[Financial Statements, Financial Analysis, Fin...",fundamental financial analysis making decision...,0.432655
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,0.442066
1391,Financial Analysis of Insurance Companies,[New York Institute of Finance],,Financial Analysis of Insurance Companies Prof...,Describe major industry trends and challenges....,[Available now],"[Economics & Finance, Business & Management]",[Advanced],[English],Program,[Professional Certificate],[jack-farmer],[],[],410,https://www.edx.org/certificates/professional-...,,"[Financial Analysis, Casualty Insurance]",financial analysis insurance company financial...,0.448696


# Methods Evaluation

## Traditional Methods

In [35]:
display(bow_cos_dist[:10])

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,0.686791
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.704315
486,Statistical Inference and Modeling for High-th...,[Harvard University],A focus on the techniques commonly used to per...,\nOrganizing high throughput data\nMultiple co...,In this course you’ll learn various statistics...,[Available now],"[Data Analysis & Statistics, Biology & Life Sc...",[Intermediate],[English],Course,"[XSeries, Professional Certificate]","[michael-love, rafael-irizarry]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Hindi, Telugu, Indonesian, Swahili, P...",915,https://www.edx.org/learn/statistics/harvard-u...,4.0,"[Life Sciences, Matrix Algebra, Exploratory Da...",statistical inference modeling highthroughput ...,0.708139
1282,Essential quantitative business skills,[Tecnológico de Monterrey],Gain a solid foundation of essential quantitat...,"In today's work environment, it is vital for p...",You will develop your capacity for quantitativ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Program,[Professional Certificate],"[ivan-adolfo-valdovinos-hernandez, jose-antoni...",[],[],909,https://www.edx.org/certificates/professional-...,,"[Financial Modeling, Data Science, Finance, Fi...",essential quantitative business skill today wo...,0.710327
415,Risk Management and Credit Principles,[New York Institute of Finance],"Get introduced to risk management, credit prin...",\nIntroduction to Risk Management and Credit P...,This course is Part 1 of the Credit and Credit...,[Available now],[Economics & Finance],[Advanced],[English],Course,"[Professional Certificate, Professional Certif...",[tracy-williams],"[Arabic, English, Spanish (Latin America), Ind...","[German, Spanish, Swahili, Portuguese - Brazil...",1137,https://www.edx.org/learn/risk-management/new-...,6.0,"[Credit Risk, Credit Analysis, Loss Given Defa...",risk management credit principle introduction ...,0.740077
752,Introduction to Data Science and Basic Statist...,[Tecnológico de Monterrey],In this course you will acquire statistical me...,"Through this course, participants will be able...",This course allows you to develop skills of a ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Course,[Professional Certificate],[jose-antonio-nunez-mora],"[Arabic, English, Spanish (Latin America), Ind...",[],517,https://www.edx.org/learn/data-analysis/tecnol...,4.0,"[Probability Distribution, Decision Making, Da...",introduction data science basic statistic busi...,0.740102
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.740977
317,Financial Analysis for Decision Making,[Babson College],Learn how to analyze business opportunities fo...,\nTools and techniques for funding a growing b...,How do you find the money necessary to effecti...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Course,"[XSeries, Professional Certificate, Profession...",[mark-potter],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Portuguese - Brazil, Indones...",1659,https://www.edx.org/learn/financial-analysis/b...,4.0,"[Financial Analysis, Securities (Finance), Fin...",financial analysis decision making tool techni...,0.746832
383,Analytics for Decision Making,[Babson College],Discover the foundational concepts that suppor...,\nVariability in the real world and implicatio...,Want to know how to avoid bad decisions with d...,[Available now],"[Business & Management, Data Analysis & Statis...",[Introductory],[English],Course,"[XSeries, Professional Certificate, Profession...","[davit-khachatryan, nathan-karst, george-recck...","[Arabic, English, Spanish (Latin America), Ind...","[Spanish, Portuguese - Brazil, Arabic, Indones...",1268,https://www.edx.org/learn/decision-making/babs...,4.0,"[Statistical Thinking, Quantitative Models Of ...",analytics decision making variability real wor...,0.747009
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,0.747626


In [34]:
display(tfidf_cos_dist[:10])

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.652402
1292,Actuarial Science: Financial Math and Probability,[The University of Wisconsin-Madison],This certificate program consists of four cour...,This certificate program consists of four cour...,How to perform calculations relating to the pr...,[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Program,[Professional Certificate],[gordon-enderle-6],[],[],839,https://www.edx.org/certificates/professional-...,,"[Cash Flows, Mathematical Finance, Random Vari...",actuarial science financial math probability c...,0.736053
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.740282
1230,The FinTech: Future of Payments,[University of Toronto],Learn how financial innovations impact the glo...,"Over the last decade, the world has seen a tra...","Gain insight into the past, present, and futur...",[Available now],[Economics & Finance],[Introductory],[English],Program,[Professional Certificate],[andreas-park-3],[],[],1325,https://www.edx.org/certificates/professional-...,,"[Blockchain, Finance, Financial Technology (Fi...",fintech future payment last decade world seen ...,0.77866
486,Statistical Inference and Modeling for High-th...,[Harvard University],A focus on the techniques commonly used to per...,\nOrganizing high throughput data\nMultiple co...,In this course you’ll learn various statistics...,[Available now],"[Data Analysis & Statistics, Biology & Life Sc...",[Intermediate],[English],Course,"[XSeries, Professional Certificate]","[michael-love, rafael-irizarry]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Hindi, Telugu, Indonesian, Swahili, P...",915,https://www.edx.org/learn/statistics/harvard-u...,4.0,"[Life Sciences, Matrix Algebra, Exploratory Da...",statistical inference modeling highthroughput ...,0.8177
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,0.819472
938,Financial Analysis of Insurance Companies – In...,[New York Institute of Finance],"Take a deep dive into the operating practices,...",\nDescribe major industry trends and challenge...,Want to gain a solid understanding of the uniq...,[Available now],[Economics & Finance],[Advanced],[English],Course,[Professional Certificate],[jack-farmer],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, French, Hindi, Indonesian, Telugu, Po...",376,https://www.edx.org/learn/financial-analysis/n...,3.0,"[Generally Accepted Accounting Principles, Sol...",financial analysis insurance company industry ...,0.819794
1391,Financial Analysis of Insurance Companies,[New York Institute of Finance],,Financial Analysis of Insurance Companies Prof...,Describe major industry trends and challenges....,[Available now],"[Economics & Finance, Business & Management]",[Advanced],[English],Program,[Professional Certificate],[jack-farmer],[],[],410,https://www.edx.org/certificates/professional-...,,"[Financial Analysis, Casualty Insurance]",financial analysis insurance company financial...,0.829323
1153,Personal Finance,[Indiana University],The Personal Finance Professional Certificate ...,Create Your Financial Plan: The Personal Finan...,This certificate is part of the Brian D. Jelli...,[Available now],[Economics & Finance],[Introductory],[English],Program,[Professional Certificate],"[kenneth-carrow, todd-roberson]",[],[],3197,https://www.edx.org/certificates/professional-...,,"[Financial Services, Financial Literacy, Perso...",personal finance create financial plan persona...,0.832927
75,Foundations of Finance,[University of Cambridge],"This course provides a rigorous, but straightf...",\nMoney and Capital\nCash flows and Cash flow ...,"This course provides a rigorous, but straightf...","[Available now, Archived]",[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],[doug-williamson],[],[],8863,https://www.edx.org/learn/finance/university-o...,5.0,"[Corporate Finance, Financial Management, Prob...",foundation finance money capital cash flow cas...,0.844056


Both BoW and TF-IDF mostly produce results that are relevant to our input:
1. Career is actuaries
2. Major is actuarial science
3. Interests & skills are in finance, math, statistics, risk assessment, retirement planning

Recall that this is the preprocessed input text, a combination of career information, user's major, interests, and skills:

'actuary actuarial analyst actuary consulting actuary pricing actuary analyze statistical data mortality accident sickness disability retirement rate construct probability table forecast risk liability payment future benefit may ascertain insurance rate required cash reserve necessary ensure payment future benefit ascertain premium rate required cash reserve liability necessary ensure payment future benefit collaborate programmer underwriter account claim expert senior management help company develop plan new line business improvement existing business analyze statistical information estimate mortality accident sickness disability retirement rate actuarial science financial risk analysis insurance modeling mathematical modeling finance forecasting simulation pension retirement planning datadriven decision making statistical analysis risk assessment financial modeling data interpretation problemsolving attention detail'


Performance between TF-IDF and BoW is slightly the same, however the top 1 course for BoW is "Cybersecurity Risk Management" which is not really related to finance & actuarial science. It is probably there because the frequent presence of the word "risk". This course also appears in TF-IDF method although in sixth place.

## Modern Methods

In [33]:
w2v_wmd_dist[:10]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,wmd_dist
1282,Essential quantitative business skills,[Tecnológico de Monterrey],Gain a solid foundation of essential quantitat...,"In today's work environment, it is vital for p...",You will develop your capacity for quantitativ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Program,[Professional Certificate],"[ivan-adolfo-valdovinos-hernandez, jose-antoni...",[],[],909,https://www.edx.org/certificates/professional-...,,"[Financial Modeling, Data Science, Finance, Fi...",essential quantitative business skill today wo...,0.806005
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.810218
1146,Fundamentals of Financial Analysis,[Babson College],Learn the foundational accounting and finance ...,Making decisions based on financial data is es...,How primary financial statements are construct...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter]",[],[],3269,https://www.edx.org/certificates/professional-...,,"[Financial Statements, Financial Analysis, Fin...",fundamental financial analysis making decision...,0.825016
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,0.830947
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.843338
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,0.847774
108,Mathematical Methods for Quantitative Finance,[Massachusetts Institute of Technology],Learn the mathematical foundations essential f...,\nProbability distributions in finance\nTime-s...,Modern finance is the science of decision maki...,[Available now],"[Economics & Finance, Math, Engineering]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, paul-f-mende]","[Arabic, English, Spanish (Latin America), Ind...","[Portuguese - Brazil, Spanish, Arabic, Indones...",5782,https://www.edx.org/learn/finance/massachusett...,12.0,"[Planning, Financial Market, Chartered Financi...",mathematical method quantitative finance proba...,0.848932
850,Financial Statement Analysis: Company Forecast...,[Rice University],Strengthen your skills as an investor or an an...,You will be able to:\n\n\nAnalyze and effectiv...,Rice University’s online business courses offe...,[Available now],[Business & Management],[Introductory],[English],Course,[],[brian-akins],"[Arabic, English, Spanish (Latin America), Ind...","[Spanish, Portuguese - Brazil]",436,https://www.edx.org/learn/business-management/...,5.0,[],financial statement analysis company forecast ...,0.852087
1016,Financial Decision-Making for Leaders,[Babson College],Learn financial and quantitative analysis for ...,"Literacy in business financials, regardless of...",Practical skills for evaluating and forecastin...,[Available now],"[Business & Management, Economics & Finance, D...",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter, rick-cleary, natha...",[],[],5061,https://www.edx.org/certificates/professional-...,,"[Finance, Performance Metric, Financial Foreca...",financial decisionmaking leader literacy busin...,0.854624
296,Derivatives Markets: Advanced Modeling and Str...,[Massachusetts Institute of Technology],Financial derivatives are ubiquitous in global...,\nAdvanced derivatives pricing approaches adap...,Financial derivatives are ubiquitous in global...,[Available now],"[Economics & Finance, Data Analysis & Statistics]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, deborah-j-lucas]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Portuguese - Brazil, Indones...",1784,https://www.edx.org/learn/finance/massachusett...,12.0,"[Capital Markets, Commercial Banking, Hedge Fu...",derivative market advanced modeling strategy a...,0.85647


In [36]:
w2v_cos_dist[:10]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
1049,Finance,[Massachusetts Institute of Technology],Accelerate your finance career. Fast-track you...,The skills and expertise required for a career...,"Comprehend the foundations of modern finance.,...","[Available now, Archived, Upcoming]","[Economics & Finance, Data Analysis & Statisti...",[Advanced],[English],Program,[MicroMasters],"[egor-matveyev, paul-f-mende, leonid-kogan, ji...",[],[],33254,https://www.edx.org/masters/micromasters/mitx-...,,"[Financial Analysis, Derivatives Markets, Fina...",finance skill expertise required career financ...,0.037091
1028,Data Science,[Harvard University],"Learn key data science essentials, including R...",The demand for skilled data science practition...,"Fundamental R programming skills,Statistical c...","[Available now, Upcoming]","[Data Analysis & Statistics, Computer Science,...",[Introductory],[English],Program,[Professional Certificate],[rafael-irizarry],[],[],125573,https://www.edx.org/certificates/professional-...,,"[Github, Data Science, Unix, Git (Version Cont...",data science demand skilled data science pract...,0.038733
1312,Certified Forecaster and Demand Planner (CFDP),[International Supply Chain Education Alliance],ISCEA’s Certified Forecaster and Demand Planne...,Demand planning has always been an essential p...,"Supply chain management essentials, inventory ...","[Available now, Archived, Upcoming]","[Business & Management, Data Analysis & Statis...",[Introductory],[English],Program,[Professional Certificate],[jorge-morales],[],[],705,https://www.edx.org/certificates/professional-...,,"[Demand Planning, Inventory Management, Sales ...",certified forecaster demand planner cfdp deman...,0.043071
1269,Data Analytics for Digital Transformation,[Dartmouth College],The Data Analytics for Digital Transformation ...,Dartmouth Engineering’s Data Analytics for Dig...,Predictive Analytics: Build and validate model...,[Available now],"[Data Analysis & Statistics, Computer Science,...","[Introductory, Intermediate]",[English],Program,[Professional Certificate],"[reed-harder, vikrant-vaze]",[],[],991,https://www.edx.org/certificates/professional-...,,"[Digital Transformation, Planning, Resource Al...",data analytics digital transformation dartmout...,0.043092
1044,Statistics and Data Science (Methods Track),[Massachusetts Institute of Technology],Master different data science methods such as ...,Data scientists bring value to organizations a...,"Master the foundations of data science, statis...","[Available now, Archived, Upcoming]","[Data Analysis & Statistics, Math, Computer Sc...","[Advanced, Intermediate]",[English],Program,[MicroMasters],"[dimitri-bertsekas, philippe-rigollet, jan-chr...",[],[],38446,https://www.edx.org/masters/micromasters/mitx-...,,"[Decision Theories, Data Science, Time Series ...",statistic data science method track data scien...,0.043323
1249,Certified Lifestyle Medicine Executive,[Doane University],Get the skills you need to be a transformative...,Health systems around the world are confrontin...,Assess how to lead healthcare organizations in...,[Available now],"[Business & Management, Medicine, Economics & ...",[Intermediate],[English],Program,[MicroMasters],"[nicholas-king, alice-kindschuh, kimberley-mei...",[],[],1147,https://www.edx.org/masters/micromasters/doane...,,"[Healthcare Delivery Models, Management, Commu...",certified lifestyle medicine executive health ...,0.04413
1223,Corporate Finance and Valuation Methods,[New York Institute of Finance],This introductory program helps students under...,Develop an understanding of how the key princi...,Fundamental of Financial Mathematics and Capit...,[Available now],[Economics & Finance],[Introductory],[English],Program,[Professional Certificate],[douglas-carroll],[],[],1393,https://www.edx.org/certificates/professional-...,,"[Mathematical Finance, Discounted Cash Flow, C...",corporate finance valuation method develop und...,0.044678
1426,Machine Learning Operations with Amazon Web Se...,[Statistics.com],Machine Learning Operations (MLOps) is the cor...,Machine Learning Operations (MLOps) lies at th...,How to create a pipeline using AWS to ingest d...,"[Available now, Archived]","[Computer Science, Data Analysis & Statistics,...",[Intermediate],[English],Program,[Professional Certificate],"[evan-wimpey, vic-diloreto, laura-lancheros, g...",[],[],330,https://www.edx.org/certificates/professional-...,,"[Data Pipeline, Data Science, Data Engineering...",machine learning operation amazon web service ...,0.046392
1033,Data Analyst,[IBM],,This eight-course Professional Certificate fro...,"Create charts and plots in Excel, and work wit...","[Available now, Archived]","[Data Analysis & Statistics, Computer Science,...","[Introductory, Intermediate, Advanced]",[English],Program,[Professional Certificate],"[rav-ahuja, sandip-sasha-joy, steve-ryan, rame...",[],[],81813,https://www.edx.org/certificates/professional-...,,"[Data Manipulation, Python (Programming Langua...",data analyst eightcourse professional certific...,0.04695
1342,Business and Operations for a Circular Bio-Eco...,[Wageningen University & Research],Learn more about business and operations in a ...,The world is moving from a linear ‘cradle-to-g...,"Understand and evaluate strategic, technologic...",[Available now],"[Energy & Earth Sciences, Economics & Finance,...",[Advanced],[English],Program,[MicroMasters],"[maria-barbosa, jan-vreeburg, jos-bijman, argy...",[],[],584,https://www.edx.org/masters/micromasters/wagen...,,"[Business Strategies, Influencing Skills, Supp...",business operation circular bioeconomy world m...,0.047149


In [37]:
ft_wmd_dist[:10]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,wmd_dist
1282,Essential quantitative business skills,[Tecnológico de Monterrey],Gain a solid foundation of essential quantitat...,"In today's work environment, it is vital for p...",You will develop your capacity for quantitativ...,"[Available now, Archived]","[Data Analysis & Statistics, Business & Manage...",[Advanced],[Spanish],Program,[Professional Certificate],"[ivan-adolfo-valdovinos-hernandez, jose-antoni...",[],[],909,https://www.edx.org/certificates/professional-...,,"[Financial Modeling, Data Science, Finance, Fi...",essential quantitative business skill today wo...,0.987855
1146,Fundamentals of Financial Analysis,[Babson College],Learn the foundational accounting and finance ...,Making decisions based on financial data is es...,How primary financial statements are construct...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter]",[],[],3269,https://www.edx.org/certificates/professional-...,,"[Financial Statements, Financial Analysis, Fin...",fundamental financial analysis making decision...,1.025044
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,1.031466
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,1.034075
935,Cybersecurity Risk Management,[Rochester Institute of Technology],"Learn key principles of risk analysis, risk as...",\nInformation security risk management framewo...,Cybersecurity risk management guides a growing...,[Available now],[Computer Science],[Advanced],[English],Course,[MicroMasters],[tong-sun],"[Arabic, English, Spanish (Latin America), Ind...","[Hindi, Indonesian, Swahili, Telugu, Portugues...",376,https://www.edx.org/learn/risk-management/roch...,8.0,"[Quantitative Research, Analytical Techniques,...",cybersecurity risk management information secu...,1.035142
108,Mathematical Methods for Quantitative Finance,[Massachusetts Institute of Technology],Learn the mathematical foundations essential f...,\nProbability distributions in finance\nTime-s...,Modern finance is the science of decision maki...,[Available now],"[Economics & Finance, Math, Engineering]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, paul-f-mende]","[Arabic, English, Spanish (Latin America), Ind...","[Portuguese - Brazil, Spanish, Arabic, Indones...",5782,https://www.edx.org/learn/finance/massachusett...,12.0,"[Planning, Financial Market, Chartered Financi...",mathematical method quantitative finance proba...,1.035565
850,Financial Statement Analysis: Company Forecast...,[Rice University],Strengthen your skills as an investor or an an...,You will be able to:\n\n\nAnalyze and effectiv...,Rice University’s online business courses offe...,[Available now],[Business & Management],[Introductory],[English],Course,[],[brian-akins],"[Arabic, English, Spanish (Latin America), Ind...","[Spanish, Portuguese - Brazil]",436,https://www.edx.org/learn/business-management/...,5.0,[],financial statement analysis company forecast ...,1.043995
296,Derivatives Markets: Advanced Modeling and Str...,[Massachusetts Institute of Technology],Financial derivatives are ubiquitous in global...,\nAdvanced derivatives pricing approaches adap...,Financial derivatives are ubiquitous in global...,[Available now],"[Economics & Finance, Data Analysis & Statistics]",[Advanced],[English],Course,"[MicroMasters, MicroMasters]","[egor-matveyev, deborah-j-lucas]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Portuguese - Brazil, Indones...",1784,https://www.edx.org/learn/finance/massachusett...,12.0,"[Capital Markets, Commercial Banking, Hedge Fu...",derivative market advanced modeling strategy a...,1.046231
1016,Financial Decision-Making for Leaders,[Babson College],Learn financial and quantitative analysis for ...,"Literacy in business financials, regardless of...",Practical skills for evaluating and forecastin...,[Available now],"[Business & Management, Economics & Finance, D...",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter, rick-cleary, natha...",[],[],5061,https://www.edx.org/certificates/professional-...,,"[Finance, Performance Metric, Financial Foreca...",financial decisionmaking leader literacy busin...,1.053278
1276,Big Data,[University of Adelaide],Learn how to transform big data into business ...,Big data is changing the way businesses operat...,"How to design algorithms,Understand fundamenta...",[Available now],"[Computer Science, Data Analysis & Statistics]","[Intermediate, Introductory, Advanced]",[English],Program,[MicroMasters],"[aneta-neumann, dr-katrina-falkner, dr-claudia...",[],[],954,https://www.edx.org/masters/micromasters/adela...,,"[Analytical Techniques, Data Science, Computat...",big data big data changing way business operat...,1.060081


In [38]:
st_cos_dist[:10]

Unnamed: 0,title,partner,primary_description,secondary_description,tertiary_description,availability,subject,level,language,product,program_type,staff,translation_language,transcription_language,recent_enrollment_count,marketing_url,weeks_to_complete,skill,text,cos_dist
1292,Actuarial Science: Financial Math and Probability,[The University of Wisconsin-Madison],This certificate program consists of four cour...,This certificate program consists of four cour...,How to perform calculations relating to the pr...,[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Program,[Professional Certificate],[gordon-enderle-6],[],[],839,https://www.edx.org/certificates/professional-...,,"[Cash Flows, Mathematical Finance, Random Vari...",actuarial science financial math probability c...,0.352453
834,Planning for Risk and Retirement,[Indiana University],"Learn to assess your tolerance for risk, evalu...","Through course videos, selected short readings...","Planning for Risk, Retirement and Investment i...",[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],"[kenneth-carrow, todd-roberson]","[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian, Portuguese - Bra...",450,https://www.edx.org/learn/risk-management/indi...,4.0,"[Planning, Financial Services, Financial Liter...",planning risk retirement course video selected...,0.379374
877,Financial Math for Actuaries: From Rates to An...,[The University of Wisconsin-Madison],Start your actuarial career! Study the time-va...,Those enrolled in Financial Math for Actuaries...,"In preparation for SOA Exam FM / CAS Exam 2, t...",[Available now],"[Economics & Finance, Math]",[Intermediate],[English],Course,[Professional Certificate],[gordon-enderle-6],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, Spanish, Indonesian]",421,https://www.edx.org/learn/actuarial-science/th...,6.0,"[Time Value Of Money, Cash Flows, Mathematical...",financial math actuary rate annuity enrolled f...,0.388441
938,Financial Analysis of Insurance Companies – In...,[New York Institute of Finance],"Take a deep dive into the operating practices,...",\nDescribe major industry trends and challenge...,Want to gain a solid understanding of the uniq...,[Available now],[Economics & Finance],[Advanced],[English],Course,[Professional Certificate],[jack-farmer],"[Arabic, English, Spanish (Latin America), Ind...","[Arabic, French, Hindi, Indonesian, Telugu, Po...",376,https://www.edx.org/learn/financial-analysis/n...,3.0,"[Generally Accepted Accounting Principles, Sol...",financial analysis insurance company industry ...,0.39166
871,Finance for Non-finance Professionals,[University of Cambridge],This course covers the essential skills that n...,\nEssential financial acumen and accounting fu...,Finance and accounting.\nThis module covers th...,[Available now],[Economics & Finance],[Introductory],[English],Course,[Professional Certificate],[doug-williamson],[],[],424,https://www.edx.org/learn/economics-finance/un...,5.0,[],finance nonfinance professional essential fina...,0.398511
1016,Financial Decision-Making for Leaders,[Babson College],Learn financial and quantitative analysis for ...,"Literacy in business financials, regardless of...",Practical skills for evaluating and forecastin...,[Available now],"[Business & Management, Economics & Finance, D...",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter, rick-cleary, natha...",[],[],5061,https://www.edx.org/certificates/professional-...,,"[Finance, Performance Metric, Financial Foreca...",financial decisionmaking leader literacy busin...,0.406254
625,Developing the Risk Management Plan with Exper...,"[The University of Maryland, College Park]",Every project faces risk. The project risk man...,● The fundamentals of risk management and thei...,"Project risk management involves identifying, ...",[Available now],"[Engineering, Business & Management, Philosoph...",[Intermediate],[English],Course,[Professional Certificate],[gregory-baecher],"[Arabic, English, Spanish (Latin America), Ind...","[French, Arabic, Portuguese - Brazil, German, ...",644,https://www.edx.org/learn/risk-management/the-...,5.0,"[Planning, Project Risk Management, Cognitive ...",developing risk management plan expert judgeme...,0.429286
1146,Fundamentals of Financial Analysis,[Babson College],Learn the foundational accounting and finance ...,Making decisions based on financial data is es...,How primary financial statements are construct...,[Available now],"[Business & Management, Economics & Finance]",[Introductory],[English],Program,[Professional Certificate],"[peter-wilson, mark-potter]",[],[],3269,https://www.edx.org/certificates/professional-...,,"[Financial Statements, Financial Analysis, Fin...",fundamental financial analysis making decision...,0.432655
1556,Statistics for Business Analytics,[The University of Queensland],,Business analytics is the ability to collate a...,Use basic probability concepts and apply vario...,[Available now],"[Data Analysis & Statistics, Business & Manage...",[Introductory],[English],Program,[Professional Certificate],[temesgen-kifle],[],[],-34,https://www.edx.org/certificates/professional-...,,"[Statistical Analysis, Statistical Inference, ...",statistic business analytics business analytic...,0.442066
1391,Financial Analysis of Insurance Companies,[New York Institute of Finance],,Financial Analysis of Insurance Companies Prof...,Describe major industry trends and challenges....,[Available now],"[Economics & Finance, Business & Management]",[Advanced],[English],Program,[Professional Certificate],[jack-farmer],[],[],410,https://www.edx.org/certificates/professional-...,,"[Financial Analysis, Casualty Insurance]",financial analysis insurance company financial...,0.448696


If we were to rank them based on relevance, the order would be:

1. SentenceTransformer × Cosine
2. Word2Vec (Pre-Trained) × WMD
3. FastText (Not Pre-Trained) × WMD
4. Word2Vec (Pre-Trained) × Cosine

The Word2Vec × Cosine approach produced results that were worse than BoW and TF-IDF, as it introduced more courses related to data and statistics rather than finance and actuarial science.

On the other hand, Word2Vec × WMD and FastText × WMD delivered similar performance. Both methods suggested courses relevant to finance, mathematics, and actuarial science.

We conclude that the best-performing method is SentenceTransformer × Cosine, primarily because it was the only method that did not recommend "Cybersecurity Risk Management". This is due to SentenceTransformer’s ability to capture semantic meaning rather than relying on word frequency. While the word "risk" appears in the input text, SentenceTransformer correctly interprets it in the context of financial risk, not in the broader or unrelated domain of information technology.

# Conclusion

The most effective method among all those evaluated is SentenceTransformer with Cosine similarity. Its strength lies in its ability to exclude irrelevant results, such as "Cybersecurity Risk Management". This is due to its focus on capturing semantic meaning.

For example, while the input text contains the word "risk", the model correctly interprets it in a financial context—based on surrounding words, rather than suggesting unrelated fields like information technology.

To improve response time when the API is called, both the model and its precomputed embeddings are saved in advance for quick loading.