In [107]:
import pandas as pd

df = pd.read_csv("shl_scraped_product_catalog.csv")

In [108]:
df.head()

Unnamed: 0.1,Unnamed: 0,Assessment Name,URL,Description,Assessment Duration,Test Type,Remote Testing Support,Adaptive/IRT Support
0,0,Account Manager Solution/,https://www.shl.com/solutions/products/product...,The Account Manager solution is an assessment ...,Approximate Completion Time in minutes = 49,"C, P, A, B, A, B, C, D, E, K, P, S",Yes,Yes
1,1,Administrative Professional Short Form/,https://www.shl.com/solutions/products/product...,The Administrative Professional solution is fo...,Approximate Completion Time in minutes = 36,"A, K, P, A, B, C, D, E, K, P, S",Yes,Yes
2,2,Agency Manager Solution/,https://www.shl.com/solutions/products/product...,The Agency Manager solution is for mid-level s...,Approximate Completion Time in minutes = 51,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes
3,3,Apprentice 8 0 Job Focused Assessment 4261/,https://www.shl.com/solutions/products/product...,The Apprentice + 8.0 Job-Focused Assessment is...,Approximate Completion Time in minutes = 30,"B, P, A, B, C, D, E, K, P, S",Yes,No
4,4,Apprentice 8 0 Job Focused Assessment/,https://www.shl.com/solutions/products/product...,The Apprentice 8.0 Job-Focused Assessment is a...,Approximate Completion Time in minutes = 20,"B, P, A, B, C, D, E, K, P, S",Yes,No


In [109]:
df.columns

Index(['Unnamed: 0', 'Assessment Name', 'URL', 'Description',
       'Assessment Duration', 'Test Type', 'Remote Testing Support',
       'Adaptive/IRT Support'],
      dtype='object')

### Basic Preprocessing of Scraped Text

In [110]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

df['Assessment Name'] = df['Assessment Name'].apply(preprocess_text)
df['Description'] = df['Description'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GuptaSub\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GuptaSub\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GuptaSub\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [111]:
df.head()

Unnamed: 0.1,Unnamed: 0,Assessment Name,URL,Description,Assessment Duration,Test Type,Remote Testing Support,Adaptive/IRT Support
0,0,account manager,https://www.shl.com/solutions/products/product...,account manager solution assessment used job c...,Approximate Completion Time in minutes = 49,"C, P, A, B, A, B, C, D, E, K, P, S",Yes,Yes
1,1,administrative professional short,https://www.shl.com/solutions/products/product...,administrative professional solution entry pos...,Approximate Completion Time in minutes = 36,"A, K, P, A, B, C, D, E, K, P, S",Yes,Yes
2,2,agency manager,https://www.shl.com/solutions/products/product...,agency manager solution sale management positi...,Approximate Completion Time in minutes = 51,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes
3,3,apprentice job focused assessment,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 30,"B, P, A, B, C, D, E, K, P, S",Yes,No
4,4,apprentice job focused,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 20,"B, P, A, B, C, D, E, K, P, S",Yes,No


In [112]:
int(df['Assessment Duration'][0].split(' ')[-1])

49

In [113]:
df['Assessment Length'] = df['Assessment Duration'].apply(lambda x: int(x.split(' ')[-1]))
df['Description Length'] = df['Description'].apply(lambda x: len(x.split()))
df

Unnamed: 0.1,Unnamed: 0,Assessment Name,URL,Description,Assessment Duration,Test Type,Remote Testing Support,Adaptive/IRT Support,Assessment Length,Description Length
0,0,account manager,https://www.shl.com/solutions/products/product...,account manager solution assessment used job c...,Approximate Completion Time in minutes = 49,"C, P, A, B, A, B, C, D, E, K, P, S",Yes,Yes,49,54
1,1,administrative professional short,https://www.shl.com/solutions/products/product...,administrative professional solution entry pos...,Approximate Completion Time in minutes = 36,"A, K, P, A, B, C, D, E, K, P, S",Yes,Yes,36,47
2,2,agency manager,https://www.shl.com/solutions/products/product...,agency manager solution sale management positi...,Approximate Completion Time in minutes = 51,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes,51,48
3,3,apprentice job focused assessment,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 30,"B, P, A, B, C, D, E, K, P, S",Yes,No,30,29
4,4,apprentice job focused,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 20,"B, P, A, B, C, D, E, K, P, S",Yes,No,20,24
...,...,...,...,...,...,...,...,...,...,...
379,379,bank operation supervisor short,https://www.shl.com/solutions/products/product...,bank operation supervisor solution job candida...,Approximate Completion Time in minutes = 45,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes,45,71
380,380,bilingual spanish reservation agent,https://www.shl.com/solutions/products/product...,bilingual reservation agent solution designed ...,Approximate Completion Time in minutes = 43,"B, P, S, A, A, B, C, D, E, K, P, S",Yes,Yes,43,52
381,381,bookkeeping accounting auditing clerk short,https://www.shl.com/solutions/products/product...,bookkeeping accounting auditing clerk solution...,Approximate Completion Time in minutes = 49,"P, S, K, B, A, A, B, C, D, E, K, P, S",Yes,Yes,49,48
382,382,branch manager short,https://www.shl.com/solutions/products/product...,solution mid financial institution managerial ...,Approximate Completion Time in minutes = 50,"A, B, P, A, B, C, D, E, K, P, S",Yes,No,50,41


In [115]:
df = df.drop("Unnamed: 0", axis = "columns")

In [116]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation

# vectorizer = TfidfVectorizer(max_features=5000)
# tfidf_matrix = vectorizer.fit_transform(df['Assessment Name'])

# #Check this on the link :- "https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html"
# lda_model = LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# lda_matrix = lda_model.fit_transform(tfidf_matrix)


In [117]:
# lda_model


In [118]:
# lda_matrix

In [None]:
pip install faiss-cpu #faiss-gpu if you have GPU

Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl (13.7 MB)
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/13.7 MB 262.6 kB/s eta 0:00:53
   ---------------------------------------- 0.1/13.7 MB 901.1 kB/s eta 0:00:16
    --------------------------------------- 0.3/13.7 MB 1.9 MB/s eta 0:00:08
   - -------------------------------------- 0.5/13.7 MB 2.1 MB/s eta 0:00:07
   - -------------------------------------- 0.6/13.7 MB 2.4 MB/s eta 0:00:06
   -- ------------------------------------- 0.8/13.7 MB 2.5 MB/s eta 0:00:06
   -- ------------------------------------- 1.0/13.7 MB 2.7 MB/s eta 0:00:05
   --- -

In [131]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

# Select the column(s) you want to encode

df['combined_data_for_knowledge'] = df.apply(lambda row: list(row.values), axis=1)

df




Unnamed: 0,Assessment Name,URL,Description,Assessment Duration,Test Type,Remote Testing Support,Adaptive/IRT Support,Assessment Length,Description Length,combined_data_for_knowledge
0,account manager,https://www.shl.com/solutions/products/product...,account manager solution assessment used job c...,Approximate Completion Time in minutes = 49,"C, P, A, B, A, B, C, D, E, K, P, S",Yes,Yes,49,54,"[account manager, https://www.shl.com/solution..."
1,administrative professional short,https://www.shl.com/solutions/products/product...,administrative professional solution entry pos...,Approximate Completion Time in minutes = 36,"A, K, P, A, B, C, D, E, K, P, S",Yes,Yes,36,47,"[administrative professional short, https://ww..."
2,agency manager,https://www.shl.com/solutions/products/product...,agency manager solution sale management positi...,Approximate Completion Time in minutes = 51,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes,51,48,"[agency manager, https://www.shl.com/solutions..."
3,apprentice job focused assessment,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 30,"B, P, A, B, C, D, E, K, P, S",Yes,No,30,29,"[apprentice job focused assessment, https://ww..."
4,apprentice job focused,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 20,"B, P, A, B, C, D, E, K, P, S",Yes,No,20,24,"[apprentice job focused, https://www.shl.com/s..."
...,...,...,...,...,...,...,...,...,...,...
379,bank operation supervisor short,https://www.shl.com/solutions/products/product...,bank operation supervisor solution job candida...,Approximate Completion Time in minutes = 45,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes,45,71,"[bank operation supervisor short, https://www...."
380,bilingual spanish reservation agent,https://www.shl.com/solutions/products/product...,bilingual reservation agent solution designed ...,Approximate Completion Time in minutes = 43,"B, P, S, A, A, B, C, D, E, K, P, S",Yes,Yes,43,52,"[bilingual spanish reservation agent, https://..."
381,bookkeeping accounting auditing clerk short,https://www.shl.com/solutions/products/product...,bookkeeping accounting auditing clerk solution...,Approximate Completion Time in minutes = 49,"P, S, K, B, A, A, B, C, D, E, K, P, S",Yes,Yes,49,48,"[bookkeeping accounting auditing clerk short, ..."
382,branch manager short,https://www.shl.com/solutions/products/product...,solution mid financial institution managerial ...,Approximate Completion Time in minutes = 50,"A, B, P, A, B, C, D, E, K, P, S",Yes,No,50,41,"[branch manager short, https://www.shl.com/sol..."


In [120]:
len(df)

384

In [134]:
newdf = df.drop_duplicates(subset = ['Assessment Name'])


In [140]:
len(newdf)

12

In [135]:
newdf

Unnamed: 0,Assessment Name,URL,Description,Assessment Duration,Test Type,Remote Testing Support,Adaptive/IRT Support,Assessment Length,Description Length,combined_data_for_knowledge
0,account manager,https://www.shl.com/solutions/products/product...,account manager solution assessment used job c...,Approximate Completion Time in minutes = 49,"C, P, A, B, A, B, C, D, E, K, P, S",Yes,Yes,49,54,"[account manager, https://www.shl.com/solution..."
1,administrative professional short,https://www.shl.com/solutions/products/product...,administrative professional solution entry pos...,Approximate Completion Time in minutes = 36,"A, K, P, A, B, C, D, E, K, P, S",Yes,Yes,36,47,"[administrative professional short, https://ww..."
2,agency manager,https://www.shl.com/solutions/products/product...,agency manager solution sale management positi...,Approximate Completion Time in minutes = 51,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes,51,48,"[agency manager, https://www.shl.com/solutions..."
3,apprentice job focused assessment,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 30,"B, P, A, B, C, D, E, K, P, S",Yes,No,30,29,"[apprentice job focused assessment, https://ww..."
4,apprentice job focused,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 20,"B, P, A, B, C, D, E, K, P, S",Yes,No,20,24,"[apprentice job focused, https://www.shl.com/s..."
5,bank administrative assistant short,https://www.shl.com/solutions/products/product...,administrative assistant solution clerical pos...,Approximate Completion Time in minutes = 35,"A, B, K, P, A, B, C, D, E, K, P, S",Yes,No,35,37,"[bank administrative assistant short, https://..."
6,bank collection agent short,https://www.shl.com/solutions/products/product...,solution collection position inbound outbound ...,Approximate Completion Time in minutes = 45,"A, B, P, A, B, C, D, E, K, P, S",Yes,No,45,33,"[bank collection agent short, https://www.shl...."
7,bank operation supervisor short,https://www.shl.com/solutions/products/product...,bank operation supervisor solution job candida...,Approximate Completion Time in minutes = 45,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes,45,71,"[bank operation supervisor short, https://www...."
8,bilingual spanish reservation agent,https://www.shl.com/solutions/products/product...,bilingual reservation agent solution designed ...,Approximate Completion Time in minutes = 43,"B, P, S, A, A, B, C, D, E, K, P, S",Yes,Yes,43,52,"[bilingual spanish reservation agent, https://..."
9,bookkeeping accounting auditing clerk short,https://www.shl.com/solutions/products/product...,bookkeeping accounting auditing clerk solution...,Approximate Completion Time in minutes = 49,"P, S, K, B, A, A, B, C, D, E, K, P, S",Yes,Yes,49,48,"[bookkeeping accounting auditing clerk short, ..."


In [121]:
# df.drop_duplicates(subset = ['Assessment Name'], inplace = True)

### Token indices sequence length is longer than the specified maximum sequence length for this model (1189 > 512). 

In [122]:
## We drop some token so that it fits the max. sequence length 

#### Check max.length of each column entry

In [136]:
max_lengths = newdf.select_dtypes(include=['object']).apply(lambda x: x.str.len().max())
max_lengths

Assessment Name                 43
URL                            109
Description                    585
Assessment Duration             43
Test Type                       37
Remote Testing Support           3
Adaptive/IRT Support             3
combined_data_for_knowledge     10
dtype: int64

## Conclusion we only take description limited to possible max.sequence length + duation of each course

In [147]:
# Create a Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(newdf['Description'])

# Create a FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])  # Use L2 = Euclidean distance for similarity search

# Add the encoded embeddings to the FAISS index
index.add(embeddings)

# Define a function to search for similar embeddings
def search_similar(query_text):
    # Encode the query text
    query_embedding = model.encode([query_text])

    # Search for similar embeddings
    D, I = index.search(query_embedding, k=10)  # Return the top 20 most similar embeddings

    # Remove duplicates from the indices array
    I = numpy.unique(I[0])

    # Select the top 10 most similar embeddings
    most_relevant_indices = I[:10]

    return most_relevant_indices



In [148]:
# Test the function
query_text = 'I am hiring for Java developers who can also collaborate effectively with my business teams. Looking \
                for an assessment(s) that can be completed in 40 minutes.'
most_similar_indices = search_similar(query_text)

# Get the most similar courses
most_similar_courses = newdf.iloc[most_similar_indices]

# Print the results
(most_similar_courses)



Unnamed: 0,Assessment Name,URL,Description,Assessment Duration,Test Type,Remote Testing Support,Adaptive/IRT Support,Assessment Length,Description Length,combined_data_for_knowledge
0,account manager,https://www.shl.com/solutions/products/product...,account manager solution assessment used job c...,Approximate Completion Time in minutes = 49,"C, P, A, B, A, B, C, D, E, K, P, S",Yes,Yes,49,54,"[account manager, https://www.shl.com/solution..."
1,administrative professional short,https://www.shl.com/solutions/products/product...,administrative professional solution entry pos...,Approximate Completion Time in minutes = 36,"A, K, P, A, B, C, D, E, K, P, S",Yes,Yes,36,47,"[administrative professional short, https://ww..."
3,apprentice job focused assessment,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 30,"B, P, A, B, C, D, E, K, P, S",Yes,No,30,29,"[apprentice job focused assessment, https://ww..."
4,apprentice job focused,https://www.shl.com/solutions/products/product...,apprentice assessment short targeted globally ...,Approximate Completion Time in minutes = 20,"B, P, A, B, C, D, E, K, P, S",Yes,No,20,24,"[apprentice job focused, https://www.shl.com/s..."
5,bank administrative assistant short,https://www.shl.com/solutions/products/product...,administrative assistant solution clerical pos...,Approximate Completion Time in minutes = 35,"A, B, K, P, A, B, C, D, E, K, P, S",Yes,No,35,37,"[bank administrative assistant short, https://..."
6,bank collection agent short,https://www.shl.com/solutions/products/product...,solution collection position inbound outbound ...,Approximate Completion Time in minutes = 45,"A, B, P, A, B, C, D, E, K, P, S",Yes,No,45,33,"[bank collection agent short, https://www.shl...."
7,bank operation supervisor short,https://www.shl.com/solutions/products/product...,bank operation supervisor solution job candida...,Approximate Completion Time in minutes = 45,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes,45,71,"[bank operation supervisor short, https://www...."
8,bilingual spanish reservation agent,https://www.shl.com/solutions/products/product...,bilingual reservation agent solution designed ...,Approximate Completion Time in minutes = 43,"B, P, S, A, A, B, C, D, E, K, P, S",Yes,Yes,43,52,"[bilingual spanish reservation agent, https://..."
9,bookkeeping accounting auditing clerk short,https://www.shl.com/solutions/products/product...,bookkeeping accounting auditing clerk solution...,Approximate Completion Time in minutes = 49,"P, S, K, B, A, A, B, C, D, E, K, P, S",Yes,Yes,49,48,"[bookkeeping accounting auditing clerk short, ..."
10,branch manager short,https://www.shl.com/solutions/products/product...,solution mid financial institution managerial ...,Approximate Completion Time in minutes = 50,"A, B, P, A, B, C, D, E, K, P, S",Yes,No,50,41,"[branch manager short, https://www.shl.com/sol..."


In [144]:
newdf.to_csv("Cleaned_SHL_catalog.csv")

### Trying Google Gemini Emebedding Models