In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Lazily-built NLTK resources shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop-word set), building them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize a sentence into a space-separated string of content words.

    The text is tokenized with ``nltk.word_tokenize``; each token is lower-cased
    and lemmatized with ``WordNetLemmatizer``; tokens that are not purely
    alphanumeric or that are English stop words are dropped.

    Args:
        text: The sentence to clean. Anything that is not a ``str`` (for
            example ``None`` or a NaN read from a CSV) and any blank string is
            treated as having no content.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when there is
        nothing to keep.
    """
    # Missing values and blank strings have no tokens; return early rather
    # than handing a non-string to the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return ''

    # Reuse one lemmatizer and one stop-word set instead of rebuilding them
    # (and re-reading the stop-word list) for every sentence.
    lemmatizer, stop_words = _get_preprocess_resources()

    # Tokenize, then lower-case before lemmatizing.
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]

    # Keep alphanumeric tokens that are not stop words.
    filtered_tokens = [
        token for token in lemmatized_tokens
        if token.isalnum() and token not in stop_words
    ]

    return ' '.join(filtered_tokens)

def calculate_cosine_similarity(candidate_sentences, target_sentence):
    """Score candidate sentences against a target with bag-of-words cosine similarity.

    The target and every candidate are cleaned with ``preprocess_text``, counted
    by a single ``CountVectorizer`` fitted on all of them (target first), and
    compared pairwise with ``cosine_similarity``.

    Args:
        candidate_sentences: Iterable of sentences to score.
        target_sentence: The sentence every candidate is compared with.

    Returns:
        The pairwise similarity matrix with the target's own row removed: one
        row per candidate, in input order. Column 0 of a row is that
        candidate's similarity to the target; the remaining columns are its
        similarities to the candidates themselves.
    """
    # The target goes first so that it occupies row/column 0 of the matrix.
    corpus = [preprocess_text(target_sentence)]
    corpus.extend(preprocess_text(candidate) for candidate in candidate_sentences)

    # Term-count matrix: one row per document in `corpus`.
    count_matrix = CountVectorizer().fit_transform(corpus)

    # All-pairs cosine similarity between the rows of the count matrix.
    pairwise = cosine_similarity(count_matrix)

    # Drop row 0 (the target itself) and keep only the candidate rows.
    return pairwise[1:]



# Location of the input CSV, named once so it is easy to repoint on another machine.
DATA_PATH = r'C:\Users\shanj\Downloads\potentialtalents.csv'

dataset_df = pd.read_csv(DATA_PATH)

# Missing job titles are read as NaN (a float); replace them with empty strings
# so every entry handed to the text preprocessing is a str.
dataset_column = dataset_df['job_title'].fillna('').astype(str).tolist()

target_sentences = ["Aspiring human resources", "Seeking human resources"]

for target_sentence in target_sentences:
    print(f"\nTarget Sentence: {target_sentence}\n")

    sentence_scores = calculate_cosine_similarity(dataset_column, target_sentence)

    # Each row of sentence_scores belongs to one job title; its first entry is
    # the similarity between that title and the target sentence.
    for i, (sentence, score) in enumerate(zip(dataset_column, sentence_scores), start=1):
        print(f"Sentence {i}: {sentence} (Cosine Similarity Score: {score[0]:.4f})")
    



Target Sentence: Aspiring human resources

Sentence 1: 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional (Cosine Similarity Score: 0.5000)
Sentence 2: Native English Teacher at EPIK (English Program in Korea) (Cosine Similarity Score: 0.0000)
Sentence 3: Aspiring Human Resources Professional (Cosine Similarity Score: 0.8660)
Sentence 4: People Development Coordinator at Ryan (Cosine Similarity Score: 0.0000)
Sentence 5: Advisory Board Member at Celal Bayar University (Cosine Similarity Score: 0.0000)
Sentence 6: Aspiring Human Resources Specialist (Cosine Similarity Score: 0.8660)
Sentence 7: Student at Humber College and Aspiring Human Resources Generalist (Cosine Similarity Score: 0.6547)
Sentence 8: HR Senior Specialist (Cosine Similarity Score: 0.0000)
Sentence 9: Student at Humber College and Aspiring Human Resources Generalist (Cosine Similarity Score: 0.6547)
Sentence 10: Seeking Human Resources HRIS and Generalist Position

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shanj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shanj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shanj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Show the DataFrame loaded from the CSV; `dataset_df` must already exist in the kernel.
dataset_df

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,
102,103,Always set them up for Success,Greater Los Angeles Area,500+,
