In [7]:
import os
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

def preprocess_texts(texts):
    return [preprocess_text(text) for text in texts]

def text_to_embedding(texts, model, tfidf_vectorizer):
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    def get_word_vector(word):
        try:
            return model[word]
        except KeyError:
            return np.zeros(model.vector_size)

    embeddings = []
    for text in texts:
        text_embedding = np.zeros(model.vector_size)
        word_count = 0
        for word in text.split():
            if word in feature_names:
                tfidf_value = tfidf_vectorizer.transform([word]).data[0]
                word_embedding = get_word_vector(word) * tfidf_value
                text_embedding += word_embedding
                word_count += 1
        if word_count > 0:
            text_embedding /= word_count
        embeddings.append(text_embedding)
    return np.array(embeddings)

def compute_similarity(embeddings, query_embedding):
    similarities = cosine_similarity([query_embedding], embeddings).flatten()
    return similarities

def collect_descriptions_from_folder(folder_path):
    descriptions = []
    file_to_folder = {}
    desc_folder = os.path.join(folder_path, 'desc')
    pages_folder = os.path.join(folder_path, 'pages')

    for filename in os.listdir(desc_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(desc_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()
                descriptions.append(content)
                file_to_folder[content] = os.path.join(pages_folder, filename[:-4])
    
    return descriptions, file_to_folder

def retrieve_top_k_websites(folder_path, user_input, word2vec_model, tfidf_vectorizer, k=5):
    descriptions, file_to_folder = collect_descriptions_from_folder(folder_path)
    preprocessed_descriptions = preprocess_texts(descriptions)
    embeddings = text_to_embedding(preprocessed_descriptions, word2vec_model, tfidf_vectorizer)
    preprocessed_input = preprocess_text(user_input) 
    input_embedding = text_to_embedding([preprocessed_input], word2vec_model, tfidf_vectorizer)[0]
    similarities = compute_similarity(embeddings, input_embedding)
    top_k_indices = similarities.argsort()[-k:][::-1]
    top_k_folders = [file_to_folder[descriptions[i]] for i in top_k_indices]

    return top_k_folders[:k]

folder_path = folders[0]
user_input = "Enroll in our advanced data science course today."

import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")

vectorizer = TfidfVectorizer()

top_k_folders = retrieve_top_k_websites(folder_path, user_input, word2vec_model, vectorizer, k=3)
print(f"\nTop K Website Folders based on user input:")
for folder in top_k_folders:
    print(folder)



Top K Website Folders based on user input:
F:/GP/topic2_courses\pages\9
F:/GP/topic2_courses\pages\8
F:/GP/topic2_courses\pages\5
