In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import gensim.downloader as api

# Fetch the NLTK resources used below: the stopword list, the WordNet
# data for the lemmatizer, and the punkt tokenizer models.
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

def preprocess_text(text):
    """Tokenize *text* and return its cleaned, lemmatized tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that are English stopwords, are
    dropped; each remaining token is passed through ``WordNetLemmatizer``.

    Returns:
        A list of token strings, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    return [
        wordnet.lemmatize(word)
        for word in nltk.word_tokenize(text.lower())
        if word.isalpha() and word not in english_stopwords
    ]

def preprocess_texts(texts):
    """Run ``preprocess_text`` on every item of *texts*; return a list of token lists."""
    return list(map(preprocess_text, texts))


# Load the "word2vec-google-news-300" word vectors through gensim's
# downloader API; the result is shared by every embedding call below.
word2vec_model = api.load("word2vec-google-news-300")

def text_to_embedding(texts, model, tfidf_vectorizer):
    """Embed each tokenized text as a TF-IDF-weighted mean of word vectors.

    Args:
        texts: Sequence of token lists (one list of words per document).
        model: Word-vector lookup supporting ``model[word]`` (raising
            ``KeyError`` for unknown words) and exposing ``vector_size``.
        tfidf_vectorizer: Vectorizer providing ``fit_transform`` and, once
            fitted, a ``vocabulary_`` mapping of term -> column index.
            NOTE: it is (re)fitted on *texts* by every call, so any
            previous fit is replaced.

    Returns:
        ``numpy`` array with one row of length ``model.vector_size`` per
        text. A text with no token in the vectorizer vocabulary maps to
        the zero vector; a token unknown to *model* contributes a zero
        vector but still counts towards the mean.
    """
    tfidf_matrix = tfidf_vectorizer.fit_transform(
        [' '.join(text) for text in texts]
    ).tocsr()
    # Dict lookup instead of scanning the feature-name array for every token.
    vocabulary = tfidf_vectorizer.vocabulary_

    def get_word_vector(word):
        try:
            return model[word]
        except KeyError:
            return np.zeros(model.vector_size)

    indptr = tfidf_matrix.indptr
    indices = tfidf_matrix.indices
    data = tfidf_matrix.data

    embeddings = []
    for row, text in enumerate(texts):
        # Weights of this document's terms, read straight from its CSR row.
        # Re-transforming each word as a one-word document (as done before)
        # always yields 1.0 under the default L2 normalisation, which made
        # the TF-IDF weighting a no-op.
        start, end = indptr[row], indptr[row + 1]
        row_weights = dict(
            zip(indices[start:end].tolist(), data[start:end].tolist())
        )

        text_embedding = np.zeros(model.vector_size)
        word_count = 0
        for word in text:
            column = vocabulary.get(word)
            if column is None:
                # Token was not kept as a feature by the vectorizer.
                continue
            tfidf_value = row_weights.get(column, 0.0)
            text_embedding += get_word_vector(word) * tfidf_value
            word_count += 1
        if word_count > 0:
            text_embedding /= word_count
        embeddings.append(text_embedding)
    return np.array(embeddings)

def cluster_descriptions(embeddings, num_clusters=2):
    """Fit k-means on *embeddings* and return ``(fitted_model, labels)``.

    Args:
        embeddings: Feature matrix handed to ``KMeans.fit_predict``.
        num_clusters: Number of clusters to form (default 2).

    Returns:
        The fitted ``KMeans`` instance and the cluster label assigned to
        each row of *embeddings*. ``random_state`` is fixed to 0.
    """
    model = KMeans(random_state=0, n_clusters=num_clusters)
    labels = model.fit_predict(embeddings)
    return model, labels

def classify_user_input(user_input, word2vec_model, tfidf_vectorizer, kmeans):
    """Return the cluster label that *kmeans* predicts for *user_input*.

    The raw string is preprocessed with ``preprocess_text``, embedded with
    ``text_to_embedding`` (as a one-document batch), and the resulting
    vector is passed to ``kmeans.predict``.
    """
    tokens = preprocess_text(user_input)
    query_vector = text_to_embedding([tokens], word2vec_model, tfidf_vectorizer)[0]
    predicted = kmeans.predict([query_vector])
    return predicted[0]

def compute_similarity(embeddings, query_embedding):
    """Return the cosine similarity of *query_embedding* to each row of *embeddings*.

    The result is a flat array with one score per row of *embeddings*.
    """
    query_as_row = [query_embedding]
    pairwise = cosine_similarity(query_as_row, embeddings)
    return pairwise.flatten()

def retrieve_top_k_websites(user_input, descriptions, description_to_code_files, word2vec_model, tfidf_vectorizer, kmeans, clusters, k=5):
    """Return code files of the descriptions most similar to *user_input*.

    Only descriptions assigned to the same k-means cluster as the query
    are considered; those are ranked by cosine similarity to the query
    embedding and the best *k* are returned.

    Args:
        user_input: Free-text query.
        descriptions: Description strings, aligned index-wise with *clusters*.
        description_to_code_files: Mapping from description string to the
            value returned for it.
        word2vec_model: Word-vector model forwarded to ``text_to_embedding``.
        tfidf_vectorizer: Vectorizer forwarded to ``text_to_embedding``
            (which refits it on each call).
        kmeans: Fitted clustering model exposing ``predict``.
        clusters: Cluster label of each description.
        k: Maximum number of results (default 5).

    Returns:
        Up to *k* mapped values, most similar first. Fewer are returned
        only when the query's cluster holds fewer than *k* descriptions.
    """
    preprocessed_descriptions = preprocess_texts(descriptions)
    embeddings = text_to_embedding(preprocessed_descriptions, word2vec_model, tfidf_vectorizer)

    # Embed the query once and reuse it for both cluster assignment and
    # similarity ranking.
    preprocessed_input = preprocess_text(user_input)
    input_embedding = text_to_embedding([preprocessed_input], word2vec_model, tfidf_vectorizer)[0]
    user_cluster = kmeans.predict([input_embedding])[0]

    similarities = compute_similarity(embeddings, input_embedding)

    # Filter by cluster BEFORE truncating to k: taking the global top-k first
    # and filtering afterwards could return fewer than k results even when
    # the cluster contains more matches. Comparing against `user_cluster`
    # directly also works for any number of clusters, and working on indices
    # keeps duplicate description strings distinct.
    candidate_indices = [
        index for index, label in enumerate(clusters) if label == user_cluster
    ]
    candidate_indices.sort(key=lambda index: similarities[index], reverse=True)

    return [
        description_to_code_files[descriptions[index]]
        for index in candidate_indices[:max(k, 0)]
    ]

# Sample website descriptions; each one covers both content and design.
descriptions = [
    "This website is an e-commerce platform for selling computers. It has a modern design with a dark theme.",
    "A blog about travel and adventure experiences around the world. The design features vibrant colors and a responsive layout.",
    "A portfolio website showcasing graphic design and photography work. It uses a minimalist design with a focus on visual content.",
    "An online platform for learning and practicing programming languages. The website has a clean design with easy navigation and code examples.",
    "A news website providing the latest updates on technology and science. It has a professional design with a structured layout and news sections.",
]

# Code locations, aligned position-by-position with `descriptions`.
code_file_paths = [
    "path/to/website1",
    "path/to/website2",
    "path/to/website3",
    "path/to/website4",
    "path/to/website5",
]

# Lookup from a description string to its code location.
description_to_code_files = dict(zip(descriptions, code_file_paths))

# Tokenize / clean every description.
preprocessed_descriptions = preprocess_texts(descriptions)

# text_to_embedding fits this vectorizer itself (it calls fit_transform),
# so it is created unfitted here.
tfidf_vectorizer = TfidfVectorizer()
embeddings = text_to_embedding(preprocessed_descriptions, word2vec_model, tfidf_vectorizer)

# Cluster the description embeddings (default: 2 clusters).
kmeans, clusters = cluster_descriptions(embeddings)

user_input = "A platform to buy the latest gadgets and electronic devices. It should have a modern design with interactive elements."

# Retrieve up to 3 code files for the query, restricted to the query's cluster.
top_k_code_files = retrieve_top_k_websites(user_input, descriptions, description_to_code_files, word2vec_model, tfidf_vectorizer, kmeans, clusters, k=3)

print("Top K Code Files:")
for code_file in top_k_code_files:
    print(code_file)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.






Top K Code Files:
path/to/website1
path/to/website4
path/to/website5
