In [27]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources (uncomment these lines if not downloaded yet)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Convert to lowercase for consistency

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

# Example usage
user_input = input("Enter a sentence: ")
processed_tokens = preprocess_text(user_input)
print("Processed Tokens:", processed_tokens)


Processed Tokens: ['want', 'build', 'websit', 'buy', 'coffe']


In [30]:
# Simulating data collection by creating a dataset of HTML and CSS codes.

html_css_dataset = [
    {
        'html': '''
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Awesome Coffee Shop</title>
                <link rel="stylesheet" href="styles.css">
                <style>
                    body {
                        font-family: 'Arial', sans-serif;
                        margin: 0;
                        padding: 0;
                        background-color: #f8f8f8;
                    }

                    header {
                        background-color: #333;
                        color: white;
                        text-align: center;
                        padding: 1em;
                    }

                    main {
                        padding: 2em;
                    }

                    h1 {
                        color: #333;
                    }

                    p {
                        color: #555;
                    }

                    ul {
                        list-style-type: none;
                        padding: 0;
                    }

                    ul li {
                        margin-bottom: 0.5em;
                    }

                    img {
                        max-width: 100%;
                        height: auto;
                    }
                </style>
            </head>
            <body>

                <header>
                    <h1>Awesome Coffee Shop</h1>
                </header>

                <main>
                    <h2>Welcome to Our Cozy Coffee Shop</h2>
                    <p>Indulge in the rich aroma and flavors of our premium coffee. We offer a relaxing environment for you to enjoy your favorite brews.</p>

                    <h2>Our Menu</h2>
                    <p>Explore our diverse menu of coffee, tea, and delicious pastries.</p>
                    <ul>
                        <li>Espresso</li>
                        <li>Cappuccino</li>
                        <li>Latte</li>
                        <li>Americano</li>
                        <li>Chai Tea</li>
                        <li>Assorted Pastries</li>
                    </ul>

                    <h2>Visit Us Today!</h2>
                    <p>We are located at:</p>
                    <address>
                        123 Coffee Street, Cityville, CO 12345
                    </address>

                    <h2>Follow Us</h2>
                    <p>Stay connected with us on social media:</p>
                    <a href="https://www.facebook.com/awesomecoffeeshop" target="_blank">Facebook</a> |
                    <a href="https://twitter.com/awesomecoffee" target="_blank">Twitter</a> |
                    <a href="https://www.instagram.com/awesomecoffeeshop" target="_blank">Instagram</a>

                    <h2>Gallery</h2>
                    <img src="coffee-shop-interior.jpg" alt="Coffee Shop Interior">

                </main>

            </body>
            </html>
        ''',
        'css': '''
            body {
                font-family: 'Arial', sans-serif;
                margin: 0;
                padding: 0;
                background-color: #f8f8f8;
            }

            header {
                background-color: #333;
                color: white;
                text-align: center;
                padding: 1em;
            }

            main {
                padding: 2em;
            }

            h1 {
                color: #333;
            }

            p {
                color: #555;
            }

            ul {
                list-style-type: none;
                padding: 0;
            }

            ul li {
                margin-bottom: 0.5em;
            }

            img {
                max-width: 100%;
                height: auto;
            }
        ''',
        'text_content':'',
        'images':'',
        'links':''
    },
    {
        'html': '''
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Sporty Gear Shop</title>
                <link rel="stylesheet" href="styles.css">
            </head>
            <body>

            <header>
                <h1>Sporty Gear Shop</h1>
            </header>

            <main>
                <h2>Welcome to Our Sports Gear Shop</h2>
                <p>Discover a wide range of high-quality sports gear for all your athletic needs.</p>

                <h2>Our Product Categories</h2>
                <ul>
                    <li>Athletic Shoes</li>
                    <li>Workout Apparel</li>
                    <li>Training Equipment</li>
                    <li>Sports Accessories</li>
                </ul>

            <h2>Visit Us Today!</h2>
            <p>We are located at:</p>
            <address>
                456 Sports Avenue, Fit City, CA 56789
            </address>

                <h2>Follow Us</h2>
                <p>Stay connected with us on social media:</p>
                <a href="https://www.facebook.com/sportygearshop" target="_blank">Facebook</a> |
                <a href="https://twitter.com/sportygear" target="_blank">Twitter</a> |
                <a href="https://www.instagram.com/sportygearshop" target="_blank">Instagram</a>

                <h2>Gallery</h2>
                <img src="sports-shop-interior.jpg" alt="Sports Shop Interior">

            </main>

            </body>
            </html>
        ''',
        'css': '''
            body {
                font-family: 'Arial', sans-serif;
                margin: 0;
                padding: 0;
                background-color: #f0f0f0;
            }

            header {
                background-color: #2c3e50;
                color: white;
                text-align: center;
                padding: 1em;
            }

            main {
                padding: 2em;
            }

            h1 {
                color: #2c3e50;
            }

            p {
                color: #555;
            }

            ul {
                list-style-type: none;
                padding: 0;
            }

            ul li {
                margin-bottom: 0.5em;
            }

            img {
                max-width: 100%;
                height: auto;
            }
        ''',
        'text_content':'',
        'images':'',
        'links':''
    },
    # Add more entries as needed
]
#text content, images, and links.
# Display the first entry in the dataset
# print("HTML Code:")
# print(html_css_dataset[0]['html'])
# print("\nCSS Code:")
# print(html_css_dataset[0]['css'])


In [33]:
from bs4 import BeautifulSoup

def clean_html(html):
    # Parse HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Remove script and style tags
    for tag in soup(['script', 'style']):
        tag.decompose()

    # Get text content
    cleaned_html = soup.get_text(separator=' ')

    return cleaned_html

def clean_css(css):
    # Remove comments
    css = css.replace('/*', '').replace('*/', '')

    # Remove unnecessary spaces
    css = ' '.join(css.split())

    return css

# Example HTML and CSS data
html_data = html_css_dataset[0]['html']
css_data = html_css_dataset[0]['css']

# # Clean and preprocess HTML
# cleaned_html = clean_html(html_data)
# print("Cleaned HTML:")
# print(cleaned_html)

# # Clean and preprocess CSS
# cleaned_css = clean_css(css_data)
# print("\nCleaned CSS:")
# print(cleaned_css)


In [34]:
from bs4 import BeautifulSoup, Comment
import cssutils

def extract_features(html, css):
    # Parse HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Extract relevant features from HTML
    html_features = {
        # ... (unchanged)
         'num_tags': len(soup.find_all()),
        'num_links': len(soup.find_all('a')),
        'num_paragraphs': len(soup.find_all('p')),
        'num_images': len(soup.find_all('img')),
        'num_headings': len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
        'num_lists': len(soup.find_all(['ul', 'ol', 'dl'])),
        'num_forms': len(soup.find_all('form')),
        'num_inputs': len(soup.find_all('input')),
        'num_buttons': len(soup.find_all('button')),
        'num_tables': len(soup.find_all('table')),
        'num_divs': len(soup.find_all('div')),
        'num_spans': len(soup.find_all('span')),
        'num_classes': len(set([cls for tag in soup.find_all() for cls in tag.get('class', [])])),
        'num_ids': len(set([tag.get('id') for tag in soup.find_all()])),
        'has_header': bool(soup.find('header')),
        'has_footer': bool(soup.find('footer')),
        'has_nav': bool(soup.find('nav')),
        'has_inline_styles': bool(soup.find(style=True)),
        'has_script': bool(soup.find('script')),
        'has_meta_tags': bool(soup.find('meta')),
        'has_comments': bool(soup.find_all(string=lambda text: isinstance(text, Comment))),
        'has_svg': bool(soup.find('svg')),
        'has_canvas': bool(soup.find('canvas')),
        'has_audio': bool(soup.find('audio')),
        'has_video': bool(soup.find('video')),
    }
    # Parse CSS using cssutils
    css_parser = cssutils.CSSParser()
    css_sheet = css_parser.parseString(css)

    # Extract relevant features from CSS
    num_styles = 0
    num_rules = 0
    unique_properties = set()
    font_styles = set()
    color_styles = set()

    for rule in css_sheet:
        if isinstance(rule, cssutils.css.CSSStyleRule):
            num_rules += 1
            num_styles += rule.style.length  # Use the length attribute to get the number of styles
            unique_properties.update(rule.style.keys())
            font_styles.update(set(['font-family', 'font-size', 'font-weight', 'font-style']) & set(rule.style.keys()))
            color_styles.update(set(['color', 'background-color']) & set(rule.style.keys()))

    css_features = {
        'num_styles': num_styles,
        'num_rules': num_rules,
        'unique_properties': len(unique_properties),
        'font_styles': len(font_styles),
        'color_styles': len(color_styles),
        'has_keyframes': any('@keyframes' in rule.cssText for rule in css_sheet),
        'has_web_fonts': bool(soup.find('link', {'rel': 'stylesheet', 'type': 'font/woff2'})),
        'has_print_styles': '@media print' in css,
        'has_transformations': 'transform' in unique_properties,
        'has_gradients': any('gradient' in prop for prop in unique_properties),
        'has_fixed_positioning': 'position: fixed' in css,
        'has_overflow_property': 'overflow' in unique_properties,
        'has_animations': 'animation' in unique_properties,
        # Add more features as needed
    }

    return html_features, css_features

# Example usage
html_code = '<html><body><p>Example HTML</p><a href="#">Link</a></body></html>'
css_code = 'body { font-size: 16px; color: #333; } a { text-decoration: none; }'
html_features, css_features = extract_features(html_data, css_data)

print("HTML Features:", html_features)
print("CSS Features:", css_features)


HTML Features: {'num_tags': 32, 'num_links': 3, 'num_paragraphs': 4, 'num_images': 1, 'num_headings': 6, 'num_lists': 1, 'num_forms': 0, 'num_inputs': 0, 'num_buttons': 0, 'num_tables': 0, 'num_divs': 0, 'num_spans': 0, 'num_classes': 0, 'num_ids': 1, 'has_header': True, 'has_footer': False, 'has_nav': False, 'has_inline_styles': False, 'has_script': False, 'has_meta_tags': True, 'has_comments': False, 'has_svg': False, 'has_canvas': False, 'has_audio': False, 'has_video': False}
CSS Features: {'num_styles': 16, 'num_rules': 8, 'unique_properties': 10, 'font_styles': 1, 'color_styles': 2, 'has_keyframes': False, 'has_web_fonts': False, 'has_print_styles': False, 'has_transformations': False, 'has_gradients': False, 'has_fixed_positioning': False, 'has_overflow_property': False, 'has_animations': False}


In [28]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense


# Convert tokens to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_tokens)
sequences = tokenizer.texts_to_sequences(processed_tokens)

# Pad sequences to make them uniform in length
padded_sequences = pad_sequences([sequences], maxlen=MAX_SEQUENCE_LENGTH)

# Assuming MAX_SEQUENCE_LENGTH is the maximum length of sequences expected by your model

# Define and compile your LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(128))
model.add(Dense(OUTPUT_DIM, activation='softmax'))

# Compile the model (choose appropriate loss and optimizer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Use the model to generate predictions
predictions = model.predict(padded_sequences)

# Decode the predictions back to words
decoded_predictions = [list(tokenizer.word_index.keys())[list(tokenizer.word_index.values()).index(idx)] for idx in np.argmax(predictions, axis=-1)[0]]

# Print the decoded predictions
print("Decoded Predictions:", decoded_predictions)


ModuleNotFoundError: No module named 'keras'

In [29]:
def postprocess_predictions(predictions):
    # Example: Assume predictions are probabilities, and you want to threshold them for binary classification
    threshold = 0.5
    binary_predictions = (predictions > threshold).astype(int)

    # Example: Convert binary predictions to HTML and CSS code
    html_code = "<html>\n<head>\n</head>\n<body>\n"
    css_code = "body {\n"

    if binary_predictions[0] == 1:
        # Include specific HTML and CSS elements based on the model's prediction
        html_code += "<h1>This is a relevant page</h1>\n"
        css_code += "    background-color: #f0f0f0;\n"
    else:
        html_code += "<p>This is not a relevant page</p>\n"
        css_code += "    background-color: #ffffff;\n"

    html_code += "</body>\n</html>"
    css_code += "}\n"

    return html_code, css_code


In [3]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

data = np.array([
    [0.123, 0.456, 0.789, 0.987, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456],
    [0.111, 0.222, 0.333, 0.444, 0.555],
    # Add more vectors as needed
    [0.222, 0.444, 0.666, 0.888, 0.999],
    [0.555, 0.777, 0.111, 0.333, 0.666],
    [0.999, 0.333, 0.111, 0.555, 0.777],
    [0.333, 0.111, 0.888, 0.444, 0.222],
    [0.666, 0.999, 0.444, 0.777, 0.555],
    [0.222, 0.777, 0.555, 0.888, 0.333],
    [0.888, 0.555, 0.222, 0.999, 0.666],
    [0.444, 0.222, 0.999, 0.333, 0.111],
    [0.111, 0.666, 0.777, 0.888, 0.999],
    # Add more vectors as needed
])

# Input vector for which you want to find the closest neighbors
input_vector = np.array([0.567, 0.876, 0.234, 0.543, 0.789])

# Number of neighbors to retrieve
k = 5

# Create and fit the KNN model
knn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')  # You can use 'cosine' for cosine similarity
knn_model.fit(data)

# Find the closest neighbors for the input vector
distances, indices = knn_model.kneighbors([input_vector])

# Display the top k closest vectors
for i in range(k):
    print(f"Vector: {data[indices[0][i]]}, Distance: {distances[0][i]}")


Vector: [0.555 0.777 0.111 0.333 0.666], Distance: 0.29034978904762443
Vector: [0.666 0.999 0.444 0.777 0.555], Distance: 0.4225423055742466
Vector: [0.888 0.555 0.222 0.999 0.666], Distance: 0.6552030219710528
Vector: [0.999 0.333 0.111 0.555 0.777], Distance: 0.7049042488168161
Vector: [0.987 0.654 0.321 0.123 0.456], Distance: 0.7214859665994897


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

data = np.array([
    [0.123, 0.456, 0.789, 0.987, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456],
    [0.111, 0.222, 0.333, 0.444, 0.555],
    # Add more vectors as needed
    [0.222, 0.444, 0.666, 0.888, 0.999],
    [0.555, 0.777, 0.111, 0.333, 0.666],
    [0.999, 0.333, 0.111, 0.555, 0.777],
    [0.333, 0.111, 0.888, 0.444, 0.222],
    [0.666, 0.999, 0.444, 0.777, 0.555],
    [0.222, 0.777, 0.555, 0.888, 0.333],
    [0.888, 0.555, 0.222, 0.999, 0.666],
    [0.444, 0.222, 0.999, 0.333, 0.111],
    [0.111, 0.666, 0.777, 0.888, 0.999],
    # Add more vectors as needed
])

# Input vector for which you want to find the closest neighbors
input_vector = np.array([0.567, 0.876, 0.234, 0.543, 0.789])


# Number of neighbors to retrieve
k = 5

# Calculate cosine similarity
similarity_scores = cosine_similarity([input_vector], data)

# Get indices of the top k similar vectors
top_k_indices = np.argsort(similarity_scores[0])[-k:][::-1]

# Display the top k closest vectors
for i in top_k_indices:
    print(f"Vector: {data[i]}, Cosine Similarity: {similarity_scores[0][i]}")


Vector: [0.555 0.777 0.111 0.333 0.666], Cosine Similarity: 0.9896748899148851
Vector: [0.666 0.999 0.444 0.777 0.555], Cosine Similarity: 0.9667436934325361
Vector: [0.888 0.555 0.222 0.999 0.666], Cosine Similarity: 0.9135244571918312
Vector: [0.999 0.333 0.111 0.555 0.777], Cosine Similarity: 0.8786489861497001
Vector: [0.111 0.666 0.777 0.888 0.999], Cosine Similarity: 0.86647416051087
