HTML Tags Classifier

In [25]:
import pandas as pd
import numpy as np


In [62]:
# Path of the CSV holding the labelled HTML snippets.
# NOTE(review): "/content" looks like a Colab working directory - adjust when running elsewhere.
DATASET_PATH = "/content/dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [63]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,tag,category,context
0,nav,sectioning,<nav>This is inside the <nav> tag.</nav>
1,canvas,scripting,<canvas>This is inside the <canvas> tag.</canvas>
2,noscript,scripting,<noscript>This is inside the <noscript> tag.</...
3,ol,list,<ol>This is inside the <ol> tag.</ol>
4,th,table,<th>This is inside the <th> tag.</th>


In [64]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

Unnamed: 0,0
tag,0
category,0
context,0


In [65]:
# A cell only renders its final expression, so the shape is printed explicitly;
# as a bare expression on a non-final line its value was computed and discarded.
print(df.shape)
df.columns

Index(['tag', 'category', 'context'], dtype='object')

In [66]:
# Only `tag` (the target) and `context` (the input) are used further down, so
# `category` is removed. Reassigning instead of mutating in place, together
# with errors="ignore", keeps the cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing column.
df = df.drop(columns=["category"], errors="ignore")

In [67]:
# Confirm the `category` column is gone.
df.head(n=5)

Unnamed: 0,tag,context
0,nav,<nav>This is inside the <nav> tag.</nav>
1,canvas,<canvas>This is inside the <canvas> tag.</canvas>
2,noscript,<noscript>This is inside the <noscript> tag.</...
3,ol,<ol>This is inside the <ol> tag.</ol>
4,th,<th>This is inside the <th> tag.</th>


In [68]:
# Number of rows available for each tag label.
tag_counts = df["tag"].value_counts()
tag_counts

Unnamed: 0_level_0,count
tag,Unnamed: 1_level_1
link,483
nav,475
main,474
option,473
code,473
...,...
template,408
blockquote,408
legend,403
img,400


In [69]:
from sklearn.preprocessing import LabelEncoder

# Replace the string tag names in `df["tag"]` with integer class ids.
le = LabelEncoder()
df["tag"] = le.fit_transform(df["tag"])

# Lookup from tag name to integer id: the position of a name inside
# `le.classes_` is the id paired with it.
mapping = {label: idx for idx, label in enumerate(le.classes_)}

print(mapping)

{'a': 0, 'abbr': 1, 'article': 2, 'aside': 3, 'audio': 4, 'base': 5, 'blockquote': 6, 'br': 7, 'button': 8, 'canvas': 9, 'caption': 10, 'cite': 11, 'code': 12, 'col': 13, 'colgroup': 14, 'dd': 15, 'dl': 16, 'dt': 17, 'em': 18, 'embed': 19, 'fieldset': 20, 'footer': 21, 'form': 22, 'h1': 23, 'h2': 24, 'h3': 25, 'h4': 26, 'h5': 27, 'h6': 28, 'header': 29, 'iframe': 30, 'img': 31, 'input': 32, 'label': 33, 'legend': 34, 'li': 35, 'link': 36, 'main': 37, 'mark': 38, 'meta': 39, 'nav': 40, 'noscript': 41, 'object': 42, 'ol': 43, 'option': 44, 'p': 45, 'pre': 46, 'script': 47, 'section': 48, 'select': 49, 'slot': 50, 'source': 51, 'span': 52, 'strong': 53, 'style': 54, 'table': 55, 'tbody': 56, 'td': 57, 'template': 58, 'textarea': 59, 'tfoot': 60, 'th': 61, 'thead': 62, 'title': 63, 'tr': 64, 'track': 65, 'ul': 66, 'video': 67}


In [34]:
# Check that `tag` now holds the integer ids.
df.head(n=5)

Unnamed: 0,tag,context
0,40,<nav>This is inside the <nav> tag.</nav>
1,9,<canvas>This is inside the <canvas> tag.</canvas>
2,41,<noscript>This is inside the <noscript> tag.</...
3,43,<ol>This is inside the <ol> tag.</ol>
4,61,<th>This is inside the <th> tag.</th>


Converting the input text into padded index sequences and Word2Vec vectors

In [35]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle  # later cells use it to persist and reload `word_index`

# Tokenize every snippet in `context` into a list of word tokens.
tokens = df["context"].apply(simple_preprocess)

# Train Word2Vec exactly once. Passing the corpus to the constructor already
# builds the vocabulary and trains, so calling build_vocab()/train() afterwards
# would throw that first run away and train a second time. The model is
# therefore created empty and trained explicitly for 10 epochs.
w2v_model = Word2Vec(vector_size=100, window=5, min_count=1)
w2v_model.build_vocab(tokens)
w2v_model.train(tokens, total_examples=len(tokens), epochs=10)

# Build word_index: ids start at 1 so that 0 stays free for padding and for
# words that are not in the vocabulary.
word_index = {word: idx for idx, word in enumerate(w2v_model.wv.index_to_key, start=1)}

# Convert each tokenized snippet into its sequence of word ids.
sequences = [[word_index.get(word, 0) for word in sentence] for sentence in tokens]

# Pad every sequence at the end so all rows share one length.
padded_sequences = pad_sequences(sequences, padding='post')

# Persist the model with gensim's own save(), the documented counterpart of
# the Word2Vec.load() call used when the model is read back.
w2v_model.save('word2vec_model.pkl')






In [36]:
# Width of the padded training matrix; new inputs are padded to this length
# before prediction.
max_len = padded_sequences.shape[1]

In [37]:
# Preparing the embedding matrix: row i holds the Word2Vec vector of the word
# whose id in `word_index` is i.

import numpy as np

# +1 because word ids start at 1; row 0 stays all-zero for the padding id.
vocab_size = len(word_index) + 1
# Take the width from the trained model instead of repeating the literal 100,
# so the matrix cannot fall out of sync with the Word2Vec configuration.
embedding_dim = w2v_model.wv.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    # Words without a trained vector keep their zero row.
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]


In [51]:
# Model inputs are the padded id sequences; targets are the integer-encoded tags.
x, y = padded_sequences, df["tag"]

Building the classification model

In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,GlobalAveragePooling1D
from tensorflow.keras.initializers import Constant

# One output unit per tag label.
num_classes = len(mapping)

# The embedding layer is seeded with the pre-trained Word2Vec matrix through
# `embeddings_initializer`, which works on both Keras 2 and Keras 3. The older
# `weights=[...]` keyword is not accepted by every Keras 3 release, and
# `input_length` is deprecated there, so neither is used. The layer is frozen
# so training only fits the dense classifier on top of the averaged vectors.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False,
    ),
    GlobalAveragePooling1D(),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dense(num_classes, activation="softmax"),
])

In [53]:
# The sparse loss is used because the targets are integer class ids, not
# one-hot vectors.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [55]:

# Train for 5 epochs, reserving 10% of the samples for validation.
model.fit(
    x=x,
    y=y,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)


Epoch 1/5
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9072 - loss: 0.2243 - val_accuracy: 0.9047 - val_loss: 0.2326
Epoch 2/5
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9089 - loss: 0.2187 - val_accuracy: 0.9047 - val_loss: 0.2296
Epoch 3/5
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9090 - loss: 0.2189 - val_accuracy: 0.9067 - val_loss: 0.2286
Epoch 4/5
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9051 - loss: 0.2238 - val_accuracy: 0.9067 - val_loss: 0.2310
Epoch 5/5
[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9059 - loss: 0.2227 - val_accuracy: 0.9047 - val_loss: 0.2309


<keras.src.callbacks.history.History at 0x7a7839507c50>

In [57]:
# Persist the word -> id lookup so the same ids can be rebuilt at inference time.
with open("word_index.pkl", "wb") as word_index_file:
    pickle.dump(word_index, word_index_file)


In [56]:
#showing example


In [71]:
text="<canvas> some canvas</canvas>"

# Step 1 -> Tokenize. A dedicated name is used so the training corpus stored
# in `tokens` is not overwritten by this single example.
text_tokens = simple_preprocess(text)

# Reload the persisted artifacts. Only `word_index` is needed to encode the
# text; the Word2Vec model is reloaded alongside it but is not used below.
w2v_model = Word2Vec.load("word2vec_model.pkl")
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook, never files from an untrusted source.
with open("word_index.pkl", "rb") as f:
    word_index = pickle.load(f)

# Step 2 -> convert into a padded sequence (words not in the index map to 0).
sequence = [word_index.get(word, 0) for word in text_tokens]
padded = pad_sequences([sequence], maxlen=max_len, padding='post')  # use same maxlen as training

# Invert the tag -> id `mapping` built during label encoding. The lookup was
# previously defined in terms of itself, which raises NameError on a fresh
# kernel and flips direction on every re-run.
index_to_label = {idx: label for label, idx in mapping.items()}

prediction = model.predict(padded)
# `prediction` has one row per input; take the arg-max of the single row.
predicted_index = int(np.argmax(prediction[0]))
predicted_label = index_to_label[predicted_index]

print("Predicted Label:", predicted_label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Predicted Label: canvas
