In [None]:
pip install tensorflow gensim scikit-learn



In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Restrict the 20 Newsgroups corpus to four topics (4-way classification).
categories = [
    'rec.sport.baseball',
    'sci.space',
    'comp.graphics',
    'talk.politics.mideast',
]

# subset='all' loads the train and test portions of the corpus together.
newsgroups = fetch_20newsgroups(categories=categories, subset='all')

# Raw document strings and their integer class ids.
texts, labels = newsgroups.data, newsgroups.target

In [None]:
# Fit the encoder on the class ids, then map every label to its encoded
# integer; `le` is kept so predictions can be mapped back later.
le = LabelEncoder().fit(labels)
y = le.transform(labels)

In [None]:
# Learn the word -> integer-index vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Encode each document as a list of vocabulary indices (one list per text).
X = list(tokenizer.texts_to_sequences_generator(texts))

In [None]:
# Every document is forced to exactly this many tokens.
max_length = 200

# The padding/truncating modes are the pad_sequences defaults, spelled out:
# short documents get zeros prepended, long documents keep their LAST
# `max_length` tokens.
X = pad_sequences(X, maxlen=max_length, padding='pre', truncating='pre')

In [None]:
# Train Word2Vec on exactly the tokens the Keras tokenizer produces.
# Splitting the raw text on whitespace keeps case and attached punctuation,
# whereas the tokenizer lowercases and strips punctuation, so many entries of
# `tokenizer.word_index` would have no Word2Vec vector and end up as all-zero
# rows in the embedding matrix. Mapping the encoded sequences back to words
# guarantees both vocabularies are identical.
tokenized_docs = tokenizer.texts_to_sequences(texts)
sentences = [[tokenizer.index_word[idx] for idx in doc] for doc in tokenized_docs]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# +1 because tokenizer indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Read the vector width from the trained model instead of repeating the
# literal, so changing `vector_size` in the Word2Vec call cannot leave this
# matrix with a mismatched shape.
embedding_dim = word2vec_model.wv.vector_size

# Row i holds the Word2Vec vector of the word with tokenizer index i.
# Row 0 (padding) and any word without a vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [None]:
# Take the embedding dimensions from the matrix itself so the layer can never
# disagree with the pretrained weights it is initialised from.
embedding_vocab, embedding_dim = embedding_matrix.shape

model = Sequential()
# Frozen pretrained embeddings.
# - `input_length` is omitted: it is deprecated in Keras 3 and the sequence
#   length is inferred from the data on the first call.
# - mask_zero=True marks index 0 (the padding value) as masked so the LSTM
#   skips padded timesteps instead of treating them as real input.
model.add(Embedding(input_dim=embedding_vocab, output_dim=embedding_dim,
                    weights=[embedding_matrix], trainable=False,
                    mask_zero=True))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# One softmax output per newsgroup category.
model.add(Dense(len(categories), activation='softmax'))

# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Train for five epochs; Keras holds out the last 20% of the samples
# (taken before shuffling) as validation data.
history = model.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
49/49 ━━━━━━━━━━━━━━━━━━━━ 22s 333ms/step - accuracy: 0.2980 - loss: 1.3727 - val_accuracy: 0.4403 - val_loss: 1.2454
Epoch 2/5
49/49 ━━━━━━━━━━━━━━━━━━━━ 15s 312ms/step - accuracy: 0.4394 - loss: 1.2364 - val_accuracy: 0.4788 - val_loss: 1.1676
Epoch 3/5
49/49 ━━━━━━━━━━━━━━━━━━━━ 15s 314ms/step - accuracy: 0.5183 - loss: 1.1150 - val_accuracy: 0.5610 - val_loss: 1.0307
Epoch 4/5
49/49 ━━━━━━━━━━━━━━━━━━━━ 15s 305ms/step - accuracy: 0.5588 - loss: 1.0474 - val_accuracy: 0.5956 - val_loss: 0.9678
Epoch 5/5
49/49 ━━━━━━━━━━━━━━━━━━━━ 21s 316ms/step - accuracy: 0.5788 - loss: 0.9830 - val_accuracy: 0.6175 - val_loss: 0.9066


In [None]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

In [None]:
# Evaluate only on samples the model was NOT trained on. Scoring on all of
# (X, y) mixes in the training data and overstates generalisation.
# `validation_split` in model.fit holds out the tail of the arrays, so the
# same slice is rebuilt here.
VALIDATION_SPLIT = 0.2  # must match the `validation_split` passed to model.fit
n_train = int(len(X) * (1.0 - VALIDATION_SPLIT))
X_val, y_val = X[n_train:], y[n_train:]

loss, accuracy = model.evaluate(X_val, y_val)
print("Loss:", loss)
print("Accuracy:", accuracy)

122/122 ━━━━━━━━━━━━━━━━━━━━ 7s 54ms/step - accuracy: 0.6593 - loss: 0.8252
Loss: 0.8480328917503357
Accuracy: 0.6535695791244507
