In [18]:
### IMPORTING NECESSARY LIBRARIES
import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Dense, Dropout
from keras.layers import Input
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import one_hot


In [19]:
### LOADING THE DATASET
## Fetch every split of the 20 Newsgroups corpus, asking scikit-learn to
## strip headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
X, y = newsgroups.data, newsgroups.target

## One row per post: the text and its numeric target label.
df = pd.DataFrame({'text': X, 'target': y})
df.head()


Unnamed: 0,text,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4


In [20]:
### TEXT PREPROCESSING

## A single shared stemmer instance, reused for every document.
stemmer = PorterStemmer()

def tokenize_and_stem(text):
    """Stem every whitespace-separated word of ``text``.

    The text is split on whitespace, each piece is passed through the
    module-level ``stemmer``, and the stemmed pieces are re-joined with
    single spaces.

    Args:
        text: The document to process, as a string.

    Returns:
        A string containing the stemmed words separated by single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())

## Applying tokenization and stemming.
## The first time this cell runs, keep an untouched copy of the original text in
## 'text_raw'. Always stem from that copy, so that re-running the cell does not
## stem already-stemmed text and the raw documents stay available.
if 'text_raw' not in df.columns:
    df['text_raw'] = df['text']
df['text'] = df['text_raw'].apply(tokenize_and_stem)



In [21]:
## Split each text on whitespace to get a list of tokens per document.
df['tokens'] = df['text'].map(str.split)

In [22]:
### TRAINING A WORD2VEC MODEL TO CONVERT TOKENS TO VECTORS
## Train 100-dimensional embeddings on the tokenized documents; min_count=1
## keeps every token that occurs at least once.
word2vec_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
## Save the model for later use
word2vec_model.save("word2vec.model")


In [23]:
### GETTING THE WORD VECTORS FOR EACH DOCUMENT

def get_word2vec_vectors(tokens_list, model, vector_size=100):
    """Look up the embedding of every known token in a document.

    Args:
        tokens_list: Iterable of tokens for one document.
        model: Object exposing a ``wv`` mapping that supports ``in`` and
            item lookup by token.
        vector_size: Unused by this function; kept for interface
            compatibility.

    Returns:
        A list with one vector per token found in ``model.wv``, in the
        original token order. Tokens missing from ``model.wv`` are skipped,
        so the list may be empty.
    """
    keyed_vectors = model.wv
    found = []
    for tok in tokens_list:
        if tok in keyed_vectors:
            found.append(keyed_vectors[tok])
    return found

## One list of word vectors per document, in DataFrame row order.
X_word2vec_raw = [
    get_word2vec_vectors(doc_tokens, word2vec_model)
    for doc_tokens in df['tokens']
]



In [24]:
### PADDING EACH DOCUMENT'S VECTOR SEQUENCE TO AN EQUAL LENGTH

max_length = 200

## Pad (or cut) every document at its end so that each one holds exactly
## `max_length` word vectors.
X_word2vec_padded = pad_sequences(
    list(map(np.array, X_word2vec_raw)),
    maxlen=max_length,
    dtype='float32',
    padding='post',
    truncating='post',
)

## Flattening to (N, 200 * 100): one long feature row per document.
X_word2vec_flattened = X_word2vec_padded.reshape(len(X_word2vec_padded), -1)


In [25]:
### TRAIN TEST SPLITTING
## Hold out 20% of the documents; the fixed random_state keeps the split repeatable.
(X_train_word2vec,
 X_test_word2vec,
 y_train,
 y_test) = train_test_split(
    X_word2vec_flattened,
    y,
    test_size=0.2,
    random_state=42,
)


In [26]:
### MODEL BUILDING AND COMPILING
def build_ann_model(input_dim, num_classes=20):
    """Build and compile a small feed-forward classifier.

    Architecture: Input(input_dim) -> Dense(128, relu) -> Dropout(0.5)
    -> Dense(64, relu) -> Dense(num_classes, softmax).

    Args:
        input_dim: Number of features in each 1-D input sample.
        num_classes: Number of units in the softmax output layer.
            Defaults to 20, one per newsgroup category.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer instead of
        # passing `input_dim` to the first Dense layer, which newer Keras
        # releases warn about.
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.5),  # Dropout to prevent overfitting
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [27]:
### MODEL TRAINING
## The input width is the number of columns in the flattened training matrix.
model_word2vec = build_ann_model(X_train_word2vec.shape[1])
history_word2vec = model_word2vec.fit(
    x=X_train_word2vec,
    y=y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_word2vec, y_test),
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 46ms/step - accuracy: 0.0614 - loss: 3.1068 - val_accuracy: 0.0942 - val_loss: 2.8460
Epoch 2/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 39ms/step - accuracy: 0.0943 - loss: 2.8592 - val_accuracy: 0.1347 - val_loss: 2.7079
Epoch 3/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 42ms/step - accuracy: 0.1181 - loss: 2.7482 - val_accuracy: 0.1408 - val_loss: 2.6525
Epoch 4/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 44ms/step - accuracy: 0.1309 - loss: 2.6906 - val_accuracy: 0.1454 - val_loss: 2.6229
Epoch 5/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 45ms/step - accuracy: 0.1428 - loss: 2.6478 - val_accuracy: 0.1788 - val_loss: 2.5730
Epoch 6/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 41ms/step - accuracy: 0.1547 - loss: 2.6083 - val_accuracy: 0.1560 - val_loss: 2.5658
Epoch 7/10
[1m2

In [28]:
### MODEL EVALUATION
## Score the trained model on the held-out split and report its accuracy.
loss_word2vec, accuracy_word2vec = model_word2vec.evaluate(
    x=X_test_word2vec,
    y=y_test,
)
print(f'Word2Vec Approach Accuracy: {accuracy_word2vec:.4f}')

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.1793 - loss: 2.4992
Word2Vec Approach Accuracy: 0.1817


In [29]:
### CREATING THE AVERAGE WORD2VEC FEATURE VECTOR FOR EACH DOCUMENT

def get_average_word2vec(tokens_list, model, vector_size=100):
    """Average the embeddings of a document's known tokens.

    Args:
        tokens_list: Iterable of tokens for one document.
        model: Object exposing a ``wv`` mapping that supports ``in`` and
            item lookup by token.
        vector_size: Length of the zero vector returned when no token of
            the document is present in ``model.wv``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or ``np.zeros(vector_size)`` if none were found.
    """
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[tok] for tok in tokens_list if tok in keyed_vectors]
    # Guard clause: with nothing to average, fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

## Stack the per-document average vectors into one 2-D feature matrix.
X_average_word2vec = np.array([
    get_average_word2vec(doc_tokens, word2vec_model)
    for doc_tokens in df['tokens']
])

In [30]:
### TRAIN TEST SPLITTING
## Hold out 20% of the documents; the fixed random_state keeps the split repeatable.
(X_train_avg_word2vec,
 X_test_avg_word2vec,
 y_train,
 y_test) = train_test_split(
    X_average_word2vec,
    y,
    test_size=0.2,
    random_state=42,
)


In [31]:
### MODEL BUILDING AND COMPILING

# NOTE(review): this cell re-defines `build_ann_model` with the same architecture
# as the earlier "MODEL BUILDING AND COMPILING" cell; consider defining it once.
def build_ann_model(input_dim, num_classes=20):
    """Build and compile a small feed-forward classifier.

    Architecture: Input(input_dim) -> Dense(128, relu) -> Dropout(0.5)
    -> Dense(64, relu) -> Dense(num_classes, softmax).

    Args:
        input_dim: Number of features in each 1-D input sample.
        num_classes: Number of units in the softmax output layer.
            Defaults to 20, one per newsgroup category.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer instead of
        # passing `input_dim` to the first Dense layer, which newer Keras
        # releases warn about.
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.5),  # Dropout to prevent overfitting
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [32]:
### MODEL TRAINING
## The input width is the number of columns in the averaged-vector training matrix.
model_avg_word2vec = build_ann_model(X_train_avg_word2vec.shape[1])
history_avg_word2vec = model_avg_word2vec.fit(
    x=X_train_avg_word2vec,
    y=y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_avg_word2vec, y_test),
)

Epoch 1/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.1039 - loss: 2.8872 - val_accuracy: 0.2584 - val_loss: 2.3201
Epoch 2/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2196 - loss: 2.3855 - val_accuracy: 0.2902 - val_loss: 2.1853
Epoch 3/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2567 - loss: 2.2728 - val_accuracy: 0.3109 - val_loss: 2.1238
Epoch 4/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2847 - loss: 2.1841 - val_accuracy: 0.3263 - val_loss: 2.0862
Epoch 5/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3012 - loss: 2.1524 - val_accuracy: 0.3332 - val_loss: 2.0699
Epoch 6/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.2986 - loss: 2.1382 - val_accuracy: 0.3443 - val_loss: 2.0397
Epoch 7/10
[1m236/236[0m 

In [33]:
### MODEL EVALUATION
## Score the trained model on the held-out split and report its accuracy.
loss_avg_word2vec, accuracy_avg_word2vec = model_avg_word2vec.evaluate(
    x=X_test_avg_word2vec,
    y=y_test,
)
print(f'Average Word2Vec Approach Accuracy: {accuracy_avg_word2vec:.4f}')

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3552 - loss: 1.9937
Average Word2Vec Approach Accuracy: 0.3650
